In [54]:
import json

addresses = {}

# Load the JSON file
with open('../output/addresses.json', 'r', encoding='utf-8') as file:
    addresses = json.load(file)

In [55]:
# filter out null or None values
addresses = {k: v for k, v in addresses.items() if v is not None and v != 'null' and v != ''}

romania_addresses = {}
other_addresses = {}
# extract addresses with 'Romania' in the value. 
for account, value in addresses.items():
    # Replace all special characters (ăââßîșț) with their ASCII equivalents
    ascii_value = value.encode('ascii', 'ignore').decode('ascii')
    if 'Romania' in ascii_value:
        romania_addresses[account] = ascii_value
    else:
        other_addresses[account] = ascii_value
        

# Print the rest
print("Addresses without 'Romania':")
# for account, value in other_addresses.items():
    #print(f"{account}: {value}")

Addresses without 'Romania':


In [56]:
import hashlib
import re
import unicodedata


def hash_text(text):
    """
    Hash the comment text to create a unique identifier.
    """
    # Normalize to NFKD form and encode to ASCII, ignoring non-ASCII characters
    normalized_text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')

    # Remove all characters except a-z and 0-9
    cleaned_text = re.sub(r'[^a-z0-9]', '', normalized_text.lower())
    # Hash the cleaned text using SHA-256
    return hashlib.sha256(cleaned_text.encode('utf-8')).hexdigest().upper()

# Create a dictionary to store the hashes and their corresponding addresses
hashes = {}
for account, value in other_addresses.items():
    # Hash the address value
    address_hash = hash_text(value)
    # Store the hash and the address in the dictionary
    if address_hash not in hashes:
        hashes[address_hash] = {
            'address': value,
            'count': 1,
            'accounts': [account]
        }
    else:
        hashes[address_hash]['count'] += 1
        hashes[address_hash]['accounts'].append(account)

# save the hashes to a JSON file, ordered by count
with open('../output/address_summary.json', 'w', encoding='utf-8') as file:
    # Sort the hashes by count in descending order
    sorted_hashes = dict(sorted(hashes.items(), key=lambda item: item[1]['count'], reverse=True))
    # Write the sorted hashes to the JSON file
    json.dump(sorted_hashes, file, ensure_ascii=False, indent=4)


In [None]:
# 📦 Asigură-te că ai rapidfuzz instalat
# !pip install rapidfuzz

from rapidfuzz import fuzz
from collections import defaultdict

# 📌 Exemplu de structură: {"url1": "Adresa 1", "url2": "Adresa 2", ...}
# other_addresses = {...}
# 📦 Dacă nu ai rapidfuzz instalat:
# !pip install rapidfuzz

import json
from rapidfuzz import fuzz
from collections import defaultdict
from pathlib import Path

# Încarcă datele
with open("../output/augmented_accounts.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Transform url in other_addresses (the key is the url) and remove the `about` part. (either \about either &sk=about)
def normalize_facebook_url(url: str) -> str:
    if "/about" in url:
        return url.split("/about")[0]
    if "?sk=about" in url:
        return url.split("&sk=about")[0]
    return url.rstrip("/")

normalized_other_addresses = {}
for url, address in other_addresses.items():
    clean_url = normalize_facebook_url(url)
    normalized_other_addresses[clean_url] = address  # may overwrite duplicates

# Copy back to the original dictionary
other_addresses = normalized_other_addresses
   
# Extrage mapping-ul URL → adresă și URL → nume
url_to_name = {}

for account in data["accounts"]:
    url = account["profile_url"]
    name = account["name"]
    address = account.get("address", "").strip()
    url_to_name[url] = name

# Normalizează adresele pentru comparare fuzzy
normalized = {url: addr.lower().replace(",", "").strip() for url, addr in other_addresses.items()}

urls = list(normalized.keys())
visited = set()
groups = defaultdict(list)
SIMILARITY_THRESHOLD = 80

# Grupează adresele similare
for i, url_i in enumerate(urls):
    if url_i in visited:
        continue
    group = [url_i]
    visited.add(url_i)
    addr_i = normalized[url_i]
    for j in range(i + 1, len(urls)):
        url_j = urls[j]
        if url_j in visited:
            continue
        addr_j = normalized[url_j]
        similarity = fuzz.ratio(addr_i, addr_j)
        if similarity >= SIMILARITY_THRESHOLD:
            group.append(url_j)
            visited.add(url_j)
    groups[url_i] = group

# Sortează grupurile descrescător, după numărul de conturi
sorted_groups = sorted(groups.items(), key=lambda item: len(item[1]), reverse=True)

# Generează conținutul raportului în limba română
report_lines = ["# 📍 Gruparea conturilor după adresă (top)\n"]

for leader_url, group_urls in sorted_groups:
    address = other_addresses[leader_url]
    report_lines.append(f"\n## {address}  \n**{len(group_urls)} conturi asociate**:\n")
    for url in group_urls:
        name = url_to_name.get(url, "Nume necunoscut")
        report_lines.append(f"- [{name}]({url})")

# Salvează raportul în fișierul Markdown
report_path = Path("../reports/report-top-other-addresses.md")
report_path.parent.mkdir(parents=True, exist_ok=True)
report_path.write_text("\n".join(report_lines), encoding="utf-8")

# Normalizează adresele (litere mici, fără virgule, fără spații inutile)
normalized = {url: addr.lower().replace(",", "").strip() for url, addr in other_addresses.items()}

urls = list(normalized.keys())
vizitate = set()
grupuri = defaultdict(list)

PRAG_SIMILARITATE = 90



# Compară fiecare adresă cu celelalte
for i, url_i in enumerate(urls):
    if url_i in vizitate:
        continue
    grup = [url_i]
    vizitate.add(url_i)
    adresa_i = normalized[url_i]
    for j in range(i + 1, len(urls)):
        url_j = urls[j]
        if url_j in vizitate:
            continue
        adresa_j = normalized[url_j]
        similaritate = fuzz.ratio(adresa_i, adresa_j)
        if similaritate >= PRAG_SIMILARITATE:
            grup.append(url_j)
            vizitate.add(url_j)
    grupuri[url_i] = grup  # folosim primul URL ca reprezentant al grupului

# 🔽 Afișează rezultatul în format Markdown
for lider_grup, grup_url in grupuri.items():
    print(f"\n### 🔗 Grup cu reprezentantul: [{lider_grup}]({lider_grup})\n")
    for url in grup_url:
        print(f"- [{url}]({url}) → `{other_addresses[url]}`")



### 🔗 Grup cu reprezentantul: [https://www.facebook.com/profile.php?id=100083590601710&sk=about](https://www.facebook.com/profile.php?id=100083590601710&sk=about)

- [https://www.facebook.com/profile.php?id=100083590601710&sk=about](https://www.facebook.com/profile.php?id=100083590601710&sk=about) → `New York, NY, United States`

### 🔗 Grup cu reprezentantul: [https://www.facebook.com/italia.sosro.ro](https://www.facebook.com/italia.sosro.ro)

- [https://www.facebook.com/italia.sosro.ro](https://www.facebook.com/italia.sosro.ro) → `Rome, Italy`

### 🔗 Grup cu reprezentantul: [https://www.facebook.com/arges.sosro.ro](https://www.facebook.com/arges.sosro.ro)

- [https://www.facebook.com/arges.sosro.ro](https://www.facebook.com/arges.sosro.ro) → `B-dul Republicii, nr. 148, etaj1, fosta cladire Electrica`

### 🔗 Grup cu reprezentantul: [https://www.facebook.com/dolj.sosro.ro](https://www.facebook.com/dolj.sosro.ro)

- [https://www.facebook.com/dolj.sosro.ro](https://www.facebook.com/dolj.