In [2]:
import os
import subprocess
import sys
import time
from pathlib import Path

import pandas as pd

# The project location is machine-specific. Set the CYBERSHIELD_ROOT environment
# variable to point the notebook at another checkout without editing any cell;
# when it is unset the original location is used, so existing runs are unchanged.
PROJECT_ROOT = Path(
    os.environ.get(
        "CYBERSHIELD_ROOT",
        r"C:\Users\Parth Arora\OneDrive\Desktop\CyberShield",
    )
)
# Folder holding the Sherlock checkout; the Sherlock subprocess is started from here.
SHERLOCK_DIR = PROJECT_ROOT / "sherlock"

print("Sherlock repo:", SHERLOCK_DIR, "exists?", SHERLOCK_DIR.exists())


Sherlock repo: C:\Users\Parth Arora\OneDrive\Desktop\CyberShield\sherlock exists? True


In [5]:
def _parse_sherlock_output(stdout, username):
    """Convert Sherlock's console output into a list of result records.

    Only lines shaped like ``[+] <site>: <url>`` are kept; anything else
    (banner lines, summaries, hits without a site or URL) is skipped.
    """
    records = []
    for raw_line in (stdout or "").splitlines():
        line = raw_line.strip()
        if not line.startswith("[+]"):
            continue
        # Split on the first colon only: the URL itself contains "://".
        site, sep, url = line[len("[+]"):].partition(":")
        site, url = site.strip(), url.strip()
        if not (sep and site and url):
            continue
        records.append({
            "username": username,
            "site": site,
            "status": "FOUND",
            "url": url,
        })
    return records


def run_sherlock(username, timeout=90, site_filter=None, site_timeout=30):
    """Run the Sherlock CLI for one username and return the accounts it reports.

    Parameters
    ----------
    username : str
        Handle to look up; passed as the final command-line argument.
    timeout : float
        Wall-clock limit in seconds for the whole subprocess.
    site_filter : iterable of str, or str, optional
        Site names forwarded as repeated ``--site`` options. A single string
        is treated as one site name rather than iterated character by character.
    site_timeout : int or float
        Value forwarded to Sherlock's ``--timeout`` option (default 30, the
        value that used to be hard-coded).

    Returns
    -------
    list of dict
        One record per ``[+]`` line with keys ``username``, ``site``,
        ``status`` (always ``"FOUND"``) and ``url``. Empty when nothing was
        reported or the subprocess timed out.

    Notes
    -----
    A timeout or a non-zero exit code is reported on stderr so a failed run
    (for example, Sherlock not being installed) is not mistaken for
    "no accounts found". No exception is raised for either case.
    """
    if isinstance(site_filter, str):
        site_filter = [site_filter]

    cmd = [sys.executable, "-m", "sherlock_project",
           "--print-found", "--no-color", "--timeout", str(site_timeout)]
    for site in site_filter or ():
        cmd += ["--site", site]
    cmd.append(username)

    try:
        proc = subprocess.run(
            cmd, cwd=str(SHERLOCK_DIR),
            capture_output=True, text=True, timeout=timeout
        )
    except subprocess.TimeoutExpired:
        print(f"[run_sherlock] {username!r}: no result within {timeout}s, skipping",
              file=sys.stderr)
        return []

    if proc.returncode != 0:
        # Still parse stdout below: surface the failure but keep any hits printed.
        stderr_lines = (proc.stderr or "").strip().splitlines()
        detail = f": {stderr_lines[-1]}" if stderr_lines else ""
        print(f"[run_sherlock] {username!r}: exit code {proc.returncode}{detail}",
              file=sys.stderr)

    return _parse_sherlock_output(proc.stdout, username)


In [6]:
def username_variants(base):
    """Generate plausible handle variations of ``base``.

    Spaces are removed first. The result contains the cleaned name, its
    lowercase and capitalized forms, the "."/"_" swapped forms, and the name
    with a few common affixes ("1", "123", "_official", "_", leading "real").

    Parameters
    ----------
    base : str or None
        Seed username.

    Returns
    -------
    list of str
        Unique variants in sorted order. Empty when ``base`` is None or has
        no characters left after removing spaces.
    """
    handle = (base or "").replace(" ", "")
    if not handle:
        # Without this guard an empty seed would produce junk candidates
        # such as "", "1", "real" and "_" that would then be looked up.
        return []

    variants = {
        handle,
        handle.lower(),
        handle.capitalize(),
        handle.replace(".", "_"),
        handle.replace("_", "."),
        "real" + handle,
    }
    variants.update(handle + suffix for suffix in ("1", "123", "_official", "_"))
    return sorted(variants)


In [7]:
def link_usernames(usernames, use_variants=True, site_filter=None, sleep_between=1.0):
    """Look up every username (and optionally its variants) with Sherlock.

    Parameters
    ----------
    usernames : iterable of str, or str
        Seed handles. A single string is treated as one handle.
    use_variants : bool
        When true, each seed is expanded with ``username_variants``.
    site_filter : iterable of str, optional
        Passed through unchanged to ``run_sherlock``.
    sleep_between : float
        Seconds to pause between consecutive lookups.

    Returns
    -------
    pandas.DataFrame
        Columns ``username``, ``site``, ``status``, ``url``, sorted by
        username then site. The columns are present even when there are no
        hits, so callers can always filter on ``status``.
    """
    if isinstance(usernames, str):
        usernames = [usernames]

    # Different seeds can expand to the same variant (e.g. "john_doe" and
    # "john.doe"); query each candidate once, keeping first-seen order.
    candidates = []
    seen = set()
    for uname in usernames:
        for cand in (username_variants(uname) if use_variants else [uname]):
            if cand not in seen:
                seen.add(cand)
                candidates.append(cand)

    rows = []
    for position, cand in enumerate(candidates):
        if position and sleep_between > 0:
            time.sleep(sleep_between)  # avoid hammering sites; no pause after the last lookup
        rows.extend(run_sherlock(cand, site_filter=site_filter))

    df = pd.DataFrame(rows, columns=["username", "site", "status", "url"])
    if not df.empty:
        df = df.sort_values(["username", "site"]).reset_index(drop=True)
    return df


In [15]:
# Platforms to query; restricting Sherlock to this subset keeps the run short.
site_filter = [
    "Twitter",
    "Instagram",
    "Reddit",
    "YouTube",
    "GitHub",
    "TikTok",
    "Telegram",
    "Facebook",
    "Medium",
    "Quora",
    "Buffer",
    "Pinterest",
    "Tumblr",
    "Flickr",
    "SoundCloud",
    "Vimeo",
    "VK",
    "Goodreads",
    "Snapchat",
    "WordPress",
]

# Seed handles to investigate; each one is also expanded into its variants.
usernames = [
    "freedom123",
    "truth_warrior",
    "john_doe",
    "cyberwatcher",
    "globalvoice",
    "justice_now",
    "tech_sentinel",
    "jane_doe",
]

# Lookup settings gathered in one place so they are easy to tune.
search_options = {
    "use_variants": True,        # also try the generated variations of each handle
    "site_filter": site_filter,
    "sleep_between": 0.5,        # short pause so the sites do not block the requests
}
df_links = link_usernames(usernames, **search_options)

print("Total results:", len(df_links))
df_links.head(20)



Total results: 116


Unnamed: 0,username,site,status,url
0,Cyberwatcher,GitHub,FOUND,https://www.github.com/Cyberwatcher
1,Cyberwatcher,Reddit,FOUND,https://www.reddit.com/user/Cyberwatcher
2,Cyberwatcher,WordPress,FOUND,https://Cyberwatcher.wordpress.com/
3,Cyberwatcher,YouTube,FOUND,https://www.youtube.com/@Cyberwatcher
4,Freedom123,Flickr,FOUND,https://www.flickr.com/people/Freedom123
5,Freedom123,GitHub,FOUND,https://www.github.com/Freedom123
6,Freedom123,Reddit,FOUND,https://www.reddit.com/user/Freedom123
7,Freedom123,SoundCloud,FOUND,https://soundcloud.com/Freedom123
8,Freedom123,Telegram,FOUND,https://t.me/Freedom123
9,Freedom123,WordPress,FOUND,https://Freedom123.wordpress.com/


In [17]:
# Export every result row plus the confirmed hits as CSV files under PROJECT_ROOT.
out_all = PROJECT_ROOT / "username_linkage_all.csv"
out_hits = PROJECT_ROOT / "username_linkage_hits.csv"

# A result frame built from zero rows may have no "status" column at all;
# fall back to the whole frame instead of failing with a KeyError.
if "status" in df_links.columns:
    hits = df_links[df_links["status"] == "FOUND"].copy()
else:
    hits = df_links.copy()

df_links.to_csv(out_all, index=False, encoding="utf-8")
hits.to_csv(out_hits, index=False, encoding="utf-8")

print("Saved:\n -", out_all, "\n -", out_hits)


Saved:
 - C:\Users\Parth Arora\OneDrive\Desktop\CyberShield\username_linkage_all.csv 
 - C:\Users\Parth Arora\OneDrive\Desktop\CyberShield\username_linkage_hits.csv


In [18]:
import pandas as pd
from pathlib import Path

# PROJECT_ROOT is taken from the notebook's setup cell. It is deliberately not
# redefined here, so the output location cannot drift from that single setting.

# Keep only FOUND rows (when a status column exists); otherwise use everything.
links = (
    df_links.loc[df_links["status"].eq("FOUND")]
    if "status" in df_links.columns
    else df_links
).copy()

# ---- Nodes ----
# One node per distinct username and one per distinct site; id and label are
# the same string, and "type" tells the two kinds of node apart.
node_frames = []
for column, node_type in (("username", "user"), ("site", "site")):
    ids = links[column].astype(str).unique()
    node_frames.append(pd.DataFrame({"id": ids, "label": ids, "type": node_type}))
users, sites = node_frames

nodes = pd.concat(node_frames, ignore_index=True)

# ---- Edges ----
# One "presence" edge per (username, site) hit, carrying the profile URL.
edges = (
    links[["username", "site", "url"]]
    .rename(columns={"username": "source", "site": "target"})
    .assign(relation="presence", weight=1)  # constant weight for presence edges
    .loc[:, ["source", "target", "relation", "weight", "url"]]
)

# ---- Save ----
nodes_path = PROJECT_ROOT / "sherlock_nodes.csv"
edges_path = PROJECT_ROOT / "sherlock_edges.csv"

nodes.to_csv(nodes_path, index=False, encoding="utf-8")
edges.to_csv(edges_path, index=False, encoding="utf-8")

print("Saved:")
print(" -", nodes_path)
print(" -", edges_path)


Saved:
 - C:\Users\Parth Arora\OneDrive\Desktop\CyberShield\sherlock_nodes.csv
 - C:\Users\Parth Arora\OneDrive\Desktop\CyberShield\sherlock_edges.csv



User–user projection for community detection: two usernames are linked when both were found on the same site.

In [19]:
import pandas as pd
import itertools

# Load the exported Sherlock results (columns: username, site, status, url).
# Read from PROJECT_ROOT, where the export cell wrote the file, instead of
# whatever the current working directory happens to be. Everything is read as
# text so handles such as "1234" or "null" are not converted to numbers or NaN.
df_links = pd.read_csv(
    PROJECT_ROOT / "username_linkage_all.csv",
    dtype=str,
    keep_default_na=False,
)

# Keep only FOUND results
df_links = df_links[df_links["status"] == "FOUND"]

# Build user-user edges: two users are connected if they share the same site.
edges = []
for site, group in df_links.groupby("site"):
    # Sort so every undirected pair is always emitted in the same order;
    # otherwise (a, b) and (b, a) would be counted as two separate edges
    # and the pair's weight would be split between them.
    users = sorted(group["username"].unique())
    for u1, u2 in itertools.combinations(users, 2):
        edges.append((u1, u2, site))

# Convert to DataFrame
df_edges = pd.DataFrame(edges, columns=["source", "target", "site"])

# Collapse to one row per pair; weight = number of sites the two users share.
df_edges = df_edges.groupby(["source", "target"]).size().reset_index(name="weight")

# Save for Gephi, next to the other exported CSV files.
df_edges.to_csv(PROJECT_ROOT / "user_user_edges.csv", index=False)
print("✅ user_user_edges.csv written with", len(df_edges), "edges")


✅ user_user_edges.csv written with 385 edges
