In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from ast import literal_eval
from itertools import combinations
from collections import Counter
import re

PROJECT_ROOT = Path(r"C:\Users\Parth Arora\OneDrive\Desktop\CyberShield")   
CSV_PATH    = PROJECT_ROOT / "anti_india_dataset_with_orgs.csv"

df = pd.read_csv(CSV_PATH)  # only sheet
print(df.shape, df.columns.tolist())


(600, 17) ['platform', 'username', 'user_id', 'profile_link', 'content_text', 'language', 'hashtags', 'mentions', 'urls', 'timestamp', 'likes', 'shares', 'comments_count', 'media_type', 'media_url', 'ocr_text', 'Unnamed: 16']


In [4]:
def norm_tag(tag: str) -> str:
    if not isinstance(tag, str):
        return ""
    t = tag.strip()
    if not t:
        return ""
    # keep `#`, lowercase, collapse whitespace
    t = re.sub(r"\s+", "", t.lower())
    # ensure it starts with '#'
    if not t.startswith("#"):
        t = "#" + t
    return t

def parse_hashtag_cell(cell):
    """
    Your column often looks like "['#Boycott', '#AltMed']" (string repr).
    Safely parse → list[str]. If it's already a list, return as-is.
    """
    if isinstance(cell, list):
        return cell
    if isinstance(cell, str) and cell.strip():
        try:
            val = literal_eval(cell)
            if isinstance(val, list):
                return val
        except Exception:
            # fallback: split by commas/spaces
            return re.findall(r"#\w+", cell)
    return []

# Make a clean list-of-tags column
if "hashtags" not in df.columns:
    df["hashtags"] = ""

df["hashtags_list"] = df["hashtags"].apply(parse_hashtag_cell)
df["hashtags_list"] = df["hashtags_list"].apply(lambda lst: sorted({norm_tag(x) for x in lst if norm_tag(x)}))
df["hashtags_list"].head(5)


0    [#freekashmir, #standwithxyz]
1       [#boycottindia, #indiaout]
2    [#freekashmir, #standwithxyz]
3     [#banbollywood, #justicenow]
4       [#boycottindia, #indiaout]
Name: hashtags_list, dtype: object

In [5]:
tags_long = (df.loc[df["hashtags_list"].str.len() > 0, ["hashtags_list", "timestamp"]]
               .explode("hashtags_list")
               .rename(columns={"hashtags_list":"hashtag"})
               .reset_index(drop=True))

tag_freq = (tags_long["hashtag"]
            .value_counts()
            .reset_index()
            .rename(columns={"index":"hashtag", "hashtag":"count"}))

tag_freq.head(15)


Unnamed: 0,count,count.1
0,#freekashmir,129
1,#standwithxyz,129
2,#humanrights,123
3,#resistindia,123
4,#boycottindia,122
5,#indiaout,122
6,#justicenow,116
7,#banbollywood,116
8,#stopindia,110
9,#oppression,110


In [6]:
def post_pairs(tags):
    # unique combinations from a post-level tag list
    tags = sorted(set(tags))
    return list(combinations(tags, 2))

pairs = []
for tags in df["hashtags_list"]:
    if isinstance(tags, list) and len(tags) >= 2:
        pairs.extend(post_pairs(tags))

pair_counts = Counter(pairs)
cooc_df = (pd.DataFrame([(a,b,c) for (a,b), c in pair_counts.items()],
                        columns=["tag1","tag2","cooc"]))
cooc_df = cooc_df.sort_values("cooc", ascending=False).reset_index(drop=True)
cooc_df.head(15)


Unnamed: 0,tag1,tag2,cooc
0,#freekashmir,#standwithxyz,129
1,#humanrights,#resistindia,123
2,#boycottindia,#indiaout,122
3,#banbollywood,#justicenow,116
4,#oppression,#stopindia,110


In [7]:
seed = "#boycottindia"  # <-- set your main tag (lowercase)

# normalize seed the same way:
seed = norm_tag(seed)

seed_rows = cooc_df[(cooc_df["tag1"]==seed) | (cooc_df["tag2"]==seed)].copy()
seed_rows["other"] = np.where(seed_rows["tag1"]==seed, seed_rows["tag2"], seed_rows["tag1"])
seed_top = seed_rows[["other","cooc"]].sort_values("cooc", ascending=False).reset_index(drop=True)
seed_top.head(25)


Unnamed: 0,other,cooc
0,#indiaout,122


In [8]:
# Marginal frequencies for each tag
freq_map = dict(tag_freq.set_index("hashtag")["count"])
N_posts_with_any_tags = int((df["hashtags_list"].str.len() > 0).sum())

def jaccard(a, b):
    # J(A,B) = cooc(A,B) / (freq(A) + freq(B) - cooc(A,B))
    c = cooc  # placeholder inside loop
    return c / max(1, (freq_map.get(a,0) + freq_map.get(b,0) - c))

def pmi(a, b):
    # PMI = log2( p(a,b) / (p(a)*p(b)) )
    # approximate p(a) as freq(a)/N, p(a,b) as cooc/N
    fa, fb = freq_map.get(a,0), freq_map.get(b,0)
    if fa==0 or fb==0 or cooc==0:
        return float("-inf")
    pab = cooc / N_posts_with_any_tags
    pa  = fa   / N_posts_with_any_tags
    pb  = fb   / N_posts_with_any_tags
    return np.log2(pab / (pa*pb)) if pa>0 and pb>0 else float("-inf")

# Add metrics
rows = []
for _, r in cooc_df.iterrows():
    a, b, cooc = r["tag1"], r["tag2"], int(r["cooc"])
    rows.append({
        "tag1": a, "tag2": b, "cooc": cooc,
        "jaccard": jaccard(a,b),
        "pmi": pmi(a,b)
    })

assoc_df = pd.DataFrame(rows).sort_values(["pmi","cooc"], ascending=[False, False]).reset_index(drop=True)
assoc_df.head(15)


KeyError: "None of ['hashtag'] are in the columns"

In [12]:
# --- Patch NumPy 2.0 type aliases that NetworkX writers still reference ---
import numpy as np
if not hasattr(np, "float_"): np.float_ = np.float64
if not hasattr(np, "int_"):   np.int_   = np.int64
if not hasattr(np, "bool_"):  np.bool_  = bool  # safety

# --- Sanitize graph attributes to pure Python types (no numpy scalars) ---
def _py(v):
    import numpy as _np
    from numbers import Number
    if isinstance(v, (_np.integer,)):   return int(v)
    if isinstance(v, (_np.floating,)):  return float(v)
    if isinstance(v, (bytes, bytearray)): return v.decode("utf-8", "ignore")
    # Booleans and strings are already fine; lists/dicts with numpy items are uncommon here
    return v

def sanitize_graph_attrs(G):
    for n, data in G.nodes(data=True):
        for k, v in list(data.items()):
            data[k] = _py(v)
    for u, v, data in G.edges(data=True):
        for k, val in list(data.items()):
            data[k] = _py(val)

sanitize_graph_attrs(G)

# --- Try writing GEXF and GraphML; if either fails, still produce CSVs ---
from pathlib import Path
import pandas as pd
import networkx as nx

out_base = PROJECT_ROOT / "hashtag_network_cooc"
gexf_path    = out_base.with_suffix(".gexf")
graphml_path = out_base.with_suffix(".graphml")

# 1) GEXF
try:
    nx.write_gexf(G, gexf_path)
    print("✅ Wrote GEXF:", gexf_path)
except Exception as e:
    print("⚠️ GEXF failed:", e)

# 2) GraphML
try:
    nx.write_graphml(G, graphml_path)
    print("✅ Wrote GraphML:", graphml_path)
except Exception as e:
    print("⚠️ GraphML failed:", e)

# 3) Always write CSV node/edge tables (Gephi can import these too)
nodes_csv = PROJECT_ROOT / "hashtag_network_nodes.csv"
edges_csv = PROJECT_ROOT / "hashtag_network_edges.csv"

nodes_df = pd.DataFrame([{"id": n, **(G.nodes[n] or {})} for n in G.nodes()])
edges_df = pd.DataFrame([{"source": u, "target": v, **(d or {})} for u, v, d in G.edges(data=True)])

nodes_df.to_csv(nodes_csv, index=False, encoding="utf-8")
edges_df.to_csv(edges_csv, index=False, encoding="utf-8")
print("✅ Wrote CSVs:\n -", nodes_csv, "\n -", edges_csv)


✅ Wrote GEXF: C:\Users\Parth Arora\OneDrive\Desktop\CyberShield\hashtag_network_cooc.gexf
✅ Wrote GraphML: C:\Users\Parth Arora\OneDrive\Desktop\CyberShield\hashtag_network_cooc.graphml
✅ Wrote CSVs:
 - C:\Users\Parth Arora\OneDrive\Desktop\CyberShield\hashtag_network_nodes.csv 
 - C:\Users\Parth Arora\OneDrive\Desktop\CyberShield\hashtag_network_edges.csv
