In [1]:
import os
from pathlib import Path

import pandas as pd

# Directory that holds the input CSV and later receives the exported CSVs.
# Set the CYBERSHIELD_ROOT environment variable to run the notebook on another
# machine; the original author's location is kept as the fallback so existing
# runs behave exactly as before.
PROJECT_ROOT = Path(
    os.environ.get("CYBERSHIELD_ROOT", r"C:\Users\Parth Arora\OneDrive\Desktop\CyberShield")
)
CSV_PATH = PROJECT_ROOT / "anti_india_dataset_with_orgs.csv"  # change if needed

# A CSV file is a single table, so no sheet selection is involved.
df = pd.read_csv(CSV_PATH)
print("Loaded:", df.shape)
print("Columns:", df.columns.tolist())
df.head(3)


Loaded: (600, 17)
Columns: ['platform', 'username', 'user_id', 'profile_link', 'content_text', 'language', 'hashtags', 'mentions', 'urls', 'timestamp', 'likes', 'shares', 'comments_count', 'media_type', 'media_url', 'ocr_text', 'Unnamed: 16']


Unnamed: 0,platform,username,user_id,profile_link,content_text,language,hashtags,mentions,urls,timestamp,likes,shares,comments_count,media_type,media_url,ocr_text,Unnamed: 16
0,telegram,mitchellbaker,374755017,https://telegram.com/mitchellbaker,Cost city north mind security final here signi...,hi,"['#FreeKashmir', '#StandWithXYZ']",['@altmedia'],['https://howard.com/'],2025-07-13T11:12:48Z,600,289,190,text,https://dummyimage.com/898x267,step change rest future India answer human,
1,instagram,jonathanjohnson,109873839,https://instagram.com/jonathanjohnson,Provide shake base arrive risk raise home glas...,en,"['#BoycottIndia', '#IndiaOut']",['@intlwatch'],['https://www.moran.org/'],2025-07-09T07:26:40Z,808,263,184,video,https://placeimg.com/26/326/any,notice benefit arm necessary India person into...,
2,twitter,newtonjane,866533357,https://twitter.com/newtonjane,If agreement full market movement minute espec...,ur,"['#FreeKashmir', '#StandWithXYZ']",['@freevoice'],['http://www.underwood.org/'],2025-07-27T12:01:18Z,684,329,200,image,https://placeimg.com/908/818/any,play action shake term according course tough ...,


In [2]:
import re
from ast import literal_eval

def norm_tag(tag: str) -> str:
    """Normalise one hashtag: lowercase, no whitespace, exactly one leading '#'.

    Parameters
    ----------
    tag : str
        Raw hashtag text, with or without a leading '#'. Non-string values
        are tolerated.

    Returns
    -------
    str
        Canonical tag such as "#freekashmir", or "" when `tag` is not a
        string or carries no tag text (empty, whitespace-only, or made only
        of '#' characters).
    """
    if not isinstance(tag, str):
        return ""
    t = tag.strip().lower()
    t = re.sub(r"\s+", "", t)  # remove spaces inside tags
    # Drop any run of leading '#': "tag", "#tag" and "##tag" then normalise to
    # the same value, and a bare "#" is rejected instead of being counted as
    # a hashtag of its own.
    t = t.lstrip("#")
    if not t:
        return ""
    return "#" + t

def parse_hashtag_cell(cell):
    """
    Turn one raw `hashtags` cell into a clean list of tags.

    Handles three input shapes:
      - an actual list, e.g. ['#Tag1', '#Tag2']
      - the string form of a Python list, e.g. "['#Tag1', '#Tag2']"
      - arbitrary text that contains #tags

    Any other value (empty or whitespace-only string, non-string,
    non-list) yields [].

    Returns: list[str] -- every tag passed through norm_tag, de-duplicated
    and sorted.
    """
    def hashtags_in(text):
        # Fallback extractor: each '#' followed by word characters.
        return re.findall(r"#\w+", text)

    if isinstance(cell, list):
        candidates = cell
    elif not (isinstance(cell, str) and cell.strip()):
        candidates = []
    else:
        try:
            parsed = literal_eval(cell)
        except Exception:
            # Not a valid Python literal: treat the cell as free text.
            candidates = hashtags_in(cell)
        else:
            # Only a literal list is used directly; any other literal falls
            # back to scanning the original text.
            candidates = parsed if isinstance(parsed, list) else hashtags_in(cell)

    normalised = (norm_tag(item) for item in candidates)
    return sorted({tag for tag in normalised if tag})

# Make sure the source column exists before it is parsed.
if "hashtags" not in df:
    df["hashtags"] = ""

# Parsed, normalised tags for every row.
df["hashtags_list"] = df["hashtags"].apply(parse_hashtag_cell)

has_tags = df["hashtags_list"].str.len().gt(0)
print("Example rows with hashtags:")
df.loc[has_tags, ["hashtags", "hashtags_list"]].head(5)


Example rows with hashtags:


Unnamed: 0,hashtags,hashtags_list
0,"['#FreeKashmir', '#StandWithXYZ']","[#freekashmir, #standwithxyz]"
1,"['#BoycottIndia', '#IndiaOut']","[#boycottindia, #indiaout]"
2,"['#FreeKashmir', '#StandWithXYZ']","[#freekashmir, #standwithxyz]"
3,"['#BanBollywood', '#JusticeNow']","[#banbollywood, #justicenow]"
4,"['#BoycottIndia', '#IndiaOut']","[#boycottindia, #indiaout]"


In [3]:
    # Explode: one hashtag per row
# Keep only rows that carry at least one tag, then unfold each list so every
# row holds a single hashtag.
nonempty_rows = df["hashtags_list"].str.len() > 0
tags_long = df.loc[nonempty_rows, ["hashtags_list"]].explode("hashtags_list")
tags_long = tags_long.rename(columns={"hashtags_list": "hashtag"}).reset_index(drop=True)

# Frequency table with exactly two columns: 'hashtag' and 'count'.
hashtag_counts = tags_long["hashtag"].value_counts(dropna=True)
tag_freq = hashtag_counts.rename_axis("hashtag").reset_index(name="count")

print("tags_long:", tags_long.shape, "   tag_freq:", tag_freq.shape)
tag_freq.head(10)


tags_long: (1200, 1)    tag_freq: (10, 2)


Unnamed: 0,hashtag,count
0,#freekashmir,129
1,#standwithxyz,129
2,#humanrights,123
3,#resistindia,123
4,#boycottindia,122
5,#indiaout,122
6,#justicenow,116
7,#banbollywood,116
8,#stopindia,110
9,#oppression,110


In [4]:
from itertools import combinations
from collections import Counter

def post_pairs(tags):
    """Return every unordered pair of distinct tags from one post.

    Duplicates in `tags` are ignored. Each pair is a 2-tuple whose members are
    in sorted order, and the pairs themselves come out in sorted order.
    """
    ordered = sorted(set(tags))
    return [
        (first, second)
        for pos, first in enumerate(ordered)
        for second in ordered[pos + 1:]
    ]

# Collect the tag pairs of every post that carries two or more hashtags.
pairs = [
    pair
    for tags in df["hashtags_list"]
    if isinstance(tags, list) and len(tags) >= 2
    for pair in post_pairs(tags)
]
pair_counts = Counter(pairs)

# One row per distinct pair, most frequent first.
pair_rows = [(tag1, tag2, n) for (tag1, tag2), n in pair_counts.items()]
cooc_df = pd.DataFrame(pair_rows, columns=["tag1", "tag2", "cooc"])
cooc_df = cooc_df.sort_values("cooc", ascending=False).reset_index(drop=True)

print("cooc_df:", cooc_df.shape)
cooc_df.head(10)


cooc_df: (5, 3)


Unnamed: 0,tag1,tag2,cooc
0,#freekashmir,#standwithxyz,129
1,#humanrights,#resistindia,123
2,#boycottindia,#indiaout,122
3,#banbollywood,#justicenow,116
4,#oppression,#stopindia,110


In [5]:
import numpy as np

# Per-tag counts keyed by hashtag, for quick marginal-frequency lookups.
tag_counts = tag_freq.set_index("hashtag")["count"]
freq_map = dict(tag_counts)

# PMI denominator: how many posts carry one or more hashtags.
tagged_mask = df["hashtags_list"].str.len() > 0
N_posts_with_any_tags = int(tagged_mask.sum())

def jaccard(a, b, cooc, fa, fb):
    """Jaccard similarity of two tags: cooc / (fa + fb - cooc).

    `a` and `b` (the tag names) are not used in the computation.
    Returns 0.0 when the denominator is not positive.
    """
    union = fa + fb - cooc
    if not union > 0:
        return 0.0
    return cooc / union

def pmi(a, b, cooc, fa, fb, n_posts=None):
    """Pointwise mutual information of two tags, in bits.

    PMI = log2( p(a,b) / (p(a) * p(b)) ), with every probability estimated as
    a count divided by `n_posts`.

    Parameters
    ----------
    a, b :
        Tag names; not used in the computation.
    cooc :
        Number of posts containing both tags.
    fa, fb :
        Number of posts containing each tag.
    n_posts : int, optional
        Total number of posts used as the probability denominator. When
        omitted, the notebook-level `N_posts_with_any_tags` is used, which is
        the original behaviour; passing it explicitly lets the function be
        called without relying on that global.

    Returns
    -------
    float
        The PMI value, or -inf when any count (or the denominator) is zero.
    """
    if n_posts is None:
        n_posts = N_posts_with_any_tags
    if n_posts == 0 or fa == 0 or fb == 0 or cooc == 0:
        return float("-inf")
    pab = cooc / n_posts
    pa = fa / n_posts
    pb = fb / n_posts
    denom = pa * pb
    if denom <= 0:
        return float("-inf")
    return float(np.log2(pab / denom))

# Column order of the association table. Passing it to the DataFrame
# constructor keeps the frame well-formed when cooc_df has no rows; without
# it an empty `rows` list gives a column-less frame and the sort below raises
# KeyError.
assoc_columns = ["tag1", "tag2", "cooc", "freq_tag1", "freq_tag2", "jaccard", "pmi"]

# One record per co-occurring pair: raw counts plus both association scores.
rows = []
for a, b, cooc in zip(cooc_df["tag1"], cooc_df["tag2"], cooc_df["cooc"]):
    cooc = int(cooc)
    fa = int(freq_map.get(a, 0))  # 0 when the tag is missing from freq_map
    fb = int(freq_map.get(b, 0))
    rows.append({
        "tag1": a, "tag2": b, "cooc": cooc,
        "freq_tag1": fa, "freq_tag2": fb,
        "jaccard": jaccard(a, b, cooc, fa, fb),
        "pmi": pmi(a, b, cooc, fa, fb)
    })

# Highest PMI first; ties broken by the larger co-occurrence count.
assoc_df = (
    pd.DataFrame(rows, columns=assoc_columns)
    .sort_values(["pmi", "cooc"], ascending=[False, False])
    .reset_index(drop=True)
)

print("assoc_df:", assoc_df.shape)
assoc_df.head(10)


assoc_df: (5, 7)


Unnamed: 0,tag1,tag2,cooc,freq_tag1,freq_tag2,jaccard,pmi
0,#oppression,#stopindia,110,110,110,1.0,2.447459
1,#banbollywood,#justicenow,116,116,116,1.0,2.370838
2,#boycottindia,#indiaout,122,122,122,1.0,2.298081
3,#humanrights,#resistindia,123,123,123,1.0,2.286304
4,#freekashmir,#standwithxyz,129,129,129,1.0,2.217591


In [6]:
def top_cooccurring_with(seed_tag, cooc_df, topk=25):
    """List the tags that co-occur most often with `seed_tag`.

    Parameters
    ----------
    seed_tag : str
        Tag to look up; it is normalised with norm_tag first.
    cooc_df : pandas.DataFrame
        Pair table with columns 'tag1', 'tag2' and 'cooc'.
    topk : int, default 25
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'other' (the partner tag) and 'cooc', sorted by 'cooc'
        descending. The same two columns are returned when nothing matches,
        so callers can rely on one schema in both cases.

    Notes
    -----
    Prints a message when the normalised seed is absent from the
    notebook-level `tag_freq` table; the lookup in `cooc_df` still runs.
    """
    seed = norm_tag(seed_tag)
    if seed not in set(tag_freq["hashtag"]):
        print(f"Seed '{seed}' not found in tag_freq.")
    subset = cooc_df[(cooc_df["tag1"]==seed) | (cooc_df["tag2"]==seed)].copy()
    # No early return for the empty case: the steps below work on an empty
    # frame and give it the same 'other'/'cooc' columns as a non-empty result.
    subset["other"] = np.where(subset["tag1"]==seed, subset["tag2"], subset["tag1"])
    return subset[["other","cooc"]].sort_values("cooc", ascending=False).head(topk)

# Example lookup: partners of one seed hashtag, strongest first.
seed_top = top_cooccurring_with(seed_tag="#boycottindia", cooc_df=cooc_df, topk=25)
seed_top


Unnamed: 0,other,cooc
2,#indiaout,122


In [7]:
# Results are written next to the input data.
out_dir = PROJECT_ROOT

# Output file name -> table written under that name.
paths = {
    "hashtag_frequency.csv": tag_freq,
    "hashtag_cooccurrence_counts.csv": cooc_df,
    "hashtag_association_pmi_jaccard.csv": assoc_df,
}

for name in paths:
    target = out_dir / name
    paths[name].to_csv(target, index=False, encoding="utf-8")
    print("Saved:", target)


Saved: C:\Users\Parth Arora\OneDrive\Desktop\CyberShield\hashtag_frequency.csv
Saved: C:\Users\Parth Arora\OneDrive\Desktop\CyberShield\hashtag_cooccurrence_counts.csv
Saved: C:\Users\Parth Arora\OneDrive\Desktop\CyberShield\hashtag_association_pmi_jaccard.csv
