In [3]:
# 1) Setup & load Excel
from pathlib import Path
import pandas as pd

PROJECT_ROOT = Path(r"C:\Users\Parth Arora\OneDrive\Desktop\CyberShield")
CSV_PATH    = PROJECT_ROOT / "anti_india_dataset_with_orgs.csv"

df = pd.read_csv(CSV_PATH)   # loads the only sheet automatically
print("Loaded shape:", df.shape)
print("Columns:", df.columns.tolist())
df.head(3)


Loaded shape: (600, 17)
Columns: ['platform', 'username', 'user_id', 'profile_link', 'content_text', 'language', 'hashtags', 'mentions', 'urls', 'timestamp', 'likes', 'shares', 'comments_count', 'media_type', 'media_url', 'ocr_text', 'Unnamed: 16']


Unnamed: 0,platform,username,user_id,profile_link,content_text,language,hashtags,mentions,urls,timestamp,likes,shares,comments_count,media_type,media_url,ocr_text,Unnamed: 16
0,telegram,mitchellbaker,374755017,https://telegram.com/mitchellbaker,Cost city north mind security final here signi...,hi,"['#FreeKashmir', '#StandWithXYZ']",['@altmedia'],['https://howard.com/'],2025-07-13T11:12:48Z,600,289,190,text,https://dummyimage.com/898x267,step change rest future India answer human,
1,instagram,jonathanjohnson,109873839,https://instagram.com/jonathanjohnson,Provide shake base arrive risk raise home glas...,en,"['#BoycottIndia', '#IndiaOut']",['@intlwatch'],['https://www.moran.org/'],2025-07-09T07:26:40Z,808,263,184,video,https://placeimg.com/26/326/any,notice benefit arm necessary India person into...,
2,twitter,newtonjane,866533357,https://twitter.com/newtonjane,If agreement full market movement minute espec...,ur,"['#FreeKashmir', '#StandWithXYZ']",['@freevoice'],['http://www.underwood.org/'],2025-07-27T12:01:18Z,684,329,200,image,https://placeimg.com/908/818/any,play action shake term according course tough ...,


In [4]:
# 2) Install & load spaCy (safe even if already installed)
import sys, subprocess

try:
    import spacy
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "spacy"])
    import spacy

# Try to load en_core_web_sm; if missing, download then load
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
    import spacy
    nlp = spacy.load("en_core_web_sm")

nlp.max_length = 2_000_000  # just in case some rows are long
print("spaCy model loaded ✅")


spaCy model loaded ✅


In [5]:
# 3) Build a single text column to analyze.
#    Priority: content_text; if empty, use ocr_text; else empty string.
content_col = "content_text"
ocr_col     = "ocr_text"

for col in [content_col, ocr_col]:
    if col not in df.columns:
        df[col] = ""

df["text_for_ner"] = df[content_col].fillna("").astype(str)
df.loc[df["text_for_ner"].str.strip().eq(""), "text_for_ner"] = df.loc[
    df["text_for_ner"].str.strip().eq(""), ocr_col
].fillna("").astype(str)

# Optional trim (helps with extremely long OCR)
df["text_for_ner"] = df["text_for_ner"].str.slice(0, 5000)

df[["platform","username","text_for_ner"]].head(5)


Unnamed: 0,platform,username,text_for_ner
0,telegram,mitchellbaker,Cost city north mind security final here signi...
1,instagram,jonathanjohnson,Provide shake base arrive risk raise home glas...
2,twitter,newtonjane,If agreement full market movement minute espec...
3,instagram,sarahlopez,Church step reflect administration too operati...
4,instagram,ejones,Economic great old member hand art edge too ni...


In [6]:
# 4) NER with batching for speed
from tqdm.auto import tqdm
tqdm.pandas()

TARGET_LABELS = {"PERSON", "ORG", "GPE", "LOC"}

texts = df["text_for_ner"].fillna("").astype(str).tolist()
ents_per_row = []

for doc in tqdm(nlp.pipe(texts, batch_size=64, disable=["tagger","lemmatizer","morphologizer","parser"]),
                total=len(texts), desc="NER"):
    ents_per_row.append([(ent.text, ent.label_) for ent in doc.ents if ent.label_ in TARGET_LABELS])

df["entities"] = ents_per_row
df[["text_for_ner","entities"]].head(8)


  from .autonotebook import tqdm as notebook_tqdm
NER: 100%|██████████| 600/600 [00:01<00:00, 372.71it/s]


Unnamed: 0,text_for_ner,entities
0,Cost city north mind security final here signi...,"[(India Program, ORG)]"
1,Provide shake base arrive risk raise home glas...,"[(India Food, ORG)]"
2,If agreement full market movement minute espec...,"[(India Report, ORG)]"
3,Church step reflect administration too operati...,"[(India, GPE)]"
4,Economic great old member hand art edge too ni...,"[(India, GPE)]"
5,Shake include western thus successful side mus...,"[(India, GPE)]"
6,I a ahead show into company last. India Everyb...,"[(India Everybody, ORG)]"
7,Anyone image letter street page couple employe...,"[(India Money, ORG)]"


In [7]:
# 5) Explode and normalize
ents = (
    df.explode("entities")
      .dropna(subset=["entities"])
      .reset_index(drop=True)
)

# split tuple -> columns
ents[["entity","label"]] = pd.DataFrame(ents["entities"].tolist(), index=ents.index)
ents = ents.drop(columns=["entities"])

# normalize whitespace
ents["entity_norm"] = ents["entity"].astype(str).str.strip().str.replace(r"\s+", " ", regex=True)

# optional: remove 1-2 char noise
ents = ents[ents["entity_norm"].str.len() >= 3].reset_index(drop=True)

# frequency by label & entity
freq = (ents.groupby(["label","entity_norm"])
            .size()
            .reset_index(name="count")
            .sort_values(["label","count"], ascending=[True, False])
            .reset_index(drop=True))

print("Entities rows:", ents.shape, "   Unique entities:", freq.shape[0])
freq.head(15)


Entities rows: (628, 21)    Unique entities: 261


Unnamed: 0,label,entity_norm,count
0,GPE,India,297
1,GPE,India Base,2
2,GPE,India Approach,1
3,GPE,India Away,1
4,GPE,India City,1
5,GPE,India Direction,1
6,GPE,India Easy,1
7,GPE,India Husband,1
8,GPE,India Page,1
9,GPE,India Various,1


In [8]:
def topk_per_label(freq_df, k=15):
    return (freq_df.sort_values("count", ascending=False)
                   .groupby("label")
                   .head(k)
                   .reset_index(drop=True))

topk = topk_per_label(freq, k=15)
persons = topk[topk["label"]=="PERSON"]
orgs    = topk[topk["label"]=="ORG"]
places  = topk[topk["label"].isin(["GPE","LOC"])]

persons, orgs, places


(     label     entity_norm  count
 2   PERSON             Mrs      6
 5   PERSON      India Song      3
 9   PERSON          Detail      2
 11  PERSON       India Law      2
 14  PERSON           Glass      2
 24  PERSON     India Order      1
 25  PERSON     India Offer      1
 27  PERSON        Tell Mrs      1
 29  PERSON      India Yard      1
 30  PERSON  India Together      1
 31  PERSON    India Strong      1
 32  PERSON    India Speech      1
 33  PERSON      India Skin      1
 34  PERSON     India Skill      1
 35  PERSON      India Sing      1,
    label           entity_norm  count
 1    ORG              Congress      9
 3    ORG          India Weight      4
 4    ORG       India Financial      3
 6    ORG          India Future      3
 7    ORG       India Something      3
 12   ORG          India Across      2
 13   ORG  India Administration      2
 15   ORG           India Bring      2
 16   ORG            India Call      2
 17   ORG         India Capital      2
 18   ORG 

In [10]:
# 7) Save CSVs in your CyberShield folder
freq_csv      = PROJECT_ROOT / "ner_entity_freq.csv"
rows_csv      = PROJECT_ROOT / "ner_entity_rows.csv"
author_edges  = PROJECT_ROOT / "ner_author_entity_edges.csv"  # for graphs later

# author–entity edges (dedup) for graph building
edge_cols = []
if "username" in ents.columns: edge_cols.append("username")
if "platform" in ents.columns: edge_cols.append("platform")
edge_cols = (edge_cols + ["entity_norm","label"]) if edge_cols else ["entity_norm","label"]

author_entity = ents[edge_cols].drop_duplicates().reset_index(drop=True)

freq.to_csv(freq_csv, index=False, encoding="utf-8")
ents.to_csv(rows_csv, index=False, encoding="utf-8")
author_entity.to_csv(author_edges, index=False, encoding="utf-8")

print("Saved:\n -", freq_csv, "\n -", rows_csv, "\n -", author_edges)


Saved:
 - C:\Users\Parth Arora\OneDrive\Desktop\CyberShield\ner_entity_freq.csv 
 - C:\Users\Parth Arora\OneDrive\Desktop\CyberShield\ner_entity_rows.csv 
 - C:\Users\Parth Arora\OneDrive\Desktop\CyberShield\ner_author_entity_edges.csv
