# In [3]:  (Jupyter notebook cell marker left over from export)
"""
generate_named_entity_wordclouds.py

Purpose:
  - Load two CSVs containing a "Corrected Question(s)" column (user provided file paths).
  - Run spaCy NER on each question to extract named entities.
  - Clean / normalize extracted entities (remove stopwords, single chars, numbers, generic tokens).
  - Produce wordcloud images for: 1) entities from file A, 2) entities from file B, 3) combined entities.
  - Produce CSV reports with entity frequencies and top entities by label (PERSON, GPE, ORG, LOC, etc.).

Usage:
  - Edit the FILE_PATHS dictionary below to point to your local CSVs if different.
  - Run: python3 generate_named_entity_wordclouds.py

Outputs (saved to OUTPUT_DIR):
  - fileA_named_entities.csv  (all entities + labels + frequencies)
  - fileB_named_entities.csv
  - combined_named_entities.csv
  - fileA_named_entities_wordcloud.png
  - fileB_named_entities_wordcloud.png
  - combined_named_entities_wordcloud.png

Notes / tips:
  - The script will try to download NLTK stopwords and the spaCy "en_core_web_sm" model if missing.
  - Make sure Python packages are installed: pandas, spacy, wordcloud, matplotlib, nltk
    Example:
      pip install pandas spacy wordcloud matplotlib nltk
      python -m spacy download en_core_web_sm

Author: ChatGPT (assistant)
"""

import os
import sys
import re
import argparse
import logging
from collections import Counter, defaultdict

import pandas as pd

# Try to import libraries and provide friendly error messages
try:
    import spacy
except Exception as e:
    print("spaCy is required. Install with: pip install spacy")
    raise

try:
    from wordcloud import WordCloud
except Exception as e:
    print("wordcloud is required. Install with: pip install wordcloud")
    raise

import matplotlib.pyplot as plt

try:
    import nltk
    from nltk.corpus import stopwords
except Exception as e:
    print("nltk is required. Install with: pip install nltk")
    raise

# -------------------------------
# CONFIG - edit these paths if needed
# -------------------------------
# Input CSVs, keyed by the short name used as a prefix for every output file.
FILE_PATHS = {
    "fileA": r"/Users/anjalisingh/Desktop/IITP/WordCloud/Full Dataset - Final Dataset.csv",
    "fileB": r"/Users/anjalisingh/Desktop/IITP/WordCloud/Full Dataset - 3_hop_questions.csv",
}

# Directory that receives the generated CSV reports and PNG wordclouds.
OUTPUT_DIR = r"/Users/anjalisingh/Desktop/IITP/WordCloud/output"

# Candidate header names for the question column, tried in order.
# Both spellings are accepted because the two input files differ
# (final dataset first, then the 3-hop file's "Corrected Questions").
QUESTION_COLUMN_CANDIDATES = ["Corrected Question", "Corrected Questions"]


# spaCy entity labels that are kept; entities with any other label are skipped.
KEEP_LABELS = {"PERSON", "GPE", "LOC", "ORG", "NORP", "FAC", "PRODUCT", "EVENT", "WORK_OF_ART"}

# Additional lowercase tokens that are dropped when an entity consists of
# exactly one of them (each token listed once).
EXTRA_STOPWORDS = {
    "which", "what", "where", "when", "who", "that", "same", "state", "india",
    "indian", "associated", "located", "famous", "known",
    "whichis", "etc", "etc.",
}

# -------------------------------
# Helpers
# -------------------------------

def ensure_output_dir(path):
    """Create directory *path*, including missing parents; no error if it already exists."""
    os.makedirs(name=path, exist_ok=True)


def try_download_resources():
    """Fetch the NLTK stopwords corpus and the spaCy "en_core_web_sm" model if missing.

    Each resource is probed first; a download is only started when the probe
    fails (LookupError for NLTK, OSError for spaCy).
    """
    # --- NLTK stopwords corpus ---
    try:
        nltk.data.find("corpora/stopwords")
    except LookupError:
        print("Downloading NLTK stopwords...")
        nltk.download("stopwords")

    # --- spaCy English model ---
    # Loading the model is the availability probe; the loaded pipeline is discarded.
    try:
        spacy.load("en_core_web_sm")
    except OSError:
        print("Downloading spaCy 'en_core_web_sm' model (this may take a moment)...")
        # Imported lazily so spacy.cli is only pulled in when a download is needed.
        from spacy.cli import download as fetch_model

        fetch_model("en_core_web_sm")


def load_csv_questions(path, question_cols=QUESTION_COLUMN_CANDIDATES):
    """
    Load a CSV and return its questions together with the full dataframe.

    Args:
        path: Location of the CSV file.
        question_cols: Candidate names for the question column, tried in order.
            Each candidate is matched exactly first, then case-insensitively.

    Returns:
        A tuple ``(questions, df)`` where ``questions`` is the list of non-null
        values of the matched column converted to ``str`` and ``df`` is the
        dataframe with whitespace-stripped column names.

    Raises:
        ValueError: If the file could not be read with any attempted encoding.
            The last underlying error is attached as ``__cause__``.
        KeyError: If none of the candidate columns is present.
    """
    # "latin1" is an alias of "ISO-8859-1", so one fallback is enough.
    encodings = ["utf-8", "ISO-8859-1"]
    df = None
    last_error = None
    for enc in encodings:
        try:
            df = pd.read_csv(path, encoding=enc)
            break
        except Exception as exc:
            # Remember why this attempt failed so the final error is diagnosable
            # (missing file, parse error, bad bytes, ...).
            last_error = exc
    if df is None:
        raise ValueError(
            f"Could not read CSV at {path} with tried encodings: {encodings}"
        ) from last_error

    # Normalize column names (strip surrounding whitespace)
    df.columns = [c.strip() for c in df.columns]

    # Map lowercase header -> actual header for the case-insensitive fallback
    lower_map = {c.lower(): c for c in df.columns}
    found_col = None
    for qc in question_cols:
        if qc in df.columns:
            found_col = qc
            break
        if qc.lower() in lower_map:
            found_col = lower_map[qc.lower()]
            break

    if not found_col:
        raise KeyError(f"None of the expected question columns {question_cols} found in {path}. Available columns: {df.columns.tolist()}")

    # Drop missing values and make sure every question is a string
    questions = df[found_col].dropna().astype(str).tolist()
    return questions, df


def clean_entity_text(text):
    """Return *text* with edge punctuation removed and whitespace runs collapsed.

    Leading/trailing characters that are not word characters are trimmed, then
    every run of whitespace (including newlines) becomes a single space.
    """
    trimmed = re.sub(r"^[^\w]+|[^\w]+$", "", text.strip())
    return re.sub(r"\s+", " ", trimmed)


def is_useful_entity(ent_text):
    """Decide whether an entity string is worth keeping.

    Returns False for empty/None input, strings of at most one character
    after stripping, purely numeric strings, strings without any alphanumeric
    character, and strings whose lowercase form is in EXTRA_STOPWORDS.
    Returns True otherwise.
    """
    if not ent_text:
        return False

    candidate = ent_text.strip()

    # Single characters and bare numbers carry no information
    if len(candidate) <= 1 or candidate.isnumeric():
        return False

    # Nothing but punctuation / symbols
    if not any(ch.isalnum() for ch in candidate):
        return False

    # Generic tokens listed in the extra stopword set
    return candidate.lower() not in EXTRA_STOPWORDS


def extract_entities_from_questions(nlp, questions, keep_labels=KEEP_LABELS):
    """Run NER over *questions* and collect the cleaned, useful entities.

    Args:
        nlp: Object exposing ``pipe(texts, disable=...)`` that yields documents
            with an ``ents`` attribute (a loaded spaCy pipeline in this script).
        questions: Iterable of question strings.
        keep_labels: Labels to retain. When falsy, no label filtering is done.

    Returns:
        ``(entities, label_map)``: the cleaned entity texts, and the parallel
        list of ``(text, label)`` tuples in the same order.
    """
    pairs = []
    # nlp.pipe streams the texts through the pipeline instead of one call per text
    for doc in nlp.pipe(questions, disable=["parser", "tagger"]):
        for span in doc.ents:
            tag = span.label_
            if keep_labels and tag not in keep_labels:
                continue
            cleaned = clean_entity_text(span.text)
            if not is_useful_entity(cleaned):
                continue
            pairs.append((cleaned, tag))

    texts = [surface for surface, _ in pairs]
    return texts, pairs


def normalize_entity_key(e):
    """Return the canonical counting key for entity *e*: its lowercase form."""
    key = e.lower()
    return key


def build_frequency_counters(entities):
    """Count how often each entity occurs, keyed by its normalized form.

    Returns a ``Counter`` mapping ``normalize_entity_key(entity)`` to the
    number of occurrences in *entities*.
    """
    return Counter(normalize_entity_key(item) for item in entities)


def save_top_entities_csv(freq_counter, label_pairs, out_csv_path, top_n=200):
    """Write the *top_n* most frequent entities to a CSV file.

    Args:
        freq_counter: Counter mapping normalized entity key -> count.
        label_pairs: List of ``(entity_original_text, label)`` tuples.
        out_csv_path: Destination path of the CSV.
        top_n: Maximum number of rows written.

    The CSV has the columns ``entity``, ``label`` and ``count``; ``label`` is
    the label seen most often for that key, or an empty string if the key has
    no label information.
    """
    # Tally, per normalized key, how often each label was assigned
    labels_by_key = defaultdict(Counter)
    for surface, tag in label_pairs:
        labels_by_key[normalize_entity_key(surface)][tag] += 1

    records = []
    for key, n in freq_counter.most_common(top_n):
        # .get() avoids creating empty entries in the defaultdict
        tag_counts = labels_by_key.get(key)
        top_tag = tag_counts.most_common(1)[0][0] if tag_counts else None
        records.append((key, top_tag or "", n))

    table = pd.DataFrame(records, columns=["entity", "label", "count"])
    table.to_csv(out_csv_path, index=False, encoding="utf-8")


def generate_and_save_wordcloud(freq_counter, out_path, title=None, max_words=300):
    """Render a wordcloud from entity frequencies and save it as an image.

    Args:
        freq_counter: Mapping of entity key -> frequency.
        out_path: Destination path of the image file.
        title: Optional title drawn above the wordcloud.
        max_words: Maximum number of words placed in the cloud.

    If no key survives filtering, a message is printed and nothing is saved.
    """
    # Defensive filter: drop keys of at most one character and purely numeric keys
    filtered = {k: v for k, v in freq_counter.items() if len(k) > 1 and not k.isnumeric()}
    if not filtered:
        print(f"No tokens to render for {out_path}")
        return

    wc = WordCloud(width=1200, height=600, background_color="white", collocations=False, max_words=max_words)
    wc.generate_from_frequencies(filtered)

    fig = plt.figure(figsize=(14, 7))
    try:
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')
        if title:
            plt.title(title, fontsize=18)
        plt.tight_layout()
        plt.savefig(out_path, dpi=300)
    finally:
        # Always release this figure, even if rendering or saving fails,
        # so repeated calls do not accumulate open figures.
        plt.close(fig)


# -------------------------------
# Main
# -------------------------------

def main():
    """Run the full pipeline: per-file reports and wordclouds, then combined ones.

    For every entry in FILE_PATHS the questions are loaded, entities are
    extracted and filtered against the stopword set, and a frequency CSV plus
    a wordcloud PNG are written to OUTPUT_DIR. The same two outputs are then
    produced for the entities of all files together.
    """
    ensure_output_dir(OUTPUT_DIR)
    try_download_resources()

    # load spaCy model
    nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger"])  # we only need NER

    # Combined stopword set: NLTK English stopwords (lowercased) plus script-specific extras
    stop_words = {w.lower() for w in stopwords.words('english')} | EXTRA_STOPWORDS

    all_entities = []
    all_labels = []

    for key, path in FILE_PATHS.items():
        print(f"Processing {key} -> {path}")
        # The dataframe is not needed here; only the question strings are used.
        questions, _ = load_csv_questions(path)
        entities, label_pairs = extract_entities_from_questions(nlp, questions)

        # Filter entities further using stopwords: remove entities that are just stopwords
        entities = [e for e in entities if e.lower() not in stop_words]

        freq = build_frequency_counters(entities)

        # Save CSV report for this file
        out_csv = os.path.join(OUTPUT_DIR, f"{key}_named_entities.csv")
        save_top_entities_csv(freq, label_pairs, out_csv, top_n=1000)
        print(f"Saved entity frequency CSV to: {out_csv}")

        # Save wordcloud
        out_png = os.path.join(OUTPUT_DIR, f"{key}_named_entities_wordcloud.png")
        generate_and_save_wordcloud(freq, out_png, title=f"Named Entities: {key}")
        print(f"Saved wordcloud to: {out_png}")

        all_entities.extend(entities)
        all_labels.extend(label_pairs)

    # Combined
    combined_freq = build_frequency_counters(all_entities)
    save_top_entities_csv(combined_freq, all_labels, os.path.join(OUTPUT_DIR, "combined_named_entities.csv"), top_n=2000)
    generate_and_save_wordcloud(combined_freq, os.path.join(OUTPUT_DIR, "combined_named_entities_wordcloud.png"), title="Combined Named Entities")

    print("Done. Check the output directory:", OUTPUT_DIR)


# Script entry point: run the pipeline only when executed directly, not on import.
if __name__ == "__main__":
    main()


# --- Captured notebook output from a previous run ---
# Processing fileA -> /Users/anjalisingh/Desktop/IITP/WordCloud/Full Dataset - Final Dataset.csv
# Saved entity frequency CSV to: /Users/anjalisingh/Desktop/IITP/WordCloud/output/fileA_named_entities.csv
# Saved wordcloud to: /Users/anjalisingh/Desktop/IITP/WordCloud/output/fileA_named_entities_wordcloud.png
# Processing fileB -> /Users/anjalisingh/Desktop/IITP/WordCloud/Full Dataset - 3_hop_questions.csv
# Saved entity frequency CSV to: /Users/anjalisingh/Desktop/IITP/WordCloud/output/fileB_named_entities.csv
# Saved wordcloud to: /Users/anjalisingh/Desktop/IITP/WordCloud/output/fileB_named_entities_wordcloud.png
# Done. Check the output directory: /Users/anjalisingh/Desktop/IITP/WordCloud/output
