In [2]:
#!/usr/bin/env python3
"""
Production-ready script to generate wordclouds from a cultural artifacts dataset.

Usage (CLI):
    python wordcloud_generator_for_cultural_artifacts.py \
        --input "/path/to/cultural-artifact-gold-sheet-final.csv" \
        --output_dir "/path/to/output/dir" \
        --identifier_cols Identifier1 Identifier2

Or in a Jupyter notebook: just run the script; it will auto-detect a CSV in cwd or
create a tiny sample CSV to demonstrate functionality.

Requirements:
    pip install pandas spacy wordcloud matplotlib nltk tqdm
    python -m spacy download en_core_web_sm
"""

import argparse
import os
import re
import sys
import logging
import glob
from collections import Counter

import pandas as pd
from wordcloud import WordCloud
import matplotlib
# Use non-interactive backend for headless environments
matplotlib.use("Agg")
import matplotlib.pyplot as plt

# spaCy and NLTK
import spacy
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm

# --------------------------- Helper utilities ---------------------------

def setup_logging():
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        handlers=[logging.StreamHandler(sys.stdout)],
    )


def ensure_nltk_stopwords():
    try:
        nltk.data.find("corpora/stopwords")
    except LookupError:
        logging.info("Downloading NLTK stopwords...")
        nltk.download("stopwords")


def load_spacy_model(model_name="en_core_web_sm"):
    try:
        nlp = spacy.load(model_name)
    except OSError:
        logging.info(f"spaCy model {model_name} not found. Attempting to download...")
        from spacy.cli import download

        download(model_name)
        nlp = spacy.load(model_name)
    return nlp


def sanitize_text(text: str) -> str:
    if pd.isna(text):
        return ""
    # remove newlines and weird whitespace
    return re.sub(r"\s+", " ", str(text)).strip()


def detect_candidate_columns(df: pd.DataFrame):
    """Detect artifact column and identifier columns if not explicitly provided."""
    col_lower = {c.lower(): c for c in df.columns}

    artifact_candidates = [
        "unique artifact",
        "cultural artifact",
        "artifact",
        "unique_artifact",
        "cultural artifact",
    ]
    artifact_col = None
    for cand in artifact_candidates:
        if cand in col_lower:
            artifact_col = col_lower[cand]
            break

    # identifier columns: any column whose name starts with 'identifier' (case-insensitive)
    identifier_cols = [c for c in df.columns if c.lower().startswith("identifier")]

    # fallback: look for numbered identifier columns
    if not identifier_cols:
        for i in range(1, 6):
            name = f"identifier{i}"
            if name in col_lower:
                identifier_cols.append(col_lower[name])

    # final fallback: pick columns likely to contain short descriptor text
    if not artifact_col:
        possible = [c for c in df.columns if "artifact" in c.lower() or "unique" in c.lower()]
        artifact_col = possible[0] if possible else df.columns[0]

    return artifact_col, identifier_cols


# --------------------------- NER + cleaning ---------------------------

def extract_named_entities_from_text(nlp, text, accepted_labels=None):
    """Return a list of extracted entity strings from text using spaCy NER.
    accepted_labels: list of spaCy entity labels to keep (e.g., ['PERSON','GPE','ORG','LOC']).
    If None, keep most labels except DATE, TIME, PERCENT, MONEY, QUANTITY, ORDINAL, CARDINAL.
    """

    if not text:
        return []

    doc = nlp(text)
    entities = []
    for ent in doc.ents:
        if accepted_labels:
            if ent.label_ in accepted_labels:
                entities.append(ent.text)
        else:
            if ent.label_ not in {"DATE", "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL"}:
                entities.append(ent.text)
    return entities


def clean_entity_token(token: str, stop_words_set, exclude_names_set):
    t = token.strip()
    # remove surrounding punctuation
    t = re.sub(r"^[\W_]+|[\W_]+$", "", t)
    # remove tokens that are pure digits or too short
    if not t:
        return None
    if re.fullmatch(r"\d+", t):
        return None
    if len(t) <= 1:
        return None
    low = t.lower()
    if low in stop_words_set:
        return None
    # remove tokens that are exactly in exclude list
    if low in exclude_names_set:
        return None
    # remove tokens that look like column headers
    if re.match(r"^identifier\d*$", low):
        return None
    return t


# --------------------------- Wordcloud + plotting ---------------------------


def save_wordcloud_from_freq(freq_dict, title, output_path, width=1600, height=800):
    if not freq_dict:
        logging.warning(f"No data to generate wordcloud for {title}")
        return None

    wc = WordCloud(width=width, height=height, background_color="white", collocations=False)
    wc.generate_from_frequencies(freq_dict)
    plt.figure(figsize=(width / 200, height / 200))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(title, fontsize=18)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    plt.tight_layout()
    plt.savefig(output_path, dpi=150)
    plt.close()
    logging.info(f"Saved wordcloud: {output_path}")
    return output_path


# --------------------------- Main processing ---------------------------


def process_file(
    input_csv,
    output_dir,
    identifier_cols=None,
    artifact_col=None,
    exclude_names=None,
    accepted_entity_labels=None,
    top_k=500,
):
    setup_logging()
    ensure_nltk_stopwords()

    stop_words_set = set(stopwords.words("english"))

    # Default names to exclude (user requested) + common first names that may appear as noise.
    default_exclude = {
        "anjali",
        "shivani",
        "shampy",
        "sahili",
        "shivangi",
        "anjalisingh",
    }
    if exclude_names:
        exclude_names_set = set([n.strip().lower() for n in exclude_names]) | default_exclude
    else:
        exclude_names_set = default_exclude

    logging.info("Loading spaCy model...")
    nlp = load_spacy_model()

    logging.info(f"Reading CSV file: {input_csv}")
    # try common encodings
    try:
        df = pd.read_csv(input_csv)
    except Exception as e:
        logging.warning(f"Default read failed: {e}. Trying ISO-8859-1...")
        try:
            df = pd.read_csv(input_csv, encoding="ISO-8859-1")
        except Exception as e2:
            logging.error(f"Failed to read CSV: {e2}")
            raise

    original_columns = list(df.columns)
    logging.info(f"Columns detected: {original_columns}")

    # detect artifact and identifier columns if not provided
    detected_artifact_col, detected_identifier_cols = detect_candidate_columns(df)
    artifact_col = artifact_col or detected_artifact_col
    if identifier_cols:
        # verify provided columns exist
        identifier_cols = [c for c in identifier_cols if c in df.columns]
    else:
        identifier_cols = detected_identifier_cols

    logging.info(f"Using artifact column: {artifact_col}")
    logging.info(f"Using identifier columns: {identifier_cols}")

    # prepare output dir
    os.makedirs(output_dir, exist_ok=True)

    # --- Unique artifacts word frequencies ---
    logging.info("Computing unique artifacts frequencies...")
    artifacts = df[artifact_col].dropna().astype(str).apply(sanitize_text)
    artifact_freq = Counter()
    artifact_unique_set = set()
    for art in artifacts:
        if not art:
            continue
        artifact_freq[art] += 1
        artifact_unique_set.add(art)

    artifact_token_freq = Counter()
    artifact_whole_freq = Counter()
    for art, cnt in artifact_freq.items():
        artifact_whole_freq[art.replace(" ", "_")] += cnt
        for token in re.split(r"[\s,/|;-]+", art):
            t = token.strip()
            tclean = clean_entity_token(t, stop_words_set, exclude_names_set)
            if tclean:
                artifact_token_freq[tclean] += cnt

    # --- Identifier columns: collect NERs across those columns ---
    logging.info("Extracting named entities from identifier columns...")
    all_ident_text = []
    for col in identifier_cols:
        if col not in df.columns:
            continue
        text_series = df[col].fillna("").astype(str).apply(sanitize_text)
        all_ident_text.extend(text_series.tolist())

    # We'll concatenate texts into chunks to avoid overhead of calling spaCy on extremely long single string.
    combined_texts = []
    chunk_size = 200  # number of rows per spaCy pass; adjustable
    for i in range(0, len(all_ident_text), chunk_size):
        combined_texts.append(" ".join(all_ident_text[i : i + chunk_size]))

    entity_counter = Counter()
    for chunk in tqdm(combined_texts, desc="spaCy NER chunks"):
        ents = extract_named_entities_from_text(nlp, chunk, accepted_labels=accepted_entity_labels)
        for ent in ents:
            # clean entity using same function, but keep multi-word entities
            cleaned_phrase_tokens = []
            for token in re.split(r"[\s,/|;:-]+", ent):
                cleaned = clean_entity_token(token, stop_words_set, exclude_names_set)
                if cleaned:
                    cleaned_phrase_tokens.append(cleaned)

            # if phrase tokens exist, count both full phrase and tokens
            if cleaned_phrase_tokens:
                phrase = " ".join(cleaned_phrase_tokens)
                entity_counter[phrase] += 1
                for t in cleaned_phrase_tokens:
                    entity_counter[t] += 1

    # Remove any residual column-names or extremely generic tokens from entity_counter
    for bad in ["identifier", "identifiers"]:
        if bad in entity_counter:
            del entity_counter[bad]

    # limit size
    most_common_entities = dict(entity_counter.most_common(top_k))

    # combined frequencies: artifact tokens + entities
    combined_freq = Counter()
    combined_freq.update(artifact_token_freq)
    combined_freq.update(most_common_entities)

    # ---------------- Save outputs ----------------
    # Wordclouds
    logging.info("Generating wordclouds and saving results...")
    artifacts_wordcloud_path = os.path.join(output_dir, "artifacts_whole_wordcloud.png")
    save_wordcloud_from_freq(dict(artifact_whole_freq.most_common(1000)), "Artifacts (whole names)", artifacts_wordcloud_path)

    artifact_token_wordcloud_path = os.path.join(output_dir, "artifacts_tokens_wordcloud.png")
    save_wordcloud_from_freq(dict(artifact_token_freq.most_common(1000)), "Artifacts (tokens)", artifact_token_wordcloud_path)

    entities_wordcloud_path = os.path.join(output_dir, "identifiers_named_entities_wordcloud.png")
    save_wordcloud_from_freq(most_common_entities, "Identifier Named Entities", entities_wordcloud_path)

    combined_wordcloud_path = os.path.join(output_dir, "combined_artifacts_entities_wordcloud.png")
    save_wordcloud_from_freq(dict(combined_freq.most_common(1500)), "Combined Artifacts + Entities", combined_wordcloud_path)

    # Save frequency CSVs
    logging.info("Saving frequency CSVs...")
    pd.DataFrame(artifact_freq.most_common(), columns=["artifact_name", "count"]).to_csv(
        os.path.join(output_dir, "artifact_freq.csv"), index=False
    )
    pd.DataFrame(most_common_entities.items(), columns=["entity", "count"]).to_csv(
        os.path.join(output_dir, "identifier_entity_freq.csv"), index=False
    )
    pd.DataFrame(combined_freq.most_common(), columns=["token", "count"]).to_csv(
        os.path.join(output_dir, "combined_token_freq.csv"), index=False
    )

    logging.info("Processing complete. Outputs saved to: %s", os.path.abspath(output_dir))
    return {
        "artifact_whole_wordcloud": artifacts_wordcloud_path,
        "artifact_token_wordcloud": artifact_token_wordcloud_path,
        "entities_wordcloud": entities_wordcloud_path,
        "combined_wordcloud": combined_wordcloud_path,
        "artifact_freq_csv": os.path.join(output_dir, "artifact_freq.csv"),
        "entity_freq_csv": os.path.join(output_dir, "identifier_entity_freq.csv"),
    }


# --------------------------- CLI ---------------------------


def parse_args():
    p = argparse.ArgumentParser(description="Generate wordclouds from cultural artifact CSV.")
    # make optional here and validate later so script is notebook-friendly
    p.add_argument("--input", "-i", help="Input CSV file path")
    p.add_argument("--output_dir", "-o", help="Output directory to save wordclouds and CSVs")
    p.add_argument(
        "--identifier_cols",
        "-id",
        nargs="*",
        help="Identifier column names (space separated). If omitted, script auto-detects columns starting with 'Identifier'",
    )
    p.add_argument(
        "--artifact_col",
        "-a",
        help="Artifact column name. If omitted, script will attempt to auto-detect common artifact column names.",
    )
    p.add_argument(
        "--exclude_names",
        "-e",
        nargs="*",
        help="List of names to exclude from entity tokens (space separated).",
    )
    p.add_argument(
        "--top_k",
        type=int,
        default=500,
        help="How many top entities to keep for generating wordclouds (default: 500)",
    )
    return p.parse_args()


def create_sample_csv(path):
    logging.info("Creating a tiny sample CSV for demo purposes at: %s", path)
    sample = pd.DataFrame(
        {
            "Unique Artifact": [
                "Terracotta horse",
                "Bronze bell",
                "Stone sculpture of Vishnu",
                "Handloom sari",
                "Batik textile",
            ],
            "Identifier1": [
                "Patna district, Bihar",
                "Pataliputra museum",
                "8th century temple",
                "Weaver: Anjali Singh",
                "Maker: Shivani Rao",
            ],
            "Identifier2": [
                "Terracotta",
                "Bronze",
                "Stone",
                "Textile",
                "Textile",
            ],
        }
    )
    sample.to_csv(path, index=False)
    return path


def main():
    setup_logging()
    args = parse_args()

    # If user didn't pass CLI args and we're in an interactive environment, try to auto-fill sensible defaults
    is_interactive = False
    try:
        # presence of get_ipython indicates notebook/ipython
        is_interactive = "get_ipython" in globals()
    except Exception:
        is_interactive = False

    input_csv = args.input
    output_dir = args.output_dir or "./wordcloud_output"
    identifier_cols = args.identifier_cols
    artifact_col = args.artifact_col
    exclude_names = args.exclude_names
    top_k = args.top_k

    # If no input provided, try to find a CSV in cwd (useful for notebook runs)
    if not input_csv:
        csvs = glob.glob("*.csv")
        if csvs:
            input_csv = csvs[0]
            logging.info("No --input provided. Auto-using first CSV in cwd: %s", input_csv)
        elif is_interactive:
            # create a sample CSV in cwd for quick testing
            sample_path = os.path.join(os.getcwd(), "sample_cultural_artifacts_demo.csv")
            input_csv = create_sample_csv(sample_path)
            logging.info("No CSV found in cwd; using generated sample CSV: %s", input_csv)
        else:
            logging.error("No --input provided and not in interactive mode. Please provide --input and --output_dir.")
            print("Example CLI usage:\n  python script.py --input /path/to/file.csv --output_dir /path/to/outdir")
            sys.exit(2)

    if not output_dir:
        # already set above, but ensure non-empty
        output_dir = "./wordcloud_output"
        logging.info("No --output_dir provided; defaulting to %s", output_dir)

    result = process_file(
        input_csv=input_csv,
        output_dir=output_dir,
        identifier_cols=identifier_cols,
        artifact_col=artifact_col,
        exclude_names=exclude_names,
        top_k=top_k,
    )

    logging.info("Finished. Result paths: %s", result)


if __name__ == "__main__":
    main()



usage: ipykernel_launcher.py [-h] [--input INPUT] [--output_dir OUTPUT_DIR]
                             [--identifier_cols [IDENTIFIER_COLS ...]]
                             [--artifact_col ARTIFACT_COL]
                             [--exclude_names [EXCLUDE_NAMES ...]]
                             [--top_k TOP_K]
ipykernel_launcher.py: error: unrecognized arguments: --f=/Users/anjalisingh/Library/Jupyter/runtime/kernel-v3c837de481c8781c0b020b95ef94903aa168f27d4.json


SystemExit: 2

In [3]:
#!/usr/bin/env python3
"""
Production-ready script to generate wordclouds from a cultural artifacts dataset.

Usage (CLI):
    python wordcloud_generator_for_cultural_artifacts.py \
        --input "/path/to/cultural-artifact-gold-sheet-final.csv" \
        --output_dir "/path/to/output/dir" \
        --identifier_cols Identifier1 Identifier2

Or in a Jupyter notebook: just run the script; it will auto-detect a CSV in cwd or
create a tiny sample CSV to demonstrate functionality.

Requirements:
    pip install pandas spacy wordcloud matplotlib nltk tqdm
    python -m spacy download en_core_web_sm
"""

import argparse
import os
import re
import sys
import logging
import glob
from collections import Counter

import pandas as pd
from wordcloud import WordCloud
import matplotlib
# Use non-interactive backend for headless environments
matplotlib.use("Agg")
import matplotlib.pyplot as plt

# spaCy and NLTK
import spacy
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm

# --------------------------- Helper utilities ---------------------------

def setup_logging():
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        handlers=[logging.StreamHandler(sys.stdout)],
    )


def ensure_nltk_stopwords():
    try:
        nltk.data.find("corpora/stopwords")
    except LookupError:
        logging.info("Downloading NLTK stopwords...")
        nltk.download("stopwords")


def load_spacy_model(model_name="en_core_web_sm"):
    try:
        nlp = spacy.load(model_name)
    except OSError:
        logging.info(f"spaCy model {model_name} not found. Attempting to download...")
        from spacy.cli import download

        download(model_name)
        nlp = spacy.load(model_name)
    return nlp


def sanitize_text(text: str) -> str:
    if pd.isna(text):
        return ""
    # remove newlines and weird whitespace
    return re.sub(r"\s+", " ", str(text)).strip()


def detect_candidate_columns(df: pd.DataFrame):
    """Detect artifact column and identifier columns if not explicitly provided."""
    col_lower = {c.lower(): c for c in df.columns}

    artifact_candidates = [
        "unique artifact",
        "cultural artifact",
        "artifact",
        "unique_artifact",
        "cultural artifact",
    ]
    artifact_col = None
    for cand in artifact_candidates:
        if cand in col_lower:
            artifact_col = col_lower[cand]
            break

    # identifier columns: any column whose name starts with 'identifier' (case-insensitive)
    identifier_cols = [c for c in df.columns if c.lower().startswith("identifier")]

    # fallback: look for numbered identifier columns
    if not identifier_cols:
        for i in range(1, 6):
            name = f"identifier{i}"
            if name in col_lower:
                identifier_cols.append(col_lower[name])

    # final fallback: pick columns likely to contain short descriptor text
    if not artifact_col:
        possible = [c for c in df.columns if "artifact" in c.lower() or "unique" in c.lower()]
        artifact_col = possible[0] if possible else df.columns[0]

    return artifact_col, identifier_cols


# --------------------------- NER + cleaning ---------------------------

def extract_named_entities_from_text(nlp, text, accepted_labels=None):
    """Return a list of extracted entity strings from text using spaCy NER.
    accepted_labels: list of spaCy entity labels to keep (e.g., ['PERSON','GPE','ORG','LOC']).
    If None, keep most labels except DATE, TIME, PERCENT, MONEY, QUANTITY, ORDINAL, CARDINAL.
    """

    if not text:
        return []

    doc = nlp(text)
    entities = []
    for ent in doc.ents:
        if accepted_labels:
            if ent.label_ in accepted_labels:
                entities.append(ent.text)
        else:
            if ent.label_ not in {"DATE", "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL"}:
                entities.append(ent.text)
    return entities


def clean_entity_token(token: str, stop_words_set, exclude_names_set):
    t = token.strip()
    # remove surrounding punctuation
    t = re.sub(r"^[\W_]+|[\W_]+$", "", t)
    # remove tokens that are pure digits or too short
    if not t:
        return None
    if re.fullmatch(r"\d+", t):
        return None
    if len(t) <= 1:
        return None
    low = t.lower()
    if low in stop_words_set:
        return None
    # remove tokens that are exactly in exclude list
    if low in exclude_names_set:
        return None
    # remove tokens that look like column headers
    if re.match(r"^identifier\d*$", low):
        return None
    return t


# --------------------------- Wordcloud + plotting ---------------------------


def save_wordcloud_from_freq(freq_dict, title, output_path, width=1600, height=800):
    if not freq_dict:
        logging.warning(f"No data to generate wordcloud for {title}")
        return None

    wc = WordCloud(width=width, height=height, background_color="white", collocations=False)
    wc.generate_from_frequencies(freq_dict)
    plt.figure(figsize=(width / 200, height / 200))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(title, fontsize=18)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    plt.tight_layout()
    plt.savefig(output_path, dpi=150)
    plt.close()
    logging.info(f"Saved wordcloud: {output_path}")
    return output_path


# --------------------------- Main processing ---------------------------


def process_file(
    input_csv,
    output_dir,
    identifier_cols=None,
    artifact_col=None,
    exclude_names=None,
    accepted_entity_labels=None,
    top_k=500,
):
    setup_logging()
    ensure_nltk_stopwords()

    stop_words_set = set(stopwords.words("english"))

    # Default names to exclude (user requested) + common first names that may appear as noise.
    default_exclude = {
        "anjali",
        "shivani",
        "shampy",
        "sahili",
        "shivangi",
        "anjalisingh",
    }
    if exclude_names:
        exclude_names_set = set([n.strip().lower() for n in exclude_names]) | default_exclude
    else:
        exclude_names_set = default_exclude

    logging.info("Loading spaCy model...")
    nlp = load_spacy_model()

    logging.info(f"Reading CSV file: {input_csv}")
    # try common encodings
    try:
        df = pd.read_csv(input_csv)
    except Exception as e:
        logging.warning(f"Default read failed: {e}. Trying ISO-8859-1...")
        try:
            df = pd.read_csv(input_csv, encoding="ISO-8859-1")
        except Exception as e2:
            logging.error(f"Failed to read CSV: {e2}")
            raise

    original_columns = list(df.columns)
    logging.info(f"Columns detected: {original_columns}")

    # detect artifact and identifier columns if not provided
    detected_artifact_col, detected_identifier_cols = detect_candidate_columns(df)
    artifact_col = artifact_col or detected_artifact_col
    if identifier_cols:
        # verify provided columns exist
        identifier_cols = [c for c in identifier_cols if c in df.columns]
    else:
        identifier_cols = detected_identifier_cols

    logging.info(f"Using artifact column: {artifact_col}")
    logging.info(f"Using identifier columns: {identifier_cols}")

    # prepare output dir
    os.makedirs(output_dir, exist_ok=True)

    # --- Unique artifacts word frequencies ---
    logging.info("Computing unique artifacts frequencies...")
    artifacts = df[artifact_col].dropna().astype(str).apply(sanitize_text)
    artifact_freq = Counter()
    artifact_unique_set = set()
    for art in artifacts:
        if not art:
            continue
        artifact_freq[art] += 1
        artifact_unique_set.add(art)

    artifact_token_freq = Counter()
    artifact_whole_freq = Counter()
    for art, cnt in artifact_freq.items():
        artifact_whole_freq[art.replace(" ", "_")] += cnt
        for token in re.split(r"[\s,/|;-]+", art):
            t = token.strip()
            tclean = clean_entity_token(t, stop_words_set, exclude_names_set)
            if tclean:
                artifact_token_freq[tclean] += cnt

    # --- Identifier columns: collect NERs across those columns ---
    logging.info("Extracting named entities from identifier columns...")
    all_ident_text = []
    for col in identifier_cols:
        if col not in df.columns:
            continue
        text_series = df[col].fillna("").astype(str).apply(sanitize_text)
        all_ident_text.extend(text_series.tolist())

    # We'll concatenate texts into chunks to avoid overhead of calling spaCy on extremely long single string.
    combined_texts = []
    chunk_size = 200  # number of rows per spaCy pass; adjustable
    for i in range(0, len(all_ident_text), chunk_size):
        combined_texts.append(" ".join(all_ident_text[i : i + chunk_size]))

    entity_counter = Counter()
    for chunk in tqdm(combined_texts, desc="spaCy NER chunks"):
        ents = extract_named_entities_from_text(nlp, chunk, accepted_labels=accepted_entity_labels)
        for ent in ents:
            # clean entity using same function, but keep multi-word entities
            cleaned_phrase_tokens = []
            for token in re.split(r"[\s,/|;:-]+", ent):
                cleaned = clean_entity_token(token, stop_words_set, exclude_names_set)
                if cleaned:
                    cleaned_phrase_tokens.append(cleaned)

            # if phrase tokens exist, count both full phrase and tokens
            if cleaned_phrase_tokens:
                phrase = " ".join(cleaned_phrase_tokens)
                entity_counter[phrase] += 1
                for t in cleaned_phrase_tokens:
                    entity_counter[t] += 1

    # Remove any residual column-names or extremely generic tokens from entity_counter
    for bad in ["identifier", "identifiers"]:
        if bad in entity_counter:
            del entity_counter[bad]

    # limit size
    most_common_entities = dict(entity_counter.most_common(top_k))

    # combined frequencies: artifact tokens + entities
    combined_freq = Counter()
    combined_freq.update(artifact_token_freq)
    combined_freq.update(most_common_entities)

    # ---------------- Save outputs ----------------
    # Wordclouds
    logging.info("Generating wordclouds and saving results...")
    artifacts_wordcloud_path = os.path.join(output_dir, "artifacts_whole_wordcloud.png")
    save_wordcloud_from_freq(dict(artifact_whole_freq.most_common(1000)), "Artifacts (whole names)", artifacts_wordcloud_path)

    artifact_token_wordcloud_path = os.path.join(output_dir, "artifacts_tokens_wordcloud.png")
    save_wordcloud_from_freq(dict(artifact_token_freq.most_common(1000)), "Artifacts (tokens)", artifact_token_wordcloud_path)

    entities_wordcloud_path = os.path.join(output_dir, "identifiers_named_entities_wordcloud.png")
    save_wordcloud_from_freq(most_common_entities, "Identifier Named Entities", entities_wordcloud_path)

    combined_wordcloud_path = os.path.join(output_dir, "combined_artifacts_entities_wordcloud.png")
    save_wordcloud_from_freq(dict(combined_freq.most_common(1500)), "Combined Artifacts + Entities", combined_wordcloud_path)

    # Save frequency CSVs
    logging.info("Saving frequency CSVs...")
    pd.DataFrame(artifact_freq.most_common(), columns=["artifact_name", "count"]).to_csv(
        os.path.join(output_dir, "artifact_freq.csv"), index=False
    )
    pd.DataFrame(most_common_entities.items(), columns=["entity", "count"]).to_csv(
        os.path.join(output_dir, "identifier_entity_freq.csv"), index=False
    )
    pd.DataFrame(combined_freq.most_common(), columns=["token", "count"]).to_csv(
        os.path.join(output_dir, "combined_token_freq.csv"), index=False
    )

    logging.info("Processing complete. Outputs saved to: %s", os.path.abspath(output_dir))
    return {
        "artifact_whole_wordcloud": artifacts_wordcloud_path,
        "artifact_token_wordcloud": artifact_token_wordcloud_path,
        "entities_wordcloud": entities_wordcloud_path,
        "combined_wordcloud": combined_wordcloud_path,
        "artifact_freq_csv": os.path.join(output_dir, "artifact_freq.csv"),
        "entity_freq_csv": os.path.join(output_dir, "identifier_entity_freq.csv"),
    }


# --------------------------- CLI ---------------------------


def parse_args():
    p = argparse.ArgumentParser(description="Generate wordclouds from cultural artifact CSV.")
    # make optional here and validate later so script is notebook-friendly
    p.add_argument("--input", "-i", help="Input CSV file path")
    p.add_argument("--output_dir", "-o", help="Output directory to save wordclouds and CSVs")
    p.add_argument(
        "--identifier_cols",
        "-id",
        nargs="*",
        help="Identifier column names (space separated). If omitted, script auto-detects columns starting with 'Identifier'",
    )
    p.add_argument(
        "--artifact_col",
        "-a",
        help="Artifact column name. If omitted, script will attempt to auto-detect common artifact column names.",
    )
    p.add_argument(
        "--exclude_names",
        "-e",
        nargs="*",
        help="List of names to exclude from entity tokens (space separated).",
    )
    p.add_argument(
        "--top_k",
        type=int,
        default=500,
        help="How many top entities to keep for generating wordclouds (default: 500)",
    )
    return p.parse_args()


def create_sample_csv(path):
    logging.info("Creating a tiny sample CSV for demo purposes at: %s", path)
    sample = pd.DataFrame(
        {
            "Unique Artifact": [
                "Terracotta horse",
                "Bronze bell",
                "Stone sculpture of Vishnu",
                "Handloom sari",
                "Batik textile",
            ],
            "Identifier1": [
                "Patna district, Bihar",
                "Pataliputra museum",
                "8th century temple",
                "Weaver: Anjali Singh",
                "Maker: Shivani Rao",
            ],
            "Identifier2": [
                "Terracotta",
                "Bronze",
                "Stone",
                "Textile",
                "Textile",
            ],
        }
    )
    sample.to_csv(path, index=False)
    return path


def main():
    setup_logging()
    args = parse_args()

    # If user didn't pass CLI args and we're in an interactive environment, try to auto-fill sensible defaults
    is_interactive = False
    try:
        # presence of get_ipython indicates notebook/ipython
        is_interactive = "get_ipython" in globals()
    except Exception:
        is_interactive = False

    input_csv = args.input
    output_dir = args.output_dir or "./wordcloud_output"
    identifier_cols = args.identifier_cols
    artifact_col = args.artifact_col
    exclude_names = args.exclude_names
    top_k = args.top_k

    # If no input provided, try to find a CSV in cwd (useful for notebook runs)
    if not input_csv:
        csvs = glob.glob("*.csv")
        if csvs:
            input_csv = csvs[0]
            logging.info("No --input provided. Auto-using first CSV in cwd: %s", input_csv)
        elif is_interactive:
            # create a sample CSV in cwd for quick testing
            sample_path = os.path.join(os.getcwd(), "sample_cultural_artifacts_demo.csv")
            input_csv = create_sample_csv(sample_path)
            logging.info("No CSV found in cwd; using generated sample CSV: %s", input_csv)
        else:
            logging.error("No --input provided and not in interactive mode. Please provide --input and --output_dir.")
            print("Example CLI usage:\n  python script.py --input /path/to/file.csv --output_dir /path/to/outdir")
            sys.exit(2)

    if not output_dir:
        # already set above, but ensure non-empty
        output_dir = "./wordcloud_output"
        logging.info("No --output_dir provided; defaulting to %s", output_dir)

    result = process_file(
        input_csv=input_csv,
        output_dir=output_dir,
        identifier_cols=identifier_cols,
        artifact_col=artifact_col,
        exclude_names=exclude_names,
        top_k=top_k,
    )

    logging.info("Finished. Result paths: %s", result)


if __name__ == "__main__":
    main()


usage: ipykernel_launcher.py [-h] [--input INPUT] [--output_dir OUTPUT_DIR]
                             [--identifier_cols [IDENTIFIER_COLS ...]]
                             [--artifact_col ARTIFACT_COL]
                             [--exclude_names [EXCLUDE_NAMES ...]]
                             [--top_k TOP_K]
ipykernel_launcher.py: error: unrecognized arguments: --f=/Users/anjalisingh/Library/Jupyter/runtime/kernel-v3c837de481c8781c0b020b95ef94903aa168f27d4.json


SystemExit: 2

In [4]:
#!/usr/bin/env python3
"""
Production-ready script to generate wordclouds from a cultural artifacts dataset.

Usage (CLI):
    python wordcloud_generator_for_cultural_artifacts.py \
        --input "/path/to/cultural-artifact-gold-sheet-final.csv" \
        --output_dir "/path/to/output/dir" \
        --identifier_cols Identifier1 Identifier2

In Jupyter/interactive sessions the script will ignore kernel-injected args and:
 - auto-use the first CSV in cwd if present, OR
 - create a small sample CSV and run on it for demo.

Requirements:
    pip install pandas spacy wordcloud matplotlib nltk tqdm
    python -m spacy download en_core_web_sm
"""

import argparse
import os
import re
import sys
import logging
import glob
from collections import Counter

import pandas as pd
from wordcloud import WordCloud
import matplotlib
# use non-interactive backend so script works headless
matplotlib.use("Agg")
import matplotlib.pyplot as plt

import spacy
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm

# --------------------------- Helper utilities ---------------------------

def setup_logging():
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

def ensure_nltk_stopwords():
    try:
        nltk.data.find("corpora/stopwords")
    except LookupError:
        logging.info("Downloading NLTK stopwords...")
        nltk.download("stopwords")

def load_spacy_model(model_name="en_core_web_sm"):
    try:
        nlp = spacy.load(model_name)
    except OSError:
        logging.info(f"spaCy model {model_name} not found. Attempting to download...")
        from spacy.cli import download
        download(model_name)
        nlp = spacy.load(model_name)
    return nlp

def sanitize_text(text: str) -> str:
    if pd.isna(text):
        return ""
    return re.sub(r"\s+", " ", str(text)).strip()

def detect_candidate_columns(df: pd.DataFrame):
    """Detect artifact column and identifier columns if not explicitly provided."""
    col_lower = {c.lower(): c for c in df.columns}

    artifact_candidates = [
        "unique artifact",
        "cultural artifact",
        "artifact",
        "unique_artifact",
        "cultural artifact",
    ]
    artifact_col = None
    for cand in artifact_candidates:
        if cand in col_lower:
            artifact_col = col_lower[cand]
            break

    # identifier columns: any column whose name starts with 'identifier' (case-insensitive)
    identifier_cols = [c for c in df.columns if c.lower().startswith("identifier")]

    # fallback: look for numbered identifier columns
    if not identifier_cols:
        for i in range(1, 6):
            name = f"identifier{i}"
            if name in col_lower:
                identifier_cols.append(col_lower[name])

    # final fallback: pick columns likely to contain short descriptor text
    if not artifact_col:
        possible = [c for c in df.columns if "artifact" in c.lower() or "unique" in c.lower()]
        artifact_col = possible[0] if possible else df.columns[0]

    return artifact_col, identifier_cols

# --------------------------- NER + cleaning ---------------------------

def extract_named_entities_from_text(nlp, text, accepted_labels=None):
    if not text:
        return []
    doc = nlp(text)
    entities = []
    for ent in doc.ents:
        if accepted_labels:
            if ent.label_ in accepted_labels:
                entities.append(ent.text)
        else:
            if ent.label_ not in {"DATE", "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL"}:
                entities.append(ent.text)
    return entities

def clean_entity_token(token: str, stop_words_set, exclude_names_set):
    t = token.strip()
    t = re.sub(r"^[\W_]+|[\W_]+$", "", t)
    if not t:
        return None
    if re.fullmatch(r"\d+", t):
        return None
    if len(t) <= 1:
        return None
    low = t.lower()
    if low in stop_words_set:
        return None
    if low in exclude_names_set:
        return None
    if re.match(r"^identifier\d*$", low):
        return None
    return t

# --------------------------- Wordcloud + plotting ---------------------------

def save_wordcloud_from_freq(freq_dict, title, output_path, width=1600, height=800):
    if not freq_dict:
        logging.warning(f"No data to generate wordcloud for {title}")
        return None

    wc = WordCloud(width=width, height=height, background_color="white", collocations=False)
    wc.generate_from_frequencies(freq_dict)
    plt.figure(figsize=(width / 200, height / 200))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(title, fontsize=18)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    plt.tight_layout()
    plt.savefig(output_path, dpi=150)
    plt.close()
    logging.info(f"Saved wordcloud: {output_path}")
    return output_path

# --------------------------- Main processing ---------------------------

def process_file(
    input_csv,
    output_dir,
    identifier_cols=None,
    artifact_col=None,
    exclude_names=None,
    accepted_entity_labels=None,
    top_k=500,
):
    setup_logging()
    ensure_nltk_stopwords()

    stop_words_set = set(stopwords.words("english"))

    # Default exclude names (add more if needed)
    default_exclude = {
        "anjali",
        "shivani",
        "shampy",
        "sahili",
        "shivangi",
        "anjalisingh",
    }
    exclude_names_set = (set(n.strip().lower() for n in exclude_names) | default_exclude) if exclude_names else default_exclude

    logging.info("Loading spaCy model...")
    nlp = load_spacy_model()

    logging.info(f"Reading CSV file: {input_csv}")
    try:
        df = pd.read_csv(input_csv)
    except Exception as e:
        logging.warning(f"Default read failed: {e}. Trying ISO-8859-1...")
        try:
            df = pd.read_csv(input_csv, encoding="ISO-8859-1")
        exce


SyntaxError: expected 'except' or 'finally' block (3596940498.py, line 196)

In [5]:
#!/usr/bin/env python3
"""
Production-ready script to generate wordclouds from a cultural artifacts dataset.

Usage (CLI):
    python wordcloud_generator_for_cultural_artifacts.py \
        --input "/path/to/cultural-artifact-gold-sheet-final.csv" \
        --output_dir "/path/to/output/dir" \
        --identifier_cols Identifier1 Identifier2

In Jupyter/interactive sessions the script will ignore kernel-injected args and:
 - auto-use the first CSV in cwd if present, OR
 - create a small sample CSV and run on it for demo.

Requirements:
    pip install pandas spacy wordcloud matplotlib nltk tqdm
    python -m spacy download en_core_web_sm
"""

import argparse
import os
import re
import sys
import logging
import glob
from collections import Counter

import pandas as pd
from wordcloud import WordCloud
import matplotlib
# use non-interactive backend so script works headless
matplotlib.use("Agg")
import matplotlib.pyplot as plt

import spacy
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm

# --------------------------- Helper utilities ---------------------------

def setup_logging():
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

def ensure_nltk_stopwords():
    try:
        nltk.data.find("corpora/stopwords")
    except LookupError:
        logging.info("Downloading NLTK stopwords...")
        nltk.download("stopwords")

def load_spacy_model(model_name="en_core_web_sm"):
    try:
        nlp = spacy.load(model_name)
    except OSError:
        logging.info(f"spaCy model {model_name} not found. Attempting to download...")
        from spacy.cli import download
        download(model_name)
        nlp = spacy.load(model_name)
    return nlp

def sanitize_text(text: str) -> str:
    if pd.isna(text):
        return ""
    return re.sub(r"\s+", " ", str(text)).strip()

def detect_candidate_columns(df: pd.DataFrame):
    """Detect artifact column and identifier columns if not explicitly provided."""
    col_lower = {c.lower(): c for c in df.columns}

    artifact_candidates = [
        "unique artifact",
        "cultural artifact",
        "artifact",
        "unique_artifact",
        "cultural artifact",
    ]
    artifact_col = None
    for cand in artifact_candidates:
        if cand in col_lower:
            artifact_col = col_lower[cand]
            break

    # identifier columns: any column whose name starts with 'identifier' (case-insensitive)
    identifier_cols = [c for c in df.columns if c.lower().startswith("identifier")]

    # fallback: look for numbered identifier columns
    if not identifier_cols:
        for i in range(1, 6):
            name = f"identifier{i}"
            if name in col_lower:
                identifier_cols.append(col_lower[name])

    # final fallback: pick columns likely to contain short descriptor text
    if not artifact_col:
        possible = [c for c in df.columns if "artifact" in c.lower() or "unique" in c.lower()]
        artifact_col = possible[0] if possible else df.columns[0]

    return artifact_col, identifier_cols

# --------------------------- NER + cleaning ---------------------------

def extract_named_entities_from_text(nlp, text, accepted_labels=None):
    if not text:
        return []
    doc = nlp(text)
    entities = []
    for ent in doc.ents:
        if accepted_labels:
            if ent.label_ in accepted_labels:
                entities.append(ent.text)
        else:
            if ent.label_ not in {"DATE", "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL"}:
                entities.append(ent.text)
    return entities

def clean_entity_token(token: str, stop_words_set, exclude_names_set):
    t = token.strip()
    t = re.sub(r"^[\W_]+|[\W_]+$", "", t)
    if not t:
        return None
    if re.fullmatch(r"\d+", t):
        return None
    if len(t) <= 1:
        return None
    low = t.lower()
    if low in stop_words_set:
        return None
    if low in exclude_names_set:
        return None
    if re.match(r"^identifier\d*$", low):
        return None
    return t

# --------------------------- Wordcloud + plotting ---------------------------

def save_wordcloud_from_freq(freq_dict, title, output_path, width=1600, height=800):
    if not freq_dict:
        logging.warning(f"No data to generate wordcloud for {title}")
        return None

    wc = WordCloud(width=width, height=height, background_color="white", collocations=False)
    wc.generate_from_frequencies(freq_dict)
    plt.figure(figsize=(width / 200, height / 200))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(title, fontsize=18)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    plt.tight_layout()
    plt.savefig(output_path, dpi=150)
    plt.close()
    logging.info(f"Saved wordcloud: {output_path}")
    return output_path

# --------------------------- Main processing ---------------------------

def process_file(
    input_csv,
    output_dir,
    identifier_cols=None,
    artifact_col=None,
    exclude_names=None,
    accepted_entity_labels=None,
    top_k=500,
):
    setup_logging()
    ensure_nltk_stopwords()

    stop_words_set = set(stopwords.words("english"))

    # Default exclude names (add more if needed)
    default_exclude = {
        "anjali",
        "shivani",
        "shampy",
        "sahili",
        "shivangi",
        "anjalisingh",
    }
    exclude_names_set = (set(n.strip().lower() for n in exclude_names) | default_exclude) if exclude_names else default_exclude

    logging.info("Loading spaCy model...")
    nlp = load_spacy_model()

    logging.info(f"Reading CSV file: {input_csv}")
    try:
        df = pd.read_csv(input_csv)
    except Exception as e:
        logging.warning(f"Default read failed: {e}. Trying ISO-8859-1...")
        try:
            df = pd.read_csv(input_csv, encoding="ISO-8859-1")
        except Exception as e2:
            logging.error(f"Failed to read CSV: {e2}")
            raise

    original_columns = list(df.columns)
    logging.info(f"Columns detected: {original_columns}")

    detected_artifact_col, detected_identifier_cols = detect_candidate_columns(df)
    artifact_col = artifact_col or detected_artifact_col
    if identifier_cols:
        identifier_cols = [c for c in identifier_cols if c in df.columns]
    else:
        identifier_cols = detected_identifier_cols

    logging.info(f"Using artifact column: {artifact_col}")
    logging.info(f"Using identifier columns: {identifier_cols}")

    os.makedirs(output_dir, exist_ok=True)

    # --- Unique artifacts word frequencies ---
    logging.info("Computing unique artifacts frequencies...")
    artifacts = df[artifact_col].dropna().astype(str).apply(sanitize_text)
    artifact_freq = Counter()
    artifact_unique_set = set()
    for art in artifacts:
        if not art:
            continue
        artifact_freq[art] += 1
        artifact_unique_set.add(art)

    artifact_token_freq = Counter()
    artifact_whole_freq = Counter()
    for art, cnt in artifact_freq.items():
        artifact_whole_freq[art.replace(" ", "_")] += cnt
        for token in re.split(r"[\s,/|;-]+", art):
            t = token.strip()
            tclean = clean_entity_token(t, stop_words_set, exclude_names_set)
            if tclean:
                artifact_token_freq[tclean] += cnt

    # --- Identifier columns: collect NERs across those columns ---
    logging.info("Extracting named entities from identifier columns...")
    all_ident_text = []
    for col in identifier_cols:
        if col not in df.columns:
            continue
        text_series = df[col].fillna("").astype(str).apply(sanitize_text)
        all_ident_text.extend(text_series.tolist())

    combined_texts = []
    chunk_size = 200
    for i in range(0, len(all_ident_text), chunk_size):
        combined_texts.append(" ".join(all_ident_text[i : i + chunk_size]))

    entity_counter = Counter()
    for chunk in tqdm(combined_texts, desc="spaCy NER chunks"):
        ents = extract_named_entities_from_text(nlp, chunk, accepted_labels=accepted_entity_labels)
        for ent in ents:
            cleaned_phrase_tokens = []
            for token in re.split(r"[\s,/|;:-]+", ent):
                cleaned = clean_entity_token(token, stop_words_set, exclude_names_set)
                if cleaned:
                    cleaned_phrase_tokens.append(cleaned)
            if cleaned_phrase_tokens:
                phrase = " ".join(cleaned_phrase_tokens)
                entity_counter[phrase] += 1
                for t in cleaned_phrase_tokens:
                    entity_counter[t] += 1

    for bad in ["identifier", "identifiers"]:
        if bad in entity_counter:
            del entity_counter[bad]

    most_common_entities = dict(entity_counter.most_common(top_k))

    combined_freq = Counter()
    combined_freq.update(artifact_token_freq)
    combined_freq.update(most_common_entities)

    # ---------------- Save outputs ----------------
    logging.info("Generating wordclouds and saving results...")
    artifacts_wordcloud_path = os.path.join(output_dir, "artifacts_whole_wordcloud.png")
    save_wordcloud_from_freq(dict(artifact_whole_freq.most_common(1000)), "Artifacts (whole names)", artifacts_wordcloud_path)

    artifact_token_wordcloud_path = os.path.join(output_dir, "artifacts_tokens_wordcloud.png")
    save_wordcloud_from_freq(dict(artifact_token_freq.most_common(1000)), "Artifacts (tokens)", artifact_token_wordcloud_path)

    entities_wordcloud_path = os.path.join(output_dir, "identifiers_named_entities_wordcloud.png")
    save_wordcloud_from_freq(most_common_entities, "Identifier Named Entities", entities_wordcloud_path)

    combined_wordcloud_path = os.path.join(output_dir, "combined_artifacts_entities_wordcloud.png")
    save_wordcloud_from_freq(dict(combined_freq.most_common(1500)), "Combined Artifacts + Entities", combined_wordcloud_path)

    logging.info("Saving frequency CSVs...")
    pd.DataFrame(artifact_freq.most_common(), columns=["artifact_name", "count"]).to_csv(
        os.path.join(output_dir, "artifact_freq.csv"), index=False
    )
    pd.DataFrame(most_common_entities.items(), columns=["entity", "count"]).to_csv(
        os.path.join(output_dir, "identifier_entity_freq.csv"), index=False
    )
    pd.DataFrame(combined_freq.most_common(), columns=["token", "count"]).to_csv(
        os.path.join(output_dir, "combined_token_freq.csv"), index=False
    )

    logging.info("Processing complete. Outputs saved to: %s", os.path.abspath(output_dir))
    return {
        "artifact_whole_wordcloud": artifacts_wordcloud_path,
        "artifact_token_wordcloud": artifact_token_wordcloud_path,
        "entities_wordcloud": entities_wordcloud_path,
        "combined_wordcloud": combined_wordcloud_path,
        "artifact_freq_csv": os.path.join(output_dir, "artifact_freq.csv"),
        "entity_freq_csv": os.path.join(output_dir, "identifier_entity_freq.csv"),
    }

# --------------------------- CLI ---------------------------

def parse_args():
    p = argparse.ArgumentParser(description="Generate wordclouds from cultural artifact CSV.")
    p.add_argument("--input", "-i", help="Input CSV file path")
    p.add_argument("--output_dir", "-o", help="Output directory to save wordclouds and CSVs")
    p.add_argument(
        "--identifier_cols",
        "-id",
        nargs="*",
        help="Identifier column names (space separated). If omitted, script auto-detects columns starting with 'Identifier'",
    )
    p.add_argument(
        "--artifact_col",
        "-a",
        help="Artifact column name. If omitted, script will attempt to auto-detect common artifact column names.",
    )
    p.add_argument(
        "--exclude_names",
        "-e",
        nargs="*",
        help="List of names to exclude from entity tokens (space separated).",
    )
    p.add_argument(
        "--top_k",
        type=int,
        default=500,
        help="How many top entities to keep for generating wordclouds (default: 500)",
    )
    # Use parse_known_args to avoid crash from extra kernel args in interactive environments
    args, unknown = p.parse_known_args()
    if unknown:
        logging.getLogger().info(f"Ignoring unknown CLI args: {unknown}")
    return args

def create_sample_csv(path):
    logging.info("Creating a tiny sample CSV for demo purposes at: %s", path)
    sample = pd.DataFrame(
        {
            "Unique Artifact": [
                "Terracotta horse",
                "Bronze bell",
                "Stone sculpture of Vishnu",
                "Handloom sari",
                "Batik textile",
            ],
            "Identifier1": [
                "Patna district, Bihar",
                "Pataliputra museum",
                "8th century temple",
                "Weaver: Anjali Singh",
                "Maker: Shivani Rao",
            ],
            "Identifier2": [
                "Terracotta",
                "Bronze",
                "Stone",
                "Textile",
                "Textile",
            ],
        }
    )
    sample.to_csv(path, index=False)
    return path

def main():
    setup_logging()
    args = parse_args()

    # detect interactive environment (Jupyter/IPython)
    is_interactive = False
    try:
        is_interactive = "get_ipython" in globals()
    except Exception:
        is_interactive = False

    input_csv = args.input
    output_dir = args.output_dir or "./wordcloud_output"
    identifier_cols = args.identifier_cols
    artifact_col = args.artifact_col
    exclude_names = args.exclude_names
    top_k = args.top_k

    # If no input provided, try to find a CSV in cwd (useful for notebook runs)
    if not input_csv:
        csvs = glob.glob("*.csv")
        if csvs:
            input_csv = csvs[0]
            logging.info("No --input provided. Auto-using first CSV in cwd: %s", input_csv)
        elif is_interactive:
            sample_path = os.path.join(os.getcwd(), "sample_cultural_artifacts_demo.csv")
            input_csv = create_sample_csv(sample_path)
            logging.info("No CSV found in cwd; using generated sample CSV: %s", input_csv)
        else:
            logging.error("No --input provided and not in interactive mode. Please provide --input and --output_dir.")
            print("Example CLI usage:\n  python script.py --input /path/to/file.csv --output_dir /path/to/outdir")
            sys.exit(2)

    if not output_dir:
        output_dir = "./wordcloud_output"
        logging.info("No --output_dir provided; defaulting to %s", output_dir)

    # Run processing
    result = process_file(
        input_csv=input_csv,
        output_dir=output_dir,
        identifier_cols=identifier_cols,
        artifact_col=artifact_col,
        exclude_names=exclude_names,
        top_k=top_k,
    )

    logging.info("Finished. Result paths: %s", result)

if __name__ == "__main__":
    main()


2025-10-06 01:19:44,875 - INFO - Ignoring unknown CLI args: ['--f=/Users/anjalisingh/Library/Jupyter/runtime/kernel-v3c837de481c8781c0b020b95ef94903aa168f27d4.json']
2025-10-06 01:19:44,877 - INFO - No --input provided. Auto-using first CSV in cwd: Full Dataset - cultural-artifact-gold-sheet-final.csv
2025-10-06 01:19:44,882 - INFO - Loading spaCy model...
2025-10-06 01:19:45,414 - INFO - Reading CSV file: Full Dataset - cultural-artifact-gold-sheet-final.csv
2025-10-06 01:19:45,457 - INFO - Columns detected: ['Attribute', 'unique artifact', 'Specific Location', 'state', 'Identifier1', 'Identifier2', 'Identifier3', 'Identifier4', 'Old Identifier', 'Influence Locations', 'Image Link1', 'Image Link2', 'Unnamed: 12', 'Unnamed: 13', '{Identifier} famous throughout India, originated in {specific location}', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Attribute.1', 'state.1', 'Unnamed: 

spaCy NER chunks: 100%|██████████| 19/19 [00:08<00:00,  2.26it/s]

2025-10-06 01:19:53,919 - INFO - Generating wordclouds and saving results...





2025-10-06 01:19:56,148 - INFO - Saved wordcloud: ./wordcloud_output/artifacts_whole_wordcloud.png
2025-10-06 01:19:58,137 - INFO - Saved wordcloud: ./wordcloud_output/artifacts_tokens_wordcloud.png
2025-10-06 01:20:00,107 - INFO - Saved wordcloud: ./wordcloud_output/identifiers_named_entities_wordcloud.png
2025-10-06 01:20:02,126 - INFO - Saved wordcloud: ./wordcloud_output/combined_artifacts_entities_wordcloud.png
2025-10-06 01:20:02,126 - INFO - Saving frequency CSVs...
2025-10-06 01:20:02,135 - INFO - Processing complete. Outputs saved to: /Users/anjalisingh/Desktop/IITP/WordCloud/wordcloud_output
2025-10-06 01:20:02,139 - INFO - Finished. Result paths: {'artifact_whole_wordcloud': './wordcloud_output/artifacts_whole_wordcloud.png', 'artifact_token_wordcloud': './wordcloud_output/artifacts_tokens_wordcloud.png', 'entities_wordcloud': './wordcloud_output/identifiers_named_entities_wordcloud.png', 'combined_wordcloud': './wordcloud_output/combined_artifacts_entities_wordcloud.png', '