In [1]:
#!/usr/bin/env python3
"""
state_attribute_counts.py
Production-ready script to compute question counts per state per attribute
from CSVs containing graph_path data.

Usage:
  python state_attribute_counts.py --input "/path/Full Dataset - Final Dataset.csv" --outdir "/path/output"
"""
import argparse
import logging
import os
import re
import sys
from typing import List, Optional, Set

import pandas as pd

# ------------------------
# Logging config
# ------------------------
# Configure the root logger once, at import time: INFO level and lines of the
# form "timestamp | LEVEL | message".
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
# Module-level logger used by the functions defined below.
logger = logging.getLogger(__name__)


# ------------------------
# Helper functions
# ------------------------
def safe_read_csv(path: str) -> Optional[pd.DataFrame]:
    """Read a CSV file, retrying with a more tolerant parser on failure.

    The first attempt uses the pandas defaults. If that raises, the file is
    read again with the Python engine, decoded as UTF-8, and with malformed
    rows skipped.

    Args:
        path: Location of the CSV file.

    Returns:
        The loaded DataFrame, or ``None`` when both attempts fail. Failures
        are logged, never raised.
    """
    try:
        df = pd.read_csv(path)
        logger.info("Loaded %d rows from %s", len(df), path)
        return df
    except Exception as e1:
        logger.warning("pd.read_csv failed: %s — trying engine='python' with utf-8", e1)
        try:
            # ``on_bad_lines="skip"`` is the replacement for the old
            # ``error_bad_lines=False`` flag, which pandas deprecated in 1.3
            # and removed in 2.0; passing the old flag makes this fallback
            # itself fail with a TypeError on current pandas.
            df = pd.read_csv(path, engine="python", encoding="utf-8", on_bad_lines="skip")
            logger.info("Loaded %d rows (fallback) from %s", len(df), path)
            return df
        except Exception as e2:
            logger.error("Failed to read %s: %s", path, e2)
            return None


def find_graph_columns(df: pd.DataFrame) -> List[str]:
    """Return the column labels that hold graph-path data.

    A column is a candidate when its label contains ``graph`` (case
    insensitive). If one of the candidates is named ``graph_path`` in any
    letter case, only that column is returned; otherwise all candidates are
    returned in sorted order (e.g. ``graph_path/0``, ``graph_path/1``).

    Args:
        df: Frame whose column labels are inspected.

    Returns:
        Labels exactly as they appear in ``df.columns``, so they can be used
        to index ``df`` directly. Empty when nothing matches.
    """
    # str() guards against non-string labels (e.g. integer headers).
    candidates = [c for c in df.columns if re.search(r'graph', str(c), flags=re.I)]

    exact = [c for c in candidates if str(c).lower() == "graph_path"]
    if exact:
        # Return the real label rather than the literal "graph_path": a header
        # such as "Graph_Path" would otherwise raise KeyError on lookup.
        return ["graph_path"] if "graph_path" in exact else exact[:1]

    return sorted(candidates, key=str)


def merge_graph_columns(df: pd.DataFrame, graph_cols: List[str]) -> pd.Series:
    """Combine one or more graph columns into a single string per row.

    Args:
        df: Source frame.
        graph_cols: Labels of the columns to merge.

    Returns:
        A Series aligned with ``df.index``. With no columns every entry is
        ``""``. With one column the values are stringified and missing values
        become ``""``. With several columns the non-missing, non-blank parts
        are stripped and joined with ``"|"`` so the token separator is kept.
    """
    if not graph_cols:
        # no graph columns found
        return pd.Series([""] * len(df), index=df.index)

    if len(graph_cols) == 1:
        column = df[graph_cols[0]]
        # Blank out missing values using the ORIGINAL column's null mask:
        # astype(str) alone turns NaN into the literal text "nan", which a
        # later fillna("") can no longer recognise as missing.
        return column.astype(str).where(column.notna(), "")

    def join_row_parts(values):
        stripped = (str(v).strip() for v in values if not pd.isna(v))
        return "|".join(part for part in stripped if part)

    rows = df[graph_cols].itertuples(index=False, name=None)
    return pd.Series([join_row_parts(r) for r in rows], index=df.index, dtype=object)


def extract_state_from_graph(text: str) -> Optional[str]:
    """Extract the first ``state:VALUE`` token from a combined graph string.

    Args:
        text: Combined graph string; non-string or empty input is accepted.

    Returns:
        The value in title case with surrounding whitespace removed, or
        ``None`` when there is no ``state:`` token or its value is blank.
    """
    if not isinstance(text, str) or not text:
        return None
    # \b keeps keys that merely END in "state" (e.g. "real_estate:") from
    # being mistaken for the state token. The value may contain letters,
    # digits, spaces, underscores and hyphens.
    m = re.search(r'\bstate:([A-Za-z0-9_ \-]+)', text, flags=re.I)
    if not m:
        return None
    val = m.group(1).strip()
    # A whitespace-only value ("state: |...") is treated as "no state".
    return val.title() if val else None


def detect_attributes_from_graphs(series: pd.Series) -> Set[str]:
    """Collect the distinct attribute keys used in a series of graph strings.

    Each string is split on ``|`` or ``;``; for every token that contains a
    colon, the text before the first colon is the key. Keys are stripped and
    lower-cased; empty keys and the ``state`` key are ignored.
    """
    found: Set[str] = set()
    for graph_text in series.dropna().astype(str):
        for token in re.split(r'\||;', graph_text):
            name, colon, _ = token.strip().partition(':')
            if not colon:
                # blank token, or a token without a key:value separator
                continue
            name = name.strip().lower()
            if name and name != "state":
                found.add(name)
    return found


def make_attribute_flags(df: pd.DataFrame, combined_graph_col: str, attributes: List[str]) -> pd.DataFrame:
    """Add one 0/1 column per attribute marking rows whose graph text has that key.

    Args:
        df: Frame to annotate; it is modified in place.
        combined_graph_col: Column holding the combined graph string.
        attributes: Attribute keys; each becomes a column of the same name.

    Returns:
        The same ``df`` object, with the flag columns added.
    """
    graph_text = df[combined_graph_col].astype(str)
    for att in attributes:
        # Case-insensitive key match at a word boundary, so "art" does not
        # fire on "heart:". Optional whitespace before the colon is accepted
        # because attribute detection strips the key ("art : x" yields "art"),
        # and such rows must be counted for the attribute they produced.
        pattern = rf'(?i)\b{re.escape(att)}\s*:'
        df[att] = graph_text.str.contains(pattern, regex=True, na=False).astype(int)
    return df


# ------------------------
# Core processing
# ------------------------
def _summarize_by_state(df: pd.DataFrame, attributes: List[str]) -> pd.DataFrame:
    """Sum the 0/1 attribute flags per ``state_final`` and add ``Total_Questions``."""
    summary = (
        df.groupby("state_final")[attributes]
        .sum()
        .reset_index()
        .rename(columns={"state_final": "state"})
    )
    # Total_Questions is the sum of the attribute flags, not a row count: a
    # row matching several attributes contributes once per attribute and a
    # row matching none contributes nothing.
    summary["Total_Questions"] = summary[attributes].sum(axis=1).astype(int)
    return summary


def process_file(input_path: str, outdir: str, dedup_col: Optional[str] = "Corrected Question"):
    """Build per-state attribute count tables for one CSV and write them to disk.

    Steps: load the CSV, merge the graph columns into one string per row,
    resolve each row's state (from the graph string, else from an explicit
    ``state`` column, else ``"UNKNOWN"``), detect the attribute keys, flag
    each row per attribute, then aggregate by state.

    Three files may be written into ``outdir`` (created if missing), each
    named after the input file's base name:
      * ``*_state_attribute_counts_raw.csv``: counts over all rows.
      * ``*_state_attribute_counts_unique.csv``: counts after dropping rows
        with a missing or duplicated ``dedup_col``; only when that column exists.
      * ``*_state_attribute_counts_with_percent.csv``: raw counts plus an
        ``<attribute>_pct`` column per attribute (share of ``Total_Questions``).

    Args:
        input_path: CSV file to process.
        outdir: Directory that receives the output files.
        dedup_col: Column identifying a unique question.

    Returns:
        Dict with keys ``raw``, ``unique`` (``None`` when skipped),
        ``percent`` (output paths) and ``attributes`` (list of keys used).

    Raises:
        RuntimeError: If the file cannot be loaded or has no graph-like column.
    """
    logger.info("Processing file: %s", input_path)
    df = safe_read_csv(input_path)
    if df is None:
        raise RuntimeError(f"Failed to load {input_path}")

    # detect/merge graph columns
    graph_cols = find_graph_columns(df)
    if not graph_cols:
        logger.error("No graph-like column found in %s. Please ensure your file has a 'graph_path' or similar column.", input_path)
        raise RuntimeError("Missing graph columns")

    logger.info("Found graph columns: %s", graph_cols)
    df["__combined_graph"] = merge_graph_columns(df, graph_cols)

    # Preferred source of the state: the graph string itself.
    df["__state_extracted"] = df["__combined_graph"].apply(extract_state_from_graph)

    # Fallback source: an explicit 'state' column, if the file has one.
    state_col_candidates = [c for c in df.columns if str(c).lower().strip() == "state"]
    if state_col_candidates:
        used_state_col = state_col_candidates[0]
        logger.info("Found explicit state column: %s (will be used as fallback)", used_state_col)
        df["__state_fallback"] = df[used_state_col].astype(str).str.strip().replace({"nan": ""})
    else:
        df["__state_fallback"] = ""

    # final state: extracted if available, else fallback, else "UNKNOWN"
    df["state_final"] = [
        extracted or (fallback.title() if fallback else "UNKNOWN")
        for extracted, fallback in zip(df["__state_extracted"], df["__state_fallback"])
    ]

    # detect attributes automatically from the graph tokens
    attributes = sorted(detect_attributes_from_graphs(df["__combined_graph"]))
    if not attributes:
        logger.warning("No attributes detected automatically. The file may not be formatted as expected.")
    logger.info("Auto-detected attributes (%d): %s", len(attributes), attributes)

    if not attributes:
        # final fallback to common cultural attributes if nothing detected
        attributes = ["tourism", "history", "art", "festival", "cuisine", "personalities", "costume"]
        logger.info("Falling back to default attributes: %s", attributes)

    df = make_attribute_flags(df, "__combined_graph", attributes)

    # The CLI creates outdir, but direct callers may not have.
    os.makedirs(outdir, exist_ok=True)
    base_name = os.path.splitext(os.path.basename(input_path))[0]

    # RAW counts (row-level) grouped by state
    group = _summarize_by_state(df, attributes)
    raw_out = os.path.join(outdir, f"{base_name}_state_attribute_counts_raw.csv")
    group.to_csv(raw_out, index=False)
    logger.info("Saved raw state-attribute counts to %s", raw_out)

    # UNIQUE counts by the dedup column, if available
    unique_out = None
    if dedup_col in df.columns:
        logger.info("Deduplicating by column: %s", dedup_col)
        df_unique = df.dropna(subset=[dedup_col]).drop_duplicates(subset=[dedup_col])
        group_unique = _summarize_by_state(df_unique, attributes)
        unique_out = os.path.join(outdir, f"{base_name}_state_attribute_counts_unique.csv")
        group_unique.to_csv(unique_out, index=False)
        logger.info("Saved unique-question state-attribute counts to %s", unique_out)
    else:
        logger.info("Deduplication column '%s' not found; skipping unique-question counts.", dedup_col)

    # Raw counts with percentage columns; a zero total is replaced by 1 so
    # the division yields 0% instead of NaN.
    pct_df = group.copy()
    denominator = pct_df["Total_Questions"].replace({0: 1})
    for att in attributes:
        pct_df[f"{att}_pct"] = (pct_df[att] / denominator) * 100
    pct_out = os.path.join(outdir, f"{base_name}_state_attribute_counts_with_percent.csv")
    pct_df.to_csv(pct_out, index=False)
    logger.info("Saved percent-augmented file to %s", pct_out)

    # Return file paths for further use
    return {"raw": raw_out, "unique": unique_out, "percent": pct_out, "attributes": attributes}


# ------------------------
# CLI main
# ------------------------
def main(argv: Optional[List[str]] = None):
    """Command-line entry point: process every ``--input`` CSV into ``--outdir``.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Missing input files and per-file processing errors are logged and
    skipped. The process exits with status 2 when no file produced output.
    """
    cli = argparse.ArgumentParser(description="Compute question counts per state per attribute from CSVs with graph_path.")
    cli.add_argument("--input", "-i", nargs="+", required=True, help="Input CSV file(s).")
    cli.add_argument("--outdir", "-o", required=True, help="Output directory for CSV summary files.")
    cli.add_argument("--dedup-col", "-d", default="Corrected Question", help="Column to use for unique-question deduplication.")
    options = cli.parse_args(argv)

    os.makedirs(options.outdir, exist_ok=True)

    results = {}
    for csv_path in options.input:
        if os.path.isfile(csv_path):
            try:
                results[csv_path] = process_file(csv_path, options.outdir, dedup_col=options.dedup_col)
            except Exception as exc:
                # One bad file must not stop the remaining ones.
                logger.exception("Failed processing %s: %s", csv_path, exc)
        else:
            logger.error("Input file not found: %s", csv_path)

    logger.info("DONE. Generated outputs for %d files.", len(results))
    if results:
        return
    logger.error("No outputs generated. Check input file paths and format.")
    sys.exit(2)


# Script entry point. main() is called without an argument list, so argparse
# reads sys.argv. NOTE: inside a notebook kernel sys.argv belongs to
# ipykernel_launcher.py, so the required --input/--outdir options are missing
# and argparse exits with status 2 (see the cell output below); there, call
# main([...]) with an explicit argument list instead.
if __name__ == "__main__":
    main()


usage: ipykernel_launcher.py [-h] --input INPUT [INPUT ...] --outdir OUTDIR
                             [--dedup-col DEDUP_COL]
ipykernel_launcher.py: error: the following arguments are required: --input/-i, --outdir/-o


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [2]:
# ---------------------------------------------
# ✅ Simplified Direct-Run Version (No Arguments Needed)
# ---------------------------------------------

import pandas as pd
import os

# ✅ Set your paths
# CSV files to summarise; each one is passed to process_dataset() in turn.
input_files = [
    "/Users/anjalisingh/Desktop/IITP/QUESTION_COUNT/Full Dataset - 3_hop_questions.csv",
    "/Users/anjalisingh/Desktop/IITP/QUESTION_COUNT/Full Dataset - Final Dataset.csv"
]
# Directory that receives one summary CSV per input file; created if missing.
output_folder = "/Users/anjalisingh/Desktop/IITP/QUESTION_COUNT/state_attribute_outputs"
os.makedirs(output_folder, exist_ok=True)

# Attribute names looked for in the graph column as "<name>:" markers; each
# becomes a 0/1 column that is summed per state.
attributes = ["tourism", "art", "history", "festival", "architecture", "dance", 
              "music", "literature", "cuisine", "heritage", "language", "religion", 
              "craft", "handicraft", "costume"]

def process_dataset(file_path):
    """Summarise attribute counts per state for one CSV and save the table.

    Reads ``file_path``, takes the first column whose name contains
    ``graph``, extracts the state from its ``state:<name>`` token (rows
    without one are labelled ``Unknown``), flags each row for every name in
    the module-level ``attributes`` list, sums the flags per state and writes
    ``<input base name>_state_attribute_summary.csv`` into ``output_folder``.

    Args:
        file_path: Path of the CSV file to process.

    Raises:
        ValueError: If the file has no column whose name contains ``graph``.
    """
    import re  # local import: this cell only imports pandas and os

    print(f"\n🔹 Processing: {os.path.basename(file_path)}")
    df = pd.read_csv(file_path)

    # Detect graph_path-like columns
    graph_cols = [c for c in df.columns if "graph" in str(c).lower()]
    if not graph_cols:
        raise ValueError(f"No graph-like column found in {file_path}")
    graph_text = df[graph_cols[0]].astype(str)

    # Case-insensitive, like the attribute match below; \b stops keys that
    # merely end in "state" from being read as the state token.
    df["state"] = (
        graph_text.str.extract(r"\bstate:([a-zA-Z_]+)", flags=re.IGNORECASE, expand=False)
        .fillna("UNKNOWN")
        .str.title()
    )

    for att in attributes:
        # Anchor at a word boundary: a plain substring test for "craft:" also
        # fires on "handicraft:" (and "art:" on "heart:"), double-counting rows.
        pattern = rf"\b{re.escape(att)}:"
        df[att] = graph_text.str.contains(pattern, case=False, regex=True, na=False).astype(int)

    result = df.groupby("state")[attributes].sum().reset_index()
    # Sum of the attribute flags per state (a row matching two attributes
    # counts twice; a row matching none counts zero).
    result["Total_Questions"] = result[attributes].sum(axis=1)

    # splitext keeps dots inside the file name; split('.')[0] would cut the
    # name at the first dot.
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    out_path = os.path.join(output_folder, f"{base_name}_state_attribute_summary.csv")
    result.to_csv(out_path, index=False)
    print(f"✅ Saved → {out_path}")
    print(result.head())

# Run the summary for every path listed in input_files; each call writes one
# summary CSV and prints the first rows of the result.
for file_path in input_files:
    process_dataset(file_path)



🔹 Processing: Full Dataset - 3_hop_questions.csv
✅ Saved → /Users/anjalisingh/Desktop/IITP/QUESTION_COUNT/state_attribute_outputs/Full Dataset - 3_hop_questions_state_attribute_summary.csv
     state  tourism  art  history  festival  architecture  dance  music  \
0  Unknown       38   15       34         4             0      0      0   

   literature  cuisine  heritage  language  religion  craft  handicraft  \
0           0        4         0         1         9      0           0   

   costume  Total_Questions  
0        2              107  

🔹 Processing: Full Dataset - Final Dataset.csv
✅ Saved → /Users/anjalisingh/Desktop/IITP/QUESTION_COUNT/state_attribute_outputs/Full Dataset - Final Dataset_state_attribute_summary.csv
               state  tourism  art  history  festival  architecture  dance  \
0    Andaman_Nicobar        1    0        0         0             0      0   
1     Andhra_Pradesh       35   84       28        27             0      0   
2  Arunachal_Pradesh       6

In [3]:
#!/usr/bin/env python3
"""
state_attribute_counts_fixed.py
Improved production-ready script to compute question counts per state per attribute
from CSVs containing graph_path data.

Usage:
  python state_attribute_counts_fixed.py --input "/path/Full Dataset.csv" --outdir "/path/output"
"""
import argparse
import logging
import os
import re
import sys
from typing import List, Optional, Set

import pandas as pd

# ------------------------
# Logging config
# ------------------------
# Configure the root logger once, at import time: INFO level and lines of the
# form "timestamp | LEVEL | message".
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
# Module-level logger used by the functions defined below.
logger = logging.getLogger(__name__)

# ------------------------
# Helper functions
# ------------------------
def safe_read_csv(path: str) -> Optional[pd.DataFrame]:
    """Load a CSV, falling back to a lenient second attempt if the first fails.

    The fallback uses the Python parsing engine, decodes as ``utf-8-sig`` and
    skips malformed rows. Returns ``None`` (after logging the error) when the
    fallback fails as well.
    """
    try:
        frame = pd.read_csv(path)
    except Exception as primary_error:
        logger.warning("pd.read_csv failed: %s — trying engine='python' with utf-8-sig", primary_error)
    else:
        logger.info("Loaded %d rows from %s", len(frame), path)
        return frame

    lenient_options = {"engine": "python", "encoding": "utf-8-sig", "on_bad_lines": 'skip'}
    try:
        frame = pd.read_csv(path, **lenient_options)
    except Exception as fallback_error:
        logger.error("Failed to read %s: %s", path, fallback_error)
        return None

    logger.info("Loaded %d rows (fallback) from %s", len(frame), path)
    return frame


def find_graph_columns(df: pd.DataFrame) -> List[str]:
    """Return the column labels that hold graph-path data.

    A column is a candidate when its label contains ``graph`` or ``path``
    (case insensitive). If one of the candidates is named ``graph_path`` in
    any letter case, only that column is returned; otherwise all candidates
    are returned in sorted order.

    Args:
        df: Frame whose column labels are inspected.

    Returns:
        Labels exactly as they appear in ``df.columns``, so they can be used
        to index ``df`` directly. Empty when nothing matches.
    """
    # str() guards against non-string labels (e.g. integer headers).
    candidates = [c for c in df.columns if re.search(r'graph|path', str(c), flags=re.I)]

    exact = [c for c in candidates if str(c).lower() == "graph_path"]
    if exact:
        # Return the real label rather than the literal "graph_path": a header
        # such as "Graph_Path" would otherwise raise KeyError on lookup.
        return ["graph_path"] if "graph_path" in exact else exact[:1]

    return sorted(candidates, key=str)


def merge_graph_columns(df: pd.DataFrame, graph_cols: List[str]) -> pd.Series:
    """Combine one or more graph columns into a single string per row.

    Args:
        df: Source frame.
        graph_cols: Labels of the columns to merge.

    Returns:
        A Series aligned with ``df.index``. With no columns every entry is
        ``""``. With one column the values are stringified and missing values
        become ``""``. With several columns the non-missing, non-blank parts
        are stripped and joined with ``"|"``.
    """
    if not graph_cols:
        return pd.Series([""] * len(df), index=df.index)

    if len(graph_cols) == 1:
        source = df[graph_cols[0]]
        # astype(str) converts NaN to the text "nan", after which fillna("")
        # finds nothing to fill; mask with the original column's nulls instead.
        return source.astype(str).where(source.notna(), "")

    merged = []
    for values in df[graph_cols].itertuples(index=False, name=None):
        parts = [str(v).strip() for v in values if not pd.isna(v)]
        merged.append("|".join(p for p in parts if p))
    return pd.Series(merged, index=df.index, dtype=object)


def extract_state_from_graph(text: str) -> Optional[str]:
    """Extract the first ``state:VALUE`` or ``state=VALUE`` token from a graph string.

    Args:
        text: Combined graph string; non-string or empty input is accepted.

    Returns:
        The value in title case with surrounding whitespace removed, or
        ``None`` when there is no state token or its value is blank.
    """
    if not isinstance(text, str) or not text:
        return None
    # \b keeps keys that merely END in "state" (e.g. "real_estate:") from
    # being mistaken for the state token.
    m = re.search(r'\bstate[:=]([A-Za-z0-9_ \-]+)', text, flags=re.I)
    if m is None:
        return None
    # A whitespace-only value is reported as "no state" rather than "".
    return m.group(1).strip().title() or None


def detect_attributes_from_graphs(series: pd.Series) -> Set[str]:
    """Return the set of attribute keys found in the graph strings.

    Strings are split on ``|`` or ``;``. For each token containing a colon,
    the text left of the first colon, stripped and lower-cased, is the key.
    Empty keys and ``state`` are excluded.
    """
    keys = (
        token.split(':', 1)[0].strip().lower()
        for text in series.dropna().astype(str)
        for token in re.split(r'\||;', text)
        if ':' in token
    )
    return {key for key in keys if key and key != "state"}


def make_attribute_flags(df: pd.DataFrame, combined_graph_col: str, attributes: List[str]) -> pd.DataFrame:
    """Add one 0/1 column per attribute marking rows whose graph text has that key.

    Args:
        df: Frame to annotate; it is modified in place.
        combined_graph_col: Column holding the combined graph string.
        attributes: Attribute keys; each becomes a column of the same name.

    Returns:
        The same ``df`` object, with the flag columns added.
    """
    graph_text = df[combined_graph_col].astype(str)
    for att in attributes:
        # Case-insensitive key match at a word boundary. Whitespace before
        # the colon is allowed because attribute detection strips the key, so
        # a token written "art : x" is detected as "art" and must be counted.
        pattern = rf'(?i)\b{re.escape(att)}\s*:'
        df[att] = graph_text.str.contains(pattern, regex=True, na=False).astype(int)
    return df


# ------------------------
# Core processing
# ------------------------
def _summarize_by_state(df: pd.DataFrame, attributes: List[str]) -> pd.DataFrame:
    """Sum the 0/1 attribute flags per ``state_final`` and add ``Total_Questions``."""
    summary = (
        df.groupby("state_final")[attributes]
        .sum()
        .reset_index()
        .rename(columns={"state_final": "state"})
    )
    # Total_Questions is the sum of the attribute flags, not a row count: a
    # row matching several attributes contributes once per attribute and a
    # row matching none contributes nothing.
    summary["Total_Questions"] = summary[attributes].sum(axis=1).astype(int)
    return summary


def process_file(input_path: str, outdir: str, dedup_col: Optional[str] = "Corrected Question"):
    """Build per-state attribute count tables for one CSV and write them to disk.

    Steps: load the CSV, merge the graph columns into one string per row,
    resolve each row's state (from the graph string, else from an explicit
    ``state`` column, else ``"UNKNOWN"``), pick the attribute keys
    (auto-detected, or a built-in default list when none are found), flag
    each row per attribute, then aggregate by state.

    Up to three files are written into ``outdir`` (created if missing), each
    named after the input file's base name: ``*_counts_raw.csv``,
    ``*_counts_unique.csv`` (only when ``dedup_col`` exists; rows with a
    missing or repeated value in it are dropped first) and
    ``*_counts_with_percent.csv`` (raw counts plus ``<attribute>_pct``).

    Args:
        input_path: CSV file to process.
        outdir: Directory that receives the output files.
        dedup_col: Column identifying a unique question.

    Returns:
        Dict with keys ``raw``, ``unique`` (``None`` when skipped),
        ``percent`` (output paths) and ``attributes`` (list of keys used).

    Raises:
        RuntimeError: If the file cannot be loaded or has no graph-like column.
    """
    logger.info("Processing file: %s", input_path)
    df = safe_read_csv(input_path)
    if df is None:
        raise RuntimeError(f"Failed to load {input_path}")

    graph_cols = find_graph_columns(df)
    if not graph_cols:
        raise RuntimeError(f"No graph-like column found in {input_path}")

    logger.info("Found graph columns: %s", graph_cols)
    df["__combined_graph"] = merge_graph_columns(df, graph_cols)

    # State: value parsed from the graph string, else the explicit 'state'
    # column (title-cased), else "UNKNOWN".
    df["__state_extracted"] = df["__combined_graph"].apply(extract_state_from_graph)
    state_col_candidates = [c for c in df.columns if str(c).lower().strip() == "state"]
    if state_col_candidates:
        df["__state_fallback"] = df[state_col_candidates[0]].astype(str).str.strip().replace({"nan": ""})
    else:
        df["__state_fallback"] = ""
    df["state_final"] = [
        extracted or (fallback.title() if fallback else "UNKNOWN")
        for extracted, fallback in zip(df["__state_extracted"], df["__state_fallback"])
    ]

    # Attributes: auto-detected keys, or the default list when none are found.
    detected_attrs = detect_attributes_from_graphs(df["__combined_graph"])
    if detected_attrs:
        attributes = sorted(detected_attrs)
    else:
        attributes = ["tourism", "history", "art", "festival", "cuisine", "personalities", "costume"]
    logger.info("Using attributes: %s", attributes)

    df = make_attribute_flags(df, "__combined_graph", attributes)

    # The CLI creates outdir, but direct callers may not have.
    os.makedirs(outdir, exist_ok=True)
    base_name = os.path.splitext(os.path.basename(input_path))[0]

    # Raw counts grouped by state
    group = _summarize_by_state(df, attributes)
    raw_out = os.path.join(outdir, f"{base_name}_state_attribute_counts_raw.csv")
    group.to_csv(raw_out, index=False)
    logger.info("Saved raw state-attribute counts to %s", raw_out)

    # Unique counts if dedup_col exists
    unique_out = None
    if dedup_col in df.columns:
        df_unique = df.dropna(subset=[dedup_col]).drop_duplicates(subset=[dedup_col])
        group_unique = _summarize_by_state(df_unique, attributes)
        unique_out = os.path.join(outdir, f"{base_name}_state_attribute_counts_unique.csv")
        group_unique.to_csv(unique_out, index=False)
        logger.info("Saved unique-question counts to %s", unique_out)
    else:
        logger.info("Deduplication column '%s' not found; skipping unique counts.", dedup_col)

    # Percent-augmented file; a zero total is replaced by 1 so the division
    # yields 0% instead of NaN.
    pct_df = group.copy()
    denominator = pct_df["Total_Questions"].replace({0: 1})
    for att in attributes:
        pct_df[f"{att}_pct"] = (pct_df[att] / denominator) * 100
    pct_out = os.path.join(outdir, f"{base_name}_state_attribute_counts_with_percent.csv")
    pct_df.to_csv(pct_out, index=False)
    logger.info("Saved percent-augmented file to %s", pct_out)

    return {"raw": raw_out, "unique": unique_out, "percent": pct_out, "attributes": attributes}


# ------------------------
# CLI main
# ------------------------
def main(argv: Optional[List[str]] = None):
    """Parse the command line and run ``process_file`` on each input CSV.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Inputs that do not exist, or whose processing raises, are logged and
    skipped. Exits with status 2 if no input produced any output.
    """
    parser = argparse.ArgumentParser(description="Compute question counts per state per attribute from CSVs with graph_path.")
    option_specs = (
        (("--input", "-i"), {"nargs": "+", "required": True, "help": "Input CSV file(s)."}),
        (("--outdir", "-o"), {"required": True, "help": "Output directory for CSV summary files."}),
        (("--dedup-col", "-d"), {"default": "Corrected Question", "help": "Column to use for unique-question deduplication."}),
    )
    for flags, settings in option_specs:
        parser.add_argument(*flags, **settings)
    parsed = parser.parse_args(argv)

    target_dir = parsed.outdir
    os.makedirs(target_dir, exist_ok=True)

    completed = {}
    for path in parsed.input:
        if not os.path.isfile(path):
            logger.error("Input file not found: %s", path)
            continue
        try:
            outcome = process_file(path, target_dir, dedup_col=parsed.dedup_col)
        except Exception as err:
            # Keep going: a failure in one file should not abort the batch.
            logger.exception("Failed processing %s: %s", path, err)
        else:
            completed[path] = outcome

    logger.info("DONE. Generated outputs for %d files.", len(completed))
    if len(completed) == 0:
        logger.error("No outputs generated. Check input file paths and format.")
        sys.exit(2)


# Script entry point. main() is called without an argument list, so argparse
# reads sys.argv. NOTE: inside a notebook kernel sys.argv belongs to
# ipykernel_launcher.py, so the required --input/--outdir options are missing
# and argparse exits with status 2 (see the cell output below); there, call
# main([...]) with an explicit argument list instead.
if __name__ == "__main__":
    main()


usage: ipykernel_launcher.py [-h] --input INPUT [INPUT ...] --outdir OUTDIR
                             [--dedup-col DEDUP_COL]
ipykernel_launcher.py: error: the following arguments are required: --input/-i, --outdir/-o


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
