In [None]:
# Core imports for file system operations, regex, JSON logging, text normalization,
# dataframe manipulation, numeric utilities, type hints, and file path handling.
import os
import re
import json
import unicodedata
import pandas as pd
import numpy as np
from typing import Optional, Tuple, List, Dict
from pathlib import Path

In [None]:
# Candidate raw datasets. main() skips entries that do not exist on disk; the
# rest are reduced to text + label columns, standardized, merged and
# deduplicated, and optionally split into train/val/test.
INPUT_FILES = [
    "500_anonymized_Reddit_users_posts_labels.csv",
    "Suicide_Detection.csv",
    "test.csv",
    "testset.csv",
    "Train_suicide1.csv",
    "twitter-suicidal_data.csv",
]

# When True, main() also produces stratified train/val/test files.
DO_SPLITS = True

# Output file names for the merged dataset and its splits.
OUTPUT_COMBINED = "combined_suicide_ideation_dataset.csv"
TRAIN_OUT = "combined_train.csv"
VAL_OUT = "combined_val.csv"
TEST_OUT = "combined_test.csv"

# Seed handed to the splitter so repeated runs yield the same partition.
RANDOM_SEED = 42

In [None]:
# Substrings that suggest a column holds free text (matched against the
# lower-cased column name).
TEXT_NAME_HINTS = [
    "text", "post", "body", "content", "comment", "tweet", "message",
    "clean_text", "sentence", "statement",
    "post_text", "postbody", "title", "headline", "selftext", "status",
]

# Substrings that suggest a column holds the class label.
LABEL_NAME_HINTS = [
    "label", "class", "target", "is_suicide", "is_suicidal", "suicide",
    "suicidal", "ideation", "is_ideation",
    "category", "y", "output", "sentiment", "tag",
]

# Label fragments that mark a NON-suicidal sample (control/neutral classes).
NON_IDEATION_TOKENS = [
    "non-suicidal", "nonsuicidal", "non suicidal", "non_suicidal",
    "no suicide", "control", "neutral", "others", "other",
]

# Regex fragments for scraping noise and filler words stripped from text.
filler_patterns = [
    r'\bfiller\b',
    r'www\s*youtube\s*com',
    r'www\s*reddit\s*com',
    r'https?\s*www\s*reddit',
    r'https?\s*www\s*youtube',
    r'amp\s*x200b\s*amp',
    r'x200b\s*amp\s*x200b',
    r'\bcake\b',
]

# One case-insensitive alternation over every filler fragment.
_filler_alternation = '|'.join(filler_patterns)
filler_regex = re.compile(_filler_alternation, flags=re.IGNORECASE)

In [None]:
def robust_read_csv(path: str) -> Optional[pd.DataFrame]:
    """
    Load a CSV by trying a fixed sequence of reader configurations.

    The first three attempts let the python engine sniff the delimiter under
    utf-8, utf-8-sig and latin1; the remaining ones force ",", tab or ";"
    as the separator. Malformed lines are skipped in every attempt.

    Returns:
        The first successfully parsed DataFrame (column names stripped of
        surrounding whitespace), or None when every attempt raised; in that
        case a warning with the last error is printed.
    """
    skip_bad = {"on_bad_lines": "skip"}
    sniffing = [
        {"sep": None, "engine": "python", "encoding": enc, **skip_bad}
        for enc in ("utf-8", "utf-8-sig", "latin1")
    ]
    fixed_sep = [
        {"sep": sep, "encoding": enc, **skip_bad}
        for sep, enc in ((",", "utf-8"), (",", "latin1"), ("\t", "utf-8"), (";", "utf-8"))
    ]

    failure = None
    for options in sniffing + fixed_sep:
        try:
            frame = pd.read_csv(path, **options)
            frame.columns = [str(col).strip() for col in frame.columns]
        except Exception as exc:
            # Remember the error and fall through to the next configuration.
            failure = exc
        else:
            return frame

    print(f"[WARN] Could not read {path}: {repr(failure)}")
    return None

def pick_text_columns(df: pd.DataFrame) -> List[str]:
    """
    Heuristically pick which columns contain raw text.

    A column is a candidate when it has object dtype or a pandas string
    dtype. Candidates are ranked, highest first, by:
        1. how many TEXT_NAME_HINTS occur in the lower-cased column name
        2. the mean string length of the column's non-null values

    Returns:
        The best-ranked column, followed by at most one further candidate
        whose name contains "title" or "headline". Empty list when the frame
        has no string-like columns.
    """
    scored = []
    for c in df.columns:
        dtype = df[c].dtype
        # Accept classic object columns as well as pandas' dedicated string dtype.
        if not (pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)):
            continue
        # Column labels are not guaranteed to be strings (e.g. integer headers).
        name = str(c).lower()
        name_score = sum(1 for h in TEXT_NAME_HINTS if h in name)
        try:
            avg_len = df[c].dropna().astype(str).map(len).mean()
        except Exception:
            # Best effort: a column that cannot be measured simply scores 0.
            avg_len = 0.0
        # An all-null column yields a NaN mean. NaN compares False against
        # everything, which corrupts the sort below and can let an empty
        # column be ranked first, so score it as zero length instead.
        if pd.isna(avg_len):
            avg_len = 0.0
        scored.append((c, name_score, float(avg_len)))

    scored.sort(key=lambda x: (x[1], x[2]), reverse=True)

    picked = []
    if scored:
        picked.append(scored[0][0])
        # Optionally add a second column (e.g. "title") to be merged with the body.
        for c, _, _ in scored[1:]:
            lowered = str(c).lower()
            if ("title" in lowered or "headline" in lowered) and c not in picked:
                picked.append(c)
                break
    return picked

def is_binary_like(series: pd.Series) -> bool:
    """
    Report whether a series looks like a binary label column.

    The distinct non-null values are compared as lower-cased strings. The
    series qualifies when there are exactly two of them, or when there are
    more than one and all of them are common boolean spellings
    (0/1/true/false/yes/no/y/n/t/f).
    """
    boolean_spellings = {"0", "1", "true", "false", "yes", "no", "y", "n", "t", "f"}

    distinct = series.dropna().unique()
    normalized = pd.Series(distinct).astype(str).str.lower().tolist()
    count = len(normalized)

    # Zero or one distinct value cannot separate two classes.
    if count < 2:
        return False
    # Any two-valued column is accepted regardless of spelling.
    if count == 2:
        return True
    return boolean_spellings.issuperset(normalized)

def _label_name_score(name: str) -> int:
    """Count the LABEL_NAME_HINTS that match an already lower-cased column name."""
    tokens = set(re.split(r"[^a-z0-9]+", name))
    score = 0
    for hint in LABEL_NAME_HINTS:
        if len(hint) == 1:
            # A single-letter hint such as "y" must be a whole name token;
            # as a substring it would match "body", "country", "category", ...
            matched = hint in tokens
        else:
            matched = hint in name
        if matched:
            score += 1
    return score


def pick_label_column(df: pd.DataFrame) -> Optional[str]:
    """
    Try to identify the label column.

    A column is a candidate when its name matches a label hint or it has at
    most 10 distinct non-null values. Candidates are ranked by:
        1. number of matching label-name hints (more is better)
        2. having at least two distinct values (a constant or all-null column
           cannot encode two classes, so it must not outrank a real label)
        3. fewer distinct values

    Returns:
        The best-ranked candidate that is not one of the detected text
        columns; if every candidate is a text column, the best-ranked
        candidate; None when there are no candidates.
    """
    candidates = []
    for c in df.columns:
        # Column labels are not guaranteed to be strings.
        name_score = _label_name_score(str(c).lower())
        nunique = df[c].dropna().nunique()
        if nunique <= 10 or name_score > 0:
            candidates.append((c, name_score, nunique))
    if not candidates:
        return None

    candidates.sort(key=lambda x: (x[1], x[2] >= 2, -x[2]), reverse=True)

    text_cols = set(pick_text_columns(df))
    for c, _, _ in candidates:
        if c not in text_cols:
            return c
    return candidates[0][0]

# Negated class names such as "non-suicide", "not suicidal", "no_ideation".
# The lookbehind keeps "no"/"non"/"not" from matching inside a longer word.
_NEGATED_LABEL_RE = re.compile(
    r"(?<![a-z])(?:non|not|no)[\s_\-]*(?:suicid|ideat|attempt)"
)


def to_binary_label(value: object) -> Optional[int]:
    """
    Convert a raw label (string/number/category) into a binary 0/1 value.

    The value is compared as a stripped, lower-cased string. Checks run in
    this order, first match wins:
        1. common boolean spellings ("1"/"true"/"yes"..., "0"/"false"/"no"...)
        2. negated class names ("non-suicide", "not suicidal", ...)  -> 0
        3. any NON_IDEATION_TOKENS substring                          -> 0
        4. "suicid" / "ideat" / "attempt" substring                   -> 1
        5. control-style names, including "none"                      -> 0
        6. sentiment polarity words                                   -> None
        7. numeric text: 1 -> 1, 0 or -1 -> 0

    Returns:
        1 = suicidal/ideation/attempt
        0 = non-suicidal/control/neutral
        None = unknown/unusable
    """
    if pd.isna(value):
        return None
    s = str(value).strip().lower()

    if s in {"1", "true", "t", "yes", "y"}:
        return 1
    if s in {"0", "false", "f", "no", "n"}:
        return 0

    # Must run before the positive substring test: "non-suicide" contains
    # "suicid" and is not covered by the "non-suicidal" style tokens.
    if _NEGATED_LABEL_RE.search(s):
        return 0
    if any(tok in s for tok in NON_IDEATION_TOKENS):
        return 0
    if any(k in s for k in ("suicid", "ideat", "attempt")):
        return 1
    if s in {"control", "neutral", "others", "other", "none"}:
        return 0
    # Sentiment polarity says nothing about ideation; treat as unusable.
    if s in {"positive", "negative", "pos", "neg"}:
        return None

    try:
        fv = float(s)
    except ValueError:
        return None
    if fv == 1.0:
        return 1
    if fv in (0.0, -1.0):
        return 0
    return None

def _text_or_empty(series: pd.Series) -> pd.Series:
    """Return the series as strings, with missing values as "" instead of "nan"."""
    return series.astype(str).where(series.notna(), "")


def standardize_dataset(df: pd.DataFrame, source_name: str) -> pd.DataFrame:
    """
    Convert a raw dataset into the standardized format { text, label, source }.

    Steps:
        - pick the text column(s); when two are picked they are joined with a space
        - pick the label column
        - map labels to binary 0/1, with a per-value fallback mapping when
          fewer than half of the rows could be mapped directly
        - drop rows with an unmapped label or empty text

    Args:
        df: Raw dataset as loaded from disk.
        source_name: Path of the originating file; only its base name is stored.

    Returns:
        DataFrame with columns text (str), label (int) and source (str).
        An empty frame with those columns is returned when no text or label
        column can be identified.
    """
    empty = pd.DataFrame(columns=["text", "label", "source"])

    tcols = pick_text_columns(df)
    if not tcols:
        return empty
    # Missing text must become "" (and be dropped below), not the string "nan".
    text = _text_or_empty(df[tcols[0]])
    if len(tcols) >= 2:
        text = text + " " + _text_or_empty(df[tcols[1]])

    lcol = pick_label_column(df)
    if lcol is None:
        return empty
    mapped = df[lcol].apply(to_binary_label)

    # Fallback mapping if too many labels stayed unmapped.
    if mapped.notna().mean() < 0.5:
        uniq = {str(x).strip().lower() for x in df[lcol].dropna().unique().tolist()}
        possible_pos = {"suicide", "suicidal", "ideation", "attempt", "si", "s"}
        possible_neg = {"non-suicide", "nonsuicide", "non suicidal", "control", "neutral",
                        "others", "other", "no si", "no_si", "ns"}
        neg_markers = ["non-suicid", "non suicidal", "nonsuicid", "control", "neutral", "others"]
        pos_markers = ["suicid", "ideat", "attempt"]
        mapping = {}
        for u in uniq:
            # Negative first: values like "non-suicide" also contain the
            # positive marker "suicid".
            if u in possible_neg or any(k in u for k in neg_markers):
                mapping[u] = 0
            elif u in possible_pos or any(k in u for k in pos_markers):
                mapping[u] = 1
        if mapping:
            mapped2 = df[lcol].astype(str).str.strip().str.lower().map(mapping)
            # Keep direct mappings; fill only the gaps from the fallback.
            mapped = mapped.where(mapped.notna(), mapped2)

    out = pd.DataFrame({"text": text.str.strip(), "label": mapped})
    out = out.dropna(subset=["text", "label"])
    out = out[out["text"].str.len() > 0].copy()
    out["label"] = out["label"].astype(int)
    out["source"] = os.path.basename(source_name)
    return out

def remove_fillers(text):
    """
    Strip filler/noise fragments from a string and tidy its whitespace.

    Every match of filler_regex is replaced by a single space, runs of
    whitespace are then collapsed to one space, and the result is trimmed.
    """
    without_noise = filler_regex.sub(' ', text)
    collapsed = re.sub(r'\s+', ' ', without_noise)
    return collapsed.strip()



In [None]:
def main():
    """
    Build the combined dataset and, optionally, its train/val/test splits.

    Reads every existing file in INPUT_FILES, standardizes each one, merges
    them, removes filler text, drops rows whose text became empty, removes
    duplicate texts, and writes the result to OUTPUT_COMBINED. With DO_SPLITS
    enabled, a stratified 80/10/10 split is written to TRAIN_OUT / VAL_OUT /
    TEST_OUT. All produced files are also copied into the project directory
    when that differs from the working directory.

    Raises:
        SystemExit: if no input file exists or no dataset yields usable rows.
    """
    # Check that input files actually exist
    existing = [p for p in INPUT_FILES if os.path.exists(p)]
    if not existing:
        raise SystemExit("No input files found. Check INPUT_FILES paths.")

    frames = []
    # Loop through input files, standardize them, and collect clean rows
    for path in existing:
        df = robust_read_csv(path)
        if df is None or df.empty:
            print(f"[SKIP] Empty or unreadable: {path}")
            continue
        std = standardize_dataset(df, path)
        if std.empty:
            print(f"[SKIP] No rows after standardizing: {path}")
            continue
        frames.append(std)

    # Require at least one usable dataset
    if not frames:
        raise SystemExit("No standardized datasets produced; nothing to combine.")

    combined = pd.concat(frames, ignore_index=True)
    combined["text"] = combined["text"].astype(str).apply(remove_fillers)
    # Filler removal can leave a text empty; such rows carry no signal.
    combined = combined[combined["text"].str.len() > 0]
    before = len(combined)
    combined = combined.drop_duplicates(subset=["text"])
    dropped = before - len(combined)

    # Every artifact written by this run, keyed by its output file name.
    artifacts = {OUTPUT_COMBINED: combined}

    combined.to_csv(OUTPUT_COMBINED, index=False, encoding="utf-8")
    # Print summary JSON to console
    print(json.dumps({
        "combined_rows": int(len(combined)),
        "dropped_duplicates": int(dropped),
        "label_counts": combined["label"].value_counts().to_dict(),
        "output": OUTPUT_COMBINED,
        "sources": sorted(combined["source"].unique().tolist()),
    }, indent=2))

    if DO_SPLITS:
        from sklearn.model_selection import train_test_split
        # 80% train, then the remaining 20% halved into validation and test.
        train_df, temp_df = train_test_split(
            combined, test_size=0.20, stratify=combined["label"], random_state=RANDOM_SEED
        )
        val_df, test_df = train_test_split(
            temp_df, test_size=0.50, stratify=temp_df["label"], random_state=RANDOM_SEED
        )
        splits = {TRAIN_OUT: train_df, VAL_OUT: val_df, TEST_OUT: test_df}
        for out_name, frame in splits.items():
            frame.to_csv(out_name, index=False, encoding="utf-8")
        artifacts.update(splits)

        # Print split statistics
        print(json.dumps({
            "splits": {
                "train": {"rows": len(train_df), "dist": train_df["label"].value_counts(normalize=True).round(4).to_dict(), "path": TRAIN_OUT},
                "val":   {"rows": len(val_df),   "dist": val_df["label"].value_counts(normalize=True).round(4).to_dict(),   "path": VAL_OUT},
                "test":  {"rows": len(test_df),  "dist": test_df["label"].value_counts(normalize=True).round(4).to_dict(),  "path": TEST_OUT},
            }
        }, indent=2))

    # Determine project directory (works in notebook or .py execution)
    base_dir = Path(__file__).resolve().parent if "__file__" in globals() else Path.cwd()
    base_dir.mkdir(parents=True, exist_ok=True)

    # Save all final artifacts to the project directory. Only the artifacts
    # that were actually produced are copied (the split frames do not exist
    # when DO_SPLITS is False), and a file is not rewritten when the project
    # directory copy is the very file written above.
    for out_name, frame in artifacts.items():
        target = base_dir / Path(out_name).name
        if target.resolve() != Path(out_name).resolve():
            frame.to_csv(target, index=False, encoding="utf-8")

    print("Saved to:", base_dir.resolve())

if __name__ == "__main__":
    main()

[SKIP] No rows after standardizing: test.csv
{
  "combined_rows": 248437,
  "dropped_duplicates": 717,
  "label_counts": {
    "1": 124585,
    "0": 123852
  },
  "output": "combined_suicide_ideation_dataset.csv",
  "sources": [
    "Suicide_Detection.csv",
    "Train_suicide1.csv",
    "testset.csv",
    "twitter-suicidal_data.csv"
  ]
}
{
  "splits": {
    "train": {
      "rows": 198749,
      "dist": {
        "1": 0.5015,
        "0": 0.4985
      },
      "path": "combined_train.csv"
    },
    "val": {
      "rows": 24844,
      "dist": {
        "1": 0.5015,
        "0": 0.4985
      },
      "path": "combined_val.csv"
    },
    "test": {
      "rows": 24844,
      "dist": {
        "1": 0.5014,
        "0": 0.4986
      },
      "path": "combined_test.csv"
    }
  }
}
Saved to: C:\Users\colin\OneDrive\Desktop\DS785_Project
