In [4]:
import pandas as pd
from pathlib import Path
import glob, os

# === output name (saved in the SAME folder) ===
OUT = "combined_wellness.csv"


def load_labelled_csv(fp):
    """Read one CSV with every cell as a string and expose a 'text' column.

    Header names are stripped of surrounding whitespace. A column whose name
    is exactly 'text' is used as-is; otherwise a single column matching
    'text' case-insensitively is renamed to 'text'.

    Args:
        fp: anything ``pandas.read_csv`` accepts (path or file-like object).

    Returns:
        The loaded DataFrame, with exactly one column named 'text'.

    Raises:
        ValueError: if no column matches 'text', or if more than one column
            could be the text column (renaming would create duplicates).
    """
    # dtype=str + keep_default_na=False: empty cells stay "" instead of NaN
    df = pd.read_csv(fp, dtype=str, keep_default_na=False, low_memory=False)
    # normalize header spacing
    df.columns = [c.strip() for c in df.columns]

    exact = [c for c in df.columns if c == "text"]
    loose = [c for c in df.columns if c.lower() == "text"]
    if not loose:
        raise ValueError(f"'text' column not found in {fp}")
    if len(exact) > 1 or (not exact and len(loose) > 1):
        # renaming (or header stripping) would leave several 'text' columns
        raise ValueError(f"ambiguous 'text' columns {loose} in {fp}")
    if not exact:
        df = df.rename(columns={loose[0]: "text"})
    return df


def combine_csvs(csv_files):
    """Stack several labelled CSVs into one de-duplicated DataFrame.

    The first file's columns define the canonical set and order. Later files
    are aligned to it: columns absent from a file are added as 0 and columns
    not present in the first file are dropped. Every non-'text' column is
    converted to int8; values that do not parse as numbers become 0. Text is
    stripped, rows with empty text are removed, and only the first row for
    each distinct text is kept.

    Args:
        csv_files: iterable of paths or file-like objects, in the order they
            should be read (order decides canonical columns and which
            duplicate wins).

    Returns:
        The combined DataFrame (its index is not reset after filtering).

    Raises:
        FileNotFoundError: if ``csv_files`` is empty.
        ValueError: propagated from ``load_labelled_csv``.
    """
    csv_files = list(csv_files)
    if not csv_files:
        # explicit raise: an ``assert`` would be stripped under ``python -O``
        raise FileNotFoundError("No CSV files found in the current folder.")

    frames = []
    canonical_cols = None  # the first file's columns are the canonical order

    for fp in csv_files:
        df = load_labelled_csv(fp)

        if canonical_cols is None:
            canonical_cols = list(df.columns)
        else:
            # one step: add missing columns as 0, drop extras, reorder.
            # reindex returns a new frame, so the assignments below do not
            # write into a selection of the frame that was read.
            df = df.reindex(columns=canonical_cols, fill_value=0)

        # coerce label columns to integers (-1/0/1 or 0/1)
        for c in canonical_cols:
            if c != "text":
                df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0).astype("int8")

        frames.append(df)

    combined = pd.concat(frames, ignore_index=True)

    # clean, dedupe by text
    combined["text"] = combined["text"].astype(str).str.strip()
    combined = combined[combined["text"] != ""]
    return combined.drop_duplicates(subset="text", keep="first")


# True both in a notebook cell and when run as a script; False on import,
# so the helpers above can be reused without touching the filesystem.
if __name__ == "__main__":
    # 1) find all CSVs in the current working directory (except the output)
    csv_files = sorted(f for f in glob.glob("*.csv") if os.path.basename(f) != OUT)

    # 2) + 3) concatenate, clean, dedupe by text
    combined = combine_csvs(csv_files)

    # 4) save next to your notebook
    combined.to_csv(OUT, index=False, encoding="utf-8")
    print(f"Saved {OUT} with shape {combined.shape} from {len(csv_files)} files.")


Saved combined_wellness.csv with shape (52361, 12) from 12 files.
