# In [None]:
# ======================================================
# 0) Load and clean full dataset, then sample 200 rows and plot distribution
# ======================================================
import os
import io
import csv
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Let pandas print up to 200 characters per cell before truncating.
pd.options.display.max_colwidth = 200

# Input locations: the cleaned CSV is preferred later in the script when it
# exists on disk; otherwise the raw CSV is used.
RAW_SOURCE = '../data_files/dehum_data_explicit.csv'
CLEAN_SOURCE = 'debug_cleaned.csv'

# Seeded random state shared by the sampling step.
RNG = np.random.RandomState(42)

# ------------------------------------------------------
# Helper: strip junk, sniff delimiter, rebuild rows permissively
# ------------------------------------------------------
def load_and_reconstruct(path):
    """Rebuild a DataFrame from a malformed delimited text file.

    The file is read as UTF-8 text (undecodable bytes are replaced), runs of
    trailing semicolons are stripped from every line, and the delimiter is
    sniffed from the first 50 lines (falling back to a comma).  The first
    line is taken as the header.  The remaining physical lines are buffered
    until the buffer holds at least ``ncols - 1`` delimiters; the buffered
    text is then parsed as one logical row, which lets a record that was
    broken across several lines be stitched back together.

    Args:
        path: Path of the text file to read.

    Returns:
        A ``pandas.DataFrame`` with one column per header field.  Column
        names are stripped and lower-cased; cell values are strings.

    Raises:
        ValueError: If the file contains no lines.
    """
    print(f"Loading raw text from {path}")
    with open(path, 'r', encoding='utf-8', errors='replace') as fh:
        raw = fh.read()

    # Remove the long semicolon padding runs, then any remaining trailing
    # semicolons at the end of each line.
    raw = raw.replace(';;;;;;;;;;;;;;;;;;;;;;', '')
    raw = re.sub(r";+\s*$", "", raw, flags=re.MULTILINE)

    lines = raw.splitlines()
    if not lines:
        raise ValueError("File is empty after cleaning")

    sample = '\n'.join(lines[:50])
    try:
        # Sniffer.sniff documents ``delimiters`` as a string of candidates.
        delim = csv.Sniffer().sniff(sample, delimiters=',;\t|').delimiter
    except csv.Error:
        delim = ','
    print(f"Detected delimiter: {repr(delim)}")

    header_cols = [c.strip() for c in lines[0].split(delim)]
    ncols = len(header_cols)
    print(f"Header columns detected: {ncols}")

    reconstructed = []
    buffer_lines = []
    expected_seps = ncols - 1

    def flush_buffer(buf):
        """Parse the buffered lines as one row of exactly ``ncols`` fields."""
        chunk = '\n'.join(buf)
        # First try standard double-quote quoting; then retry with double
        # quotes rewritten to single quotes and "'" as the quote character.
        attempts = ((chunk, '"'), (chunk.replace('"', "'"), "'"))
        for text, quote in attempts:
            try:
                parsed = next(csv.reader([text], delimiter=delim, quotechar=quote))
            except (csv.Error, StopIteration):
                continue
            if len(parsed) == ncols:
                return parsed
        # Permissive fallback: naive split, padded or truncated to ncols.
        pieces = chunk.split(delim)
        if len(pieces) < ncols:
            pieces.extend([''] * (ncols - len(pieces)))
        elif len(pieces) > ncols:
            pieces = pieces[:ncols]
        return [p.strip() for p in pieces]

    current_sep_count = 0
    for line in lines[1:]:
        buffer_lines.append(line)
        # Raw count: delimiters inside quoted fields are counted too.
        current_sep_count += line.count(delim)
        if current_sep_count >= expected_seps:
            reconstructed.append(flush_buffer(buffer_lines))
            buffer_lines = []
            current_sep_count = 0

    if buffer_lines:
        # Trailing lines that never reached the expected delimiter count.
        reconstructed.append(flush_buffer(buffer_lines))
        print("Warning: leftover buffer flushed with permissive split.")

    print(f"Reconstructed rows: {len(reconstructed)}")
    df = pd.DataFrame(reconstructed, columns=[c.strip().lower() for c in header_cols])
    return df

# ------------------------------------------------------
# Load data (prefer cleaned source if available)
# ------------------------------------------------------
# Use the cleaned file when it exists on disk; otherwise read the raw file.
source_path = CLEAN_SOURCE if os.path.exists(CLEAN_SOURCE) else RAW_SOURCE
print(f"Primary source selected: {source_path}")
try:
    df = pd.read_csv(source_path, sep=',', engine='python', dtype=str)
    print("Direct pandas read succeeded.")
except Exception as e:
    # Deliberately broad: any failure of the strict reader falls back to the
    # permissive line-by-line reconstruction.
    print(f"Direct read failed: {repr(e)}")
    df = load_and_reconstruct(source_path)

if df.empty:
    print("Initial DataFrame empty; forcing reconstruction.")
    df = load_and_reconstruct(source_path)

# Drop unnamed/index-like columns and empty first column
df = df.loc[:, [not str(c).lower().startswith('unnamed') for c in df.columns]]
if '' in df.columns and df[''].astype(str).str.strip().eq('').all():
    df = df.drop(columns=[''])

# Persist the cleaned full dataset for reuse
full_out = 'clean_full.csv'
df.to_csv(full_out, index=False)
print(f"Saved cleaned dataset to {full_out} (rows={len(df)})")

# ------------------------------------------------------
# Ensure label column and numeric conversions
# ------------------------------------------------------
label_cols = ['animal','subhuman','disease','inanimate','agg_animal']
if 'exp_dehum_binary' not in df.columns:
    present = [c for c in label_cols if c in df.columns]
    if present:
        print('Inferring exp_dehum_binary from subtype columns:', present)
        df[present] = df[present].fillna('0')
        # Coerce instead of astype(float): the reconstruction path yields ''
        # (not NaN) for missing cells, and any non-numeric text would raise.
        subtype_values = df[present].apply(pd.to_numeric, errors='coerce').fillna(0)
        df['exp_dehum_binary'] = (subtype_values.sum(axis=1) > 0).astype(int)
    elif 'explicit' in df.columns:
        print('Using explicit column to approximate exp_dehum_binary.')
        explicit_map = {'true':'1','false':'0','yes':'1','no':'0','nan':'0','none':'0','': '0'}
        normalized = df['explicit'].astype(str).str.strip().str.lower().replace(explicit_map)
        # Values outside the map (e.g. '1.0' or free text) become 0 or their
        # numeric value instead of raising in astype(int).
        df['exp_dehum_binary'] = pd.to_numeric(normalized, errors='coerce').fillna(0).astype(int)
    else:
        print('No subtype/explicit columns found; setting exp_dehum_binary to 0.')
        df['exp_dehum_binary'] = 0

# The column may have been read as strings (dtype=str); force integers.
df['exp_dehum_binary'] = pd.to_numeric(df['exp_dehum_binary'], errors='coerce').fillna(0).astype(int)

# ------------------------------------------------------
# Sample 200 rows (or fewer) and plot distribution
# ------------------------------------------------------
N = 200
n_take = min(N, len(df))
if n_take < N:
    print(f"Dataset has only {len(df)} rows; sampling all available rows.")
sample_df = df.sample(n=n_take, random_state=RNG).reset_index(drop=True)

sample_df['dehumanized_label'] = sample_df['exp_dehum_binary'].map({1: 'dehumanized', 0: 'not_dehumanized'})
# reindex fixes the bar order and keeps a zero bar for an absent class.
counts = sample_df['dehumanized_label'].value_counts().reindex(['dehumanized','not_dehumanized']).fillna(0).astype(int)
total = counts.sum()
# Guard the empty-sample case so percentages print as 0.0 rather than nan.
perc = (counts / total * 100).round(1) if total else counts.astype(float)

print('\nSample counts:')
for label, value in counts.items():
    print(f"  {label}: {value} ({perc[label]}%)")

sample_out = 'sample_200.csv'
sample_df.to_csv(sample_out, index=False)
print(f"Saved sample to {sample_out}")

plt.figure(figsize=(6,4))
ax = counts.plot(kind='bar', color=['#d62728','#1f77b4'])
plt.title(f'Dehumanization distribution in random sample (n={n_take})')
plt.ylabel('Count')
plt.xticks(rotation=0)
# Write each bar's count just above it.
for p in ax.patches:
    ax.annotate(f"{int(p.get_height())}", (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='bottom')
plt.tight_layout()
plt.show()

print('\nFirst 5 rows of the sample:')
cols_to_show = [c for c in ['instance_id','displayed_text','exp_dehum_binary'] if c in sample_df.columns]
print(sample_df.head()[cols_to_show].fillna('').to_string(index=False))
