In [17]:
import os, sys

# The notebook is expected to run from notebooks/reproducibility, so the
# repository root sits two directory levels above the working directory.
repo_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

# Put the root at the front of the import path (once) so that `src.*`
# modules can be imported further down.
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)

# Show the resolved root and its top-level entries as a quick visual check.
print("Repo root:", repo_root, "Contents:", os.listdir(repo_root))

import pandas as pd
from src.preprocessing import load_data, split_and_save, fit_tfidf, save_vectorizer

# Label columns used throughout this notebook (summed for the sanity check
# and passed to split_and_save).
LABELS = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]


Repo root: C:\Users\ual-laptop\Toxic_Bias_Audit Contents: ['.git', '.virtual_documents', 'anaconda_projects', 'data', 'Dockerfile', 'environment.yml', 'ethical_audit', 'experiments', 'notebooks', 'README.md', 'report', 'src', 'tests']


Load & Inspect Raw Data

In [18]:
# Read <repo_root>/data/raw/train.csv through the project loader.
# NOTE(review): load_data presumably also drops rows with missing values,
# as the message printed below suggests — confirm in src/preprocessing.
raw_csv = os.path.join(repo_root, 'data', 'raw', 'train.csv')
df = load_data(raw_csv)

# Sanity report: total row count, then the number of positive examples per
# label, most frequent label first.
print(f"Total samples after dropping NAs: {len(df)}")
print(
    "Label counts:",
    df[LABELS].sum().sort_values(ascending=False),
    sep="\n",
)


Total samples after dropping NAs: 159571
Label counts:
toxic            15294
obscene           8449
insult            7877
severe_toxic      1595
identity_hate     1405
threat             478
dtype: int64


Train/Validation Split & Save

In [19]:
# Directory that receives the processed train/validation CSVs.
processed_dir = os.path.join(repo_root, 'data', 'processed')

# Split the cleaned frame 80/20 with a fixed seed. The return value of
# split_and_save is not used here (the original note says it only writes
# files); the splits are read back from disk just below.
split_and_save(df, labels=LABELS, test_size=0.2, random_state=42, output_dir=processed_dir)

print(f"Train/validation CSVs written to {processed_dir}")

# Read both splits back into DataFrames, train first, then validation.
train_df, val_df = (
    pd.read_csv(os.path.join(processed_dir, csv_name))
    for csv_name in ('train.csv', 'val.csv')
)

print(f"Loaded train → {train_df.shape}, val → {val_df.shape}")


Train/validation CSVs written to C:\Users\ual-laptop\Toxic_Bias_Audit\data\processed
Loaded train → (127656, 7), val → (31915, 7)


Fit & Persist TF-IDF Vectorizer

In [20]:
# Vectorizer settings, collected in one place so they are easy to spot and tune.
tfidf_params = {
    'max_features': 10000,
    'ngram_range': (1, 2),
    'stop_words': 'english',
}

# Fit on the training comments only; the validation split is not passed in.
tfidf = fit_tfidf(train_df['comment_text'], **tfidf_params)

# Persist the fitted vectorizer alongside the processed CSVs.
tfidf_path = os.path.join(processed_dir, 'tfidf.pkl')
save_vectorizer(tfidf, output_path=tfidf_path)

print(f"TF-IDF vectorizer saved to {tfidf_path}")


TF-IDF vectorizer saved to C:\Users\ual-laptop\Toxic_Bias_Audit\data\processed\tfidf.pkl


Verification

In [21]:
# Reload the processed CSVs from disk so the checks below exercise the files
# themselves rather than the frames already held in memory.
train_check = pd.read_csv(os.path.join(processed_dir, 'train.csv'))
val_check   = pd.read_csv(os.path.join(processed_dir, 'val.csv'))
print("Processed train shape:", train_check.shape)
print("Processed val shape:  ", val_check.shape)

# Actual verification (not just a printout): the split must neither lose nor
# duplicate rows relative to the cleaned dataset it was made from.
assert train_check.shape[0] + val_check.shape[0] == df.shape[0], (
    "train/val row counts do not add up to the cleaned dataset size"
)

# Load and inspect the vectorizer.
# SECURITY: pickle.load can execute arbitrary code while unpickling; only
# load files written by this pipeline, never a pickle from an untrusted source.
import pickle
with open(os.path.join(processed_dir, 'tfidf.pkl'), 'rb') as f:
    vec = pickle.load(f)
print("TF-IDF vocab size:", len(vec.vocabulary_))
# Show 10 sample terms. The keys are taken in the mapping's own iteration
# order, which is presumably not feature-index order (verify if the order
# matters) — treat these as samples, not as "the first 10 features".
print("Sample features:", list(vec.vocabulary_.keys())[:10])


Processed train shape: (127656, 7)
Processed val shape:   (31915, 7)
TF-IDF vocab size: 10000
Sample features: ['unfounded', 'personal', 'attacks', 'talk', 'page', 'just', 'gets', 'better', 'suppose', 'blame']
