In [3]:
import os
import glob
import duckdb
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Path to your cleaned dataset
CLEANED_DIR = "/root/deduped_mix"
all_files = glob.glob(os.path.join(CLEANED_DIR, "*_merged.parquet"))

print(f"Scanning {len(all_files)} files...")

sample_rows = 100000 
dfs = []

for f in all_files:
    try:
        df = duckdb.sql(f"""
            SELECT text, rating
            FROM read_parquet('{f}', union_by_name=True)
            WHERE rating BETWEEN 1 AND 5 AND text IS NOT NULL
            USING SAMPLE BERNOULLI(0.3 PERCENT)
            LIMIT {sample_rows}
        """).df()
        
        df = df[df["text"].str.strip().astype(bool)]
        df["sentiment"] = (df["rating"] > 3).astype(int)
        dfs.append(df)
    except Exception as e:
        print(f"Skipped {os.path.basename(f)}: {e}")

if not dfs:
    raise RuntimeError("No valid data loaded.")

df = pd.concat(dfs, ignore_index=True).sample(frac=1.0, random_state=42)
print(f"Total samples: {len(df):,}")

# TF-IDF vectorization (exact criteria)
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=5,
    max_df=0.8,  # 80% max document frequency
)

X = vectorizer.fit_transform(df["text"])
y = df["sentiment"]

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

# Logistic Regression classifier
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Evaluation
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print("\n=== Binary Sentiment Classification Results ===")
print(f"Accuracy     : {acc:.4f}")
print(f"F1 Score     : {f1:.4f}")
print(f"Confusion Matrix:\n{cm}")


Scanning 33 files...


Total samples: 1,376,163

=== Binary Sentiment Classification Results ===
Accuracy     : 0.8890
F1 Score     : 0.9307
Confusion Matrix:
[[ 39423  21012]
 [  9551 205247]]
