In [1]:
import os
import glob
import duckdb
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Path to cleaned parquet files
CLEANED_DIR = r"/root/cleaned_parquets"

print("Scanning for files...")
all_files = glob.glob(os.path.join(CLEANED_DIR, "*_merged.parquet"))
print(f"Found {len(all_files)} files")

# Step 1: Collect small batches from each file
dfs = []
for f in all_files:
    try:
        print(f"Reading {os.path.basename(f)}...")
        df = duckdb.sql(
            f"""
            SELECT text, rating
            FROM '{f}'
            WHERE rating BETWEEN 1 AND 5 AND text IS NOT NULL
            LIMIT 1000000
            """
        ).df()

        df = df[df["text"].str.strip().astype(bool)]  # Remove empty strings
        df["sentiment"] = df["rating"].apply(lambda x: 1 if x > 3 else 0)
        dfs.append(df)

    except Exception as e:
        print(f"Skipped {os.path.basename(f)} due to error: {e}")

# Step 2: Combine and clean
if not dfs:
    raise ValueError("No valid reviews were loaded.")

df = pd.concat(dfs, ignore_index=True)
print(f"Total samples loaded: {len(df):,}")

# Step 3: Vectorize text using TF-IDF
print("Vectorizing text...")
vectorizer = TfidfVectorizer(min_df=5, max_df=0.8, stop_words='english')
X = vectorizer.fit_transform(df["text"])
y = df["sentiment"]

# Step 4: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Train classifier
print("Training model...")
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Step 6: Evaluate
print("Evaluation")
y_pred = clf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Scanning for files...
Found 32 files
Reading Video_Games_merged.parquet...
Reading Amazon_Fashion_merged.parquet...
Reading Software_merged.parquet...
Reading Health_and_Personal_Care_merged.parquet...
Reading Musical_Instruments_merged.parquet...
Reading Arts_Crafts_and_Sewing_merged.parquet...
Reading Home_and_Kitchen_merged.parquet...
Skipped Home_and_Kitchen_merged.parquet due to error: Invalid Input Error: No magic bytes found at end of file '/root/cleaned_parquets/Home_and_Kitchen_merged.parquet'
Reading Handmade_Products_merged.parquet...
Reading Baby_Products_merged.parquet...
Reading Electronics_merged.parquet...
Reading CDs_and_Vinyl_merged.parquet...
Reading Digital_Music_merged.parquet...
Reading Patio_Lawn_and_Garden_merged.parquet...
Reading Office_Products_merged.parquet...
Reading Beauty_and_Personal_Care_merged.parquet...
Reading Kindle_Store_merged.parquet...
Skipped Kindle_Store_merged.parquet due to error: Invalid Input Error: No magic bytes found at end of file '/r

: 