In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder, StandardScaler
import duckdb
import glob
import os

# Directory containing cleaned parquet files
CLEANED_DIR = r"/root/deduped_mix"
# Sorted so the concatenation order below (and therefore the row order fed to
# k-means) does not depend on the order in which glob happens to list files.
all_files = sorted(glob.glob(os.path.join(CLEANED_DIR, "*_merged.parquet")))

product_feature_list = []
failed_files = []


def _first_or_null(column, available_columns):
    """Return a SQL expression yielding FIRST(column), or a typed NULL if the column is absent.

    Parameters
    ----------
    column : str
        Name of the column to aggregate.
    available_columns : set of str
        Column names actually present in the parquet file being queried.

    Returns
    -------
    str
        SQL fragment that can be placed in the SELECT list.
    """
    if column in available_columns:
        return f'FIRST("{column}")'
    return "CAST(NULL AS VARCHAR)"


con = duckdb.connect()
try:
    for file in all_files:
        print(f"Processing {os.path.basename(file)}...")
        try:
            # Double any single quote so the path stays a valid SQL string literal.
            sql_path = file.replace("'", "''")
            source = f"read_parquet('{sql_path}', union_by_name=True)"

            # Not every file carries every metadata column (a file without
            # `brand` used to abort with a Binder Error and was skipped
            # entirely). Inspect the schema first and fall back to NULL so the
            # file's products are still aggregated; NULLs become 'Unknown' below.
            described = con.execute(f"DESCRIBE SELECT * FROM {source}").fetchall()
            available_columns = {row[0] for row in described}
            brand_expr = _first_or_null("brand", available_columns)
            category_expr = _first_or_null("main_category", available_columns)

            # Aggregate per file directly inside DuckDB.
            # NOTE: per the DuckDB documentation the sample is applied to the
            # rows coming out of FROM, i.e. before the WHERE filter and the
            # aggregation, so mean_rating / total_reviews describe the sampled
            # reviews only. The sample is unseeded, so it varies between runs.
            df = con.execute(f"""
                SELECT asin,
                       AVG(rating) AS mean_rating,
                       COUNT(rating) AS total_reviews,
                       {brand_expr} AS brand,
                       {category_expr} AS category
                FROM {source}
                WHERE rating BETWEEN 1 AND 5
                GROUP BY asin
                USING SAMPLE BERNOULLI(0.5 PERCENT)  -- super lightweight sampling
            """).fetchdf()

            product_feature_list.append(df)

        except Exception as e:
            # Best effort: report the failure and keep going with the other files.
            failed_files.append(file)
            print(f"Error reading {file}: {e}")
finally:
    # Release the connection even if the loop is interrupted.
    con.close()

if failed_files:
    print(f"Skipped {len(failed_files)} of {len(all_files)} file(s) because of errors.")

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that points at the actual cause instead.
if not product_feature_list:
    raise ValueError(
        f"No product features could be loaded from {CLEANED_DIR!r}: "
        f"{len(all_files)} file(s) matched '*_merged.parquet', "
        f"{len(failed_files)} failed to read."
    )

# Now concatenate small aggregates
product_features = pd.concat(product_feature_list, ignore_index=True)

# Encode brand and category as integer ids (missing values share one 'Unknown' class).
# NOTE(review): these ids are arbitrary codes, yet they are scaled and fed to
# k-means as if numerically ordered; confirm this is the intended treatment.
le_brand = LabelEncoder()
le_category = LabelEncoder()
product_features['brand_id'] = le_brand.fit_transform(product_features['brand'].fillna('Unknown'))
product_features['category_id'] = le_category.fit_transform(product_features['category'].fillna('Unknown'))

# Feature matrix
X = product_features[['mean_rating', 'total_reviews', 'brand_id', 'category_id']].fillna(0)

# Scale features so that no single column dominates the Euclidean distance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# KMeans
print("Applying k-means clustering...")
kmeans = KMeans(n_clusters=5, random_state=42, n_init='auto')
product_features['cluster'] = kmeans.fit_predict(X_scaled)

# Cluster Analysis: size and per-feature averages for each cluster
print("Analyzing clusters...")
cluster_analysis = product_features.groupby('cluster').agg(
    size=('asin', 'count'),
    avg_mean_rating=('mean_rating', 'mean'),
    avg_total_reviews=('total_reviews', 'mean'),
    avg_brand_id=('brand_id', 'mean'),
    avg_category_id=('category_id', 'mean')
).reset_index()

print(cluster_analysis)


Processing Video_Games_merged.parquet...
Processing Amazon_Fashion_merged.parquet...
Processing Software_merged.parquet...
Processing Health_and_Personal_Care_merged.parquet...
Processing Arts_Crafts_and_Sewing_merged.parquet...
Processing Home_and_Kitchen_merged.parquet...
Processing Handmade_Products_merged.parquet...
Processing Baby_Products_merged.parquet...
Processing Unknown_merged.parquet...
Processing Electronics_merged.parquet...
Processing CDs_and_Vinyl_merged.parquet...
Processing Digital_Music_merged.parquet...
Processing Patio_Lawn_and_Garden_merged.parquet...
Processing Office_Products_merged.parquet...
Processing Beauty_and_Personal_Care_merged.parquet...
Error reading /root/deduped_mix/Beauty_and_Personal_Care_merged.parquet: Binder Error: Referenced column "brand" not found in FROM clause!
Candidate bindings: "rating", "rating_number", "user_id", "main_category", "parent_asin"

LINE 5:                    FIRST(brand) AS brand,
                                 ^
Process

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Reduce dimensions to 2D with PCA for visualization.
# Relies on `X_scaled` and `product_features['cluster']` produced by the
# clustering cell; run that cell first.
print("Reducing dimensions with PCA...")
# random_state keeps the projection reproducible in case scikit-learn selects
# one of its randomized SVD solvers for this input.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Plot the clusters
print("Plotting cluster map...")
fig, ax = plt.subplots(figsize=(10, 7))
scatter = ax.scatter(
    X_pca[:, 0],
    X_pca[:, 1],
    c=product_features['cluster'],
    cmap='tab10',
    alpha=0.7,
)
ax.set_title('Product Clusters (PCA Reduced)')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
# Cluster ids are discrete labels, so show one legend entry per cluster
# rather than a continuous colorbar.
ax.legend(*scatter.legend_elements(), title='Cluster')
ax.grid(True)
plt.show()


Reducing dimensions with PCA...
Plotting cluster map...
