In [None]:
import glob
import os

import duckdb
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Directory containing all your cleaned .parquet files
CLEANED_DIR = "/root/Merged"
# glob returns matches in arbitrary order; sorting keeps the row order of the
# combined frame (and any order-dependent aggregation such as 'first')
# reproducible from run to run.
all_files = sorted(glob.glob(os.path.join(CLEANED_DIR, "*_merged.parquet")))

# Step 1: Load and preprocess data
df_list = []
for file in all_files:
    print(f"Processing {os.path.basename(file)}...")
    try:
        # Load relevant columns, keeping only rows whose rating is in 1..5.
        # The path is handed to DuckDB as a value instead of being pasted into
        # a SQL string, so a file name containing a quote cannot break the
        # query.
        frame = (
            duckdb.read_parquet(file)
            .filter("rating BETWEEN 1 AND 5")
            .project("asin, rating, brand, main_category")
            .df()
        )
    except Exception as e:
        # Best effort: report the unreadable file and carry on with the rest.
        print(f"Error reading {file}: {e}")
    else:
        df_list.append(frame)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that says what actually went wrong instead.
if not df_list:
    raise ValueError(
        f"No review data loaded: {len(all_files)} file(s) matched "
        f"'*_merged.parquet' in {CLEANED_DIR!r} and none could be read."
    )

# Combine all data
df = pd.concat(df_list, ignore_index=True)

# Build one feature row per product (asin)
print("Computing features...")
per_product_aggs = {
    'mean_rating': pd.NamedAgg(column='rating', aggfunc='mean'),
    'total_reviews': pd.NamedAgg(column='rating', aggfunc='count'),
    'brand': pd.NamedAgg(column='brand', aggfunc='first'),
    'category': pd.NamedAgg(column='main_category', aggfunc='first'),
}
product_features = df.groupby('asin').agg(**per_product_aggs)
product_features = product_features.reset_index()

# Turn brand and category labels into integer codes; missing labels are
# bucketed under 'Unknown' before encoding.
brand_labels = product_features['brand'].fillna('Unknown')
le_brand = LabelEncoder()
product_features['brand_id'] = le_brand.fit_transform(brand_labels)

category_labels = product_features['category'].fillna('Unknown')
le_category = LabelEncoder()
product_features['category_id'] = le_category.fit_transform(category_labels)

# Feature matrix handed to the clustering step (any remaining NaN becomes 0)
feature_columns = ['mean_rating', 'total_reviews', 'brand_id', 'category_id']
X = product_features[feature_columns].fillna(0)

# Step 2: Apply k-means clustering
print("Applying k-means clustering...")
# K-means groups points by Euclidean distance, so a column with a wide range
# (review counts, integer ids) would drown out mean_rating, which only spans
# 1..5. Standardize every feature to zero mean / unit variance first.
# `scaler` is kept so cluster centers can be mapped back with
# scaler.inverse_transform(kmeans.cluster_centers_).
# NOTE(review): brand_id / category_id are arbitrary label codes, so distances
# between them carry no real meaning even after scaling - consider dropping or
# one-hot encoding them.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# n_init is pinned because its default changed from 10 to 'auto' in
# scikit-learn 1.4; an explicit value gives the same behavior on every version.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
product_features['cluster'] = kmeans.fit_predict(X_scaled)

# Step 3: Analyze clusters
print("Analyzing clusters...")
# One summary row per cluster: its size plus the mean of each feature.
cluster_analysis = product_features.groupby('cluster').agg(
    size=('asin', 'count'),
    avg_mean_rating=('mean_rating', 'mean'),
    avg_total_reviews=('total_reviews', 'mean'),
    avg_brand_id=('brand_id', 'mean'),
    avg_category_id=('category_id', 'mean')
).reset_index()

# Add interpretations
# NOTE(review): these are example labels only. K-means cluster ids are
# arbitrary, so revise the text for each id after inspecting the statistics
# printed below.
CLUSTER_INTERPRETATIONS = {
    0: "High-rating electronics",  # Example interpretation
    1: "Unknown-brand items",      # Example interpretation
    2: "Low-rating products",      # Example interpretation
    3: "High-review count items",  # Example interpretation
    4: "Miscellaneous",            # Example interpretation
}
# Looked up by cluster id rather than assigned positionally: a fixed
# five-element list raises a length-mismatch ValueError whenever fewer than
# five clusters show up in the summary.
cluster_analysis['interpretation'] = cluster_analysis['cluster'].map(
    CLUSTER_INTERPRETATIONS
)

# Display results
print(cluster_analysis)