# Clustering

Setup environment <br>
Import relevant libraries

In [2]:
# %pip install scikit-learn
# %pip install matplotlib
# %pip install seaborn
# %pip install pandas

from bigdata_a3_utils import *
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os, pickle, gc

## Load Categories

Define bath path

In [3]:
base_path = Path('D:\COMP3610A3\dataframes')

Main code loop below

In [5]:
for category in ['Movies_and_TV']:
    # Load the reviews and metadata for the specified category
    pickle_path = os.path.join(base_path, f'cleaned_data_{category}.pkl')
    df = pd.read_pickle(pickle_path)
    
    if df.empty:
        print(f"No data found for category: {category}")
        continue
    
    print(f"Loaded df for category: {category}")
    
    # Compute mean rating and total reviews for product
    print(f"Computing mean rating and total reviews for {category}...")
    aggregate_df = df.groupby('parent_asin').agg(
        mean_rating=('rating', 'mean'),
        total_reviews=('rating', 'count'),
        brand=('details', lambda x: x.iloc[0].get('brand') if isinstance(x.iloc[0], dict) else None),
        main_category=('main_category', lambda x: x.iloc[0]),
    ).reset_index()
    
    # Delete df to free up memory
    print(f"Deleting df to free up memory...")
    del df
    gc.collect()
    
    # Encode categorical features, brand and main_category
    print(f"Encoding categorical features...")
    aggregate_df['brand'] = aggregate_df['brand'].fillna('Unknown')
    aggregate_df['main_category'] = aggregate_df['main_category'].fillna('Unknown')
    
    # Apply label encoding 
    encoder_brand = LabelEncoder()
    encoder_main_category = LabelEncoder()
    
    aggregate_df['brand_id'] = encoder_brand.fit_transform(aggregate_df['brand'])
    aggregate_df['category_id'] = encoder_main_category.fit_transform(aggregate_df['main_category'])
    
    # Drop irrelevant columns
    product_df = aggregate_df[['parent_asin', 'mean_rating', 'total_reviews', 'brand_id', 'category_id']]
    
    # Delete aggregate_df to free up memory
    print(f"Deleting aggregate_df to free up memory...")
    del aggregate_df
    gc.collect()
    
    # Prepare matrix
    features = product_df[['mean_rating', 'total_reviews', 'brand_id', 'category_id']]
    
    # Apply KMeans clustering
    print(f"Applying KMeans clustering...")
    kmeans = KMeans(n_clusters=5, random_state=42)
    product_df['cluster'] = kmeans.fit_predict(features)
    
    # Analyse clusters
    cluster_analysis = product_df.groupby('cluster').agg(
        cluster_size=('parent_asin', 'count'),
        avg_rating=('mean_rating', 'mean'),
        avg_total_reviews=('total_reviews', 'mean'),
        avg_brand_id=('brand_id', 'mean'),
        avg_category_id=('category_id', 'mean')
    ).reset_index()
    
    print(f"Cluster analysis for {category}:")
    for idx, row in cluster_analysis.iterrows():
        print(f"Cluster {int(row['cluster'])}:")
        print(f" - Size: {row['cluster_size']}")
        print(f" - Avg Rating: {row['avg_rating']}")
        print(f" - Avg Total Reviews: {row['avg_total_reviews']}")
        print(f" - Avg Brand ID: {row['avg_brand_id']}")
        print(f" - Avg Category ID: {row['avg_category_id']}")
        print(f" - Interpretation: ")
        print()


    

Loaded df for category: Movies_and_TV
Computing mean rating and total reviews for Movies_and_TV...
Deleting df to free up memory...
Encoding categorical features...
Deleting aggregate_df to free up memory...
Applying KMeans clustering...
Cluster analysis for Movies_and_TV:
Cluster 0:
 - Size: 743508.0
 - Avg Rating: 3.979678516824734
 - Avg Total Reviews: 14.988901262662944
 - Avg Brand ID: 0.0
 - Avg Category ID: 21.339626473420594
 - Interpretation: 

Cluster 1:
 - Size: 231.0
 - Avg Rating: 4.359838165093903
 - Avg Total Reviews: 5751.233766233766
 - Avg Brand ID: 0.0
 - Avg Category ID: 22.861471861471863
 - Interpretation: 

Cluster 2:
 - Size: 37.0
 - Avg Rating: 4.509718834332255
 - Avg Total Reviews: 17649.216216216217
 - Avg Brand ID: 0.0
 - Avg Category ID: 23.0
 - Interpretation: 

Cluster 3:
 - Size: 3.0
 - Avg Rating: 4.591263527049477
 - Avg Total Reviews: 55158.666666666664
 - Avg Brand ID: 0.0
 - Avg Category ID: 23.0
 - Interpretation: 

Cluster 4:
 - Size: 3960.0
 - A