# Clustering

Setup environment <br>
Import relevant libraries

In [14]:
# %pip install scikit-learn
# %pip install matplotlib
# %pip install seaborn
# %pip install pandas

from bigdata_a3_utils import *
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os, pickle, gc

Define bath path

In [None]:
# base_path = Path('D:\COMP3610A3\dataframes')

## Load Categories

Define a function to combine the dataframes into one for analysis

In [None]:
# def get_dataset(base_path, category):
#     combined_df = pd.DataFrame()
    
#     for category in VALID_CATEGORIES:
#         try:
#             pickle_path = os.path.join(base_path, f'cleaned_data_{category}.pkl')
#             category_df = pd.read_pickle(pickle_path)
#         except Exception as e:
#             print(f"Error processing category {category}: {e}")
#             continue
        
#         print(f"Processing category: {category}")
#         category_df = category_df[['parent_asin', 'rating', 'brand', 'main_category']]
#         combined_df = pd.concat([combined_df, category_df], ignore_index=True)
        
#         del category_df
#         gc.collect()
    
#     if combined_df.empty:
#         print("No data available after processing all categories...")
#         return None
    
#     return combined_df
        

Now get the dataframe for the k-means clustering algorithm

In [None]:
# df = get_dataset(base_path, VALID_CATEGORIES)
# df.head()

In [None]:
def get_needed_cols(base_path, category):
    #columns we want
    needed= ['parent_asin', 'rating', 'brand', 'main_category']
    #read in pickle file
    print(f"Reading in {category} data")
    df= pd.read_pickle(base_path / f"cleaned_data_{category}.pkl")
    print(f"Finished reading in {category} data")
    
    columns_to_drop = df.columns.difference(needed)
    df.drop(columns=columns_to_drop, inplace=True)
    print("Dropped unneeded columns")
    gc.collect()
    return df

In [None]:
base_path= Path(r"F:\Saeed\frames")

dfs = []

#test run. Please run this to ensure you are getting the categories in the list below
#Once working, comment test code and uncomment real code and then Run All
categories= VALID_CATEGORIES
for category in categories:
    x = get_needed_cols(base_path, category)
    dfs.append(x)
    gc.collect()

df = pd.concat(dfs, ignore_index=True)

## Features

In [18]:
# Compute mean rating and total reviews for products
product_df = df.groupby('parent_asin').agg(
    mean_rating=('rating', 'mean'),
    total_reviews=('rating', 'count'),
    brand=('brand', lambda x: x.iloc[0]),
    main_category=('main_category', lambda x: x.iloc[0]),
).reset_index()

# Delete df to free up memory
del df
gc.collect()

# Encode categorical features, brand and main_category
print(f"Encoding categorical features...")
product_df['brand'] = product_df['brand'].fillna('Unknown')
product_df['main_category'] = product_df['main_category'].fillna('Unknown')

# Apply label encoding 
encoder_brand = LabelEncoder()
encoder_main_category = LabelEncoder()

product_df['brand_id'] = encoder_brand.fit_transform(product_df['brand'])
product_df['category_id'] = encoder_main_category.fit_transform(product_df['main_category'])
product_df = product_df.drop(columns=['brand', 'main_category'])

# Prepare matrix
features = product_df[['mean_rating', 'total_reviews', 'brand_id', 'category_id']]

product_df.head()


Encoding categorical features...


Unnamed: 0,parent_asin,mean_rating,total_reviews,brand_id,category_id
0,41386,1.0,1,355584,28
1,98906,4.203704,108,364099,29
2,143499,5.0,1,15874,28
3,143502,5.0,1,15872,28
4,143529,5.0,1,15872,41


## k-means

In [19]:
# Apply KMeans clustering
print(f"Applying KMeans clustering...")
kmeans = KMeans(n_clusters=5, random_state=42)
product_df['cluster'] = kmeans.fit_predict(features)

product_df.head()

Applying KMeans clustering...


Unnamed: 0,parent_asin,mean_rating,total_reviews,brand_id,category_id,cluster
0,41386,1.0,1,355584,28,1
1,98906,4.203704,108,364099,29,1
2,143499,5.0,1,15874,28,0
3,143502,5.0,1,15872,28,0
4,143529,5.0,1,15872,41,0


## Cluster Analysis

In [20]:
# Analyse clusters
cluster_analysis = product_df.groupby('cluster').agg(
    cluster_size=('parent_asin', 'count'),
    avg_rating=('mean_rating', 'mean'),
    avg_total_reviews=('total_reviews', 'mean'),
    avg_brand_id=('brand_id', 'mean'),
    avg_category_id=('category_id', 'mean')
).reset_index()

for idx, row in cluster_analysis.iterrows():
    print(f"Cluster {int(row['cluster'])}:")
    print(f" - Size: {int(row['cluster_size'])}")
    print(f" - Avg Rating: {row['avg_rating']}")
    print(f" - Avg Total Reviews: {row['avg_total_reviews']}")
    print(f" - Avg Brand ID: {row['avg_brand_id']}")
    print(f" - Avg Category ID: {row['avg_category_id']}")


Cluster 0:
 - Size: 343493
 - Avg Rating: 4.096760928471286
 - Avg Total Reviews: 23.193969018291494
 - Avg Brand ID: 38832.842171456185
 - Avg Category ID: 27.758891738696278
Cluster 1:
 - Size: 738255
 - Avg Rating: 3.9597142325450165
 - Avg Total Reviews: 24.806543809388355
 - Avg Brand ID: 357416.28764180397
 - Avg Category ID: 30.625873173903326
Cluster 2:
 - Size: 239465
 - Avg Rating: 4.073790872545362
 - Avg Total Reviews: 22.303443091892344
 - Avg Brand ID: 208567.7760466039
 - Avg Category ID: 27.846228050028188
Cluster 3:
 - Size: 410615
 - Avg Rating: 4.073248762936489
 - Avg Total Reviews: 22.906963944327412
 - Avg Brand ID: 279537.120616636
 - Avg Category ID: 28.235115619254046
Cluster 4:
 - Size: 431839
 - Avg Rating: 4.054319535161279
 - Avg Total Reviews: 19.577828774149626
 - Avg Brand ID: 123563.33120908486
 - Avg Category ID: 28.116897732719835


## Interpretation