# Clustering

Setup environment <br>
Import relevant libraries

In [1]:
import os
os.environ["LOKY_MAX_CPU_COUNT"] = "16"

In [2]:
from bigdata_a3_utils import *
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os, pickle, gc

## Load Categories

Define a function to combine the dataframes into one for analysis

Now get the dataframe for the k-means clustering algorithm

In [3]:
def get_needed_cols(base_path, category):
    #columns we want
    needed= ['parent_asin', 'rating', 'brand', 'main_category']
    #read in pickle file
    print(f"Reading in {category} data")
    df= pd.read_pickle(base_path / f"cleaned_data_{category}.pkl") #.sample(frac=0.5, random_state=42)
    print(f"Finished reading in {category} data")
    
    columns_to_drop = df.columns.difference(needed)
    df.drop(columns=columns_to_drop, inplace=True)
    print("Dropped unneeded columns")
    gc.collect()
    return df

In [4]:
base_path= Path(r"F:\Saeed\frames")

dfs = []

#test run. Please run this to ensure you are getting the categories in the list below
#Once working, comment test code and uncomment real code and then Run All
categories= VALID_CATEGORIES
for category in categories:
    x = get_needed_cols(base_path, category)
    dfs.append(x)
    gc.collect()

df = pd.concat(dfs, ignore_index=True)

Reading in All_Beauty data
Finished reading in All_Beauty data
Dropped unneeded columns
Reading in Amazon_Fashion data
Finished reading in Amazon_Fashion data
Dropped unneeded columns
Reading in Appliances data
Finished reading in Appliances data
Dropped unneeded columns
Reading in Arts_Crafts_and_Sewing data
Finished reading in Arts_Crafts_and_Sewing data
Dropped unneeded columns
Reading in Automotive data
Finished reading in Automotive data
Dropped unneeded columns
Reading in Baby_Products data
Finished reading in Baby_Products data
Dropped unneeded columns
Reading in Beauty_and_Personal_Care data
Finished reading in Beauty_and_Personal_Care data
Dropped unneeded columns
Reading in Books data
Finished reading in Books data
Dropped unneeded columns
Reading in CDs_and_Vinyl data
Finished reading in CDs_and_Vinyl data
Dropped unneeded columns
Reading in Cell_Phones_and_Accessories data
Finished reading in Cell_Phones_and_Accessories data
Dropped unneeded columns
Reading in Clothing_Shoe

## Features

In [5]:
# Compute mean rating and total reviews for products
product_df = df.groupby('parent_asin').agg(
    mean_rating=('rating', 'mean'),
    total_reviews=('rating', 'count'),
    brand=('brand', lambda x: x.iloc[0]),
    main_category=('main_category', lambda x: x.iloc[0]),
).reset_index()

# Delete df to free up memory
del df
gc.collect()

# Encode categorical features, brand and main_category
print(f"Encoding categorical features...")
product_df['brand'] = product_df['brand'].fillna('Unknown')
product_df['main_category'] = product_df['main_category'].fillna('Unknown')

# Apply label encoding 
encoder_brand = LabelEncoder()
encoder_main_category = LabelEncoder()

product_df['brand_id'] = encoder_brand.fit_transform(product_df['brand'])
product_df['category_id'] = encoder_main_category.fit_transform(product_df['main_category'])
product_df = product_df.drop(columns=['brand', 'main_category'])

# Prepare matrix
features = product_df[['mean_rating', 'total_reviews', 'brand_id', 'category_id']]

product_df.head()


Encoding categorical features...


Unnamed: 0,parent_asin,mean_rating,total_reviews,brand_id,category_id
0,116,2.5,2,4312203,15
1,2488,2.0,1,1456546,15
2,4545,5.0,1,588752,15
3,5371,5.0,1,1313099,15
4,12297,1.0,1,524009,15


## k-means

In [6]:
# Apply KMeans clustering
print(f"Applying KMeans clustering...")
kmeans = KMeans(n_clusters=5, random_state=42)
product_df['cluster'] = kmeans.fit_predict(features)

product_df.head()

Applying KMeans clustering...


[WinError 2] The system cannot find the file specified
  File "f:\Saeed\.conda\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "f:\Saeed\.conda\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "f:\Saeed\.conda\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "f:\Saeed\.conda\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


Unnamed: 0,parent_asin,mean_rating,total_reviews,brand_id,category_id,cluster
0,116,2.5,2,4312203,15,1
1,2488,2.0,1,1456546,15,0
2,4545,5.0,1,588752,15,3
3,5371,5.0,1,1313099,15,0
4,12297,1.0,1,524009,15,3


## Cluster Analysis

In [7]:
# Analyse clusters
cluster_analysis = product_df.groupby('cluster').agg(
    cluster_size=('parent_asin', 'count'),
    avg_rating=('mean_rating', 'mean'),
    avg_total_reviews=('total_reviews', 'mean'),
    avg_brand_id=('brand_id', 'mean'),
    avg_category_id=('category_id', 'mean')
).reset_index()

for idx, row in cluster_analysis.iterrows():
    print(f"Cluster {int(row['cluster'])}:")
    print(f" - Size: {int(row['cluster_size'])}")
    print(f" - Avg Rating: {row['avg_rating']}")
    print(f" - Avg Total Reviews: {row['avg_total_reviews']}")
    print(f" - Avg Brand ID: {row['avg_brand_id']}")
    print(f" - Avg Category ID: {row['avg_category_id']}")


Cluster 0:
 - Size: 7385728
 - Avg Rating: 4.1086752011025345
 - Avg Total Reviews: 13.747635304197502
 - Avg Brand ID: 1530420.5619264615
 - Avg Category ID: 19.489696208687892
Cluster 1:
 - Size: 8065344
 - Avg Rating: 4.059386538538163
 - Avg Total Reviews: 13.889174100943494
 - Avg Brand ID: 4497649.942030867
 - Avg Category ID: 20.55063888161497
Cluster 2:
 - Size: 5286987
 - Avg Rating: 4.158852206386982
 - Avg Total Reviews: 13.498763095123934
 - Avg Brand ID: 2601137.032048121
 - Avg Category ID: 18.54691301491757
Cluster 3:
 - Size: 7519427
 - Avg Rating: 4.1219451407002525
 - Avg Total Reviews: 14.845236744767918
 - Avg Brand ID: 480334.2946079535
 - Avg Category ID: 18.40630103862967
Cluster 4:
 - Size: 7107842
 - Avg Rating: 4.121616815562576
 - Avg Total Reviews: 14.97391528961955
 - Avg Brand ID: 3518388.951001443
 - Avg Category ID: 19.222148297612694


## Interpretation