In [1]:
!pip install pandas scikit-learn




In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans, AgglomerativeClustering
from itertools import combinations


In [3]:
import io

from google.colab import files

# Prompt for the dataset through Colab's upload widget (only works inside Colab).
uploaded = files.upload()

# The uploaded file should be 'online_shoppers_intention.csv'.
# Prefer the bytes returned by this upload over the fixed path on disk: when a file
# with the same name already exists, Colab saves the new copy under a de-duplicated
# name, so reading the fixed path would silently load the older copy.
csv_name = 'online_shoppers_intention.csv'
csv_source = io.BytesIO(uploaded[csv_name]) if csv_name in uploaded else csv_name
df = pd.read_csv(csv_source)
df.head()


Saving online_shoppers_intention.csv to online_shoppers_intention.csv


Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [4]:
# Cast the 'Weekend' and 'Revenue' flag columns to integer 0/1 values
for flag_column in ('Weekend', 'Revenue'):
    df[flag_column] = df[flag_column].astype(int)

# Mean (target) encoding helper
def mean_encode(df, column, target='Revenue'):
    """Encode ``column`` by the per-category mean of ``target``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both ``column`` and ``target``. It is not modified.
    column : str
        Name of the column whose values are used as grouping keys.
    target : str, default 'Revenue'
        Name of the column that is averaged within each group.

    Returns
    -------
    pandas.Series
        One value per row of ``df``: the mean of ``target`` over all rows
        sharing that row's ``column`` value.
    """
    per_category_mean = df[target].groupby(df[column]).mean()
    encoded = df[column].map(per_category_mean)
    return encoded

# Apply mean encoding: each Month / VisitorType value is replaced by the mean of
# Revenue over the rows sharing that value.
# NOTE(review): the encoding is derived from Revenue, which is also the reference
# labelling the clusterings are scored against further down; confirm that letting
# the label inform the features is intended.
df['Month'] = mean_encode(df, 'Month')
df['VisitorType'] = mean_encode(df, 'VisitorType')

# Drop high-cardinality or unused categorical columns.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# re-runnable: a second execution no longer raises KeyError for columns that
# were already removed.
df = df.drop(
    columns=['OperatingSystems', 'Browser', 'Region', 'TrafficType'],
    errors='ignore',
)

df.head()


Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,0.016304,0.139323,0,0
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,0.016304,0.139323,0,0
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,0.016304,0.139323,0,0
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,0.016304,0.139323,0,0
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,0.016304,0.139323,1,0


In [5]:
# Reference labels: the 'Revenue' column as a NumPy array
y = df['Revenue'].to_numpy()

# Feature matrix: every column except 'Revenue'
X = df.loc[:, df.columns != 'Revenue']


In [6]:
# Settings shared by both algorithms so they are asked for the same number of clusters
N_CLUSTERS = 4
RANDOM_STATE = 42

# KMeans with a fixed seed
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=RANDOM_STATE)
kmeans_labels = kmeans.fit_predict(X)

# Agglomerative clustering using complete linkage
agnes = AgglomerativeClustering(n_clusters=N_CLUSTERS, linkage='complete')
agnes_labels = agnes.fit_predict(X)


In [7]:
def rand_index(true_labels, cluster_labels):
    """Compute the Rand index between a reference labelling and a clustering.

    The Rand index is the fraction of sample pairs on which the two labellings
    agree: pairs placed together in both, plus pairs placed apart in both,
    divided by the total number of pairs.

    The pair counts are derived from label frequencies instead of visiting
    every pair, so the cost grows linearly with the number of samples rather
    than quadratically. The returned value is the same as the pairwise
    definition.

    Parameters
    ----------
    true_labels : sequence of hashable labels
        Reference label of each sample.
    cluster_labels : sequence of hashable labels
        Cluster assigned to each sample; must have the same length.

    Returns
    -------
    float
        Value in [0, 1]. With fewer than two samples there are no pairs to
        disagree on and 1.0 is returned.

    Raises
    ------
    ValueError
        If the two labellings do not have the same length.
    """
    from collections import Counter

    m = len(true_labels)
    if len(cluster_labels) != m:
        raise ValueError(
            "true_labels and cluster_labels must have the same length "
            f"(got {m} and {len(cluster_labels)})"
        )

    total_pairs = m * (m - 1) // 2
    if total_pairs == 0:
        # No pairs exist, so the labellings cannot disagree.
        return 1.0

    def _pairs_within(group_sizes):
        # Number of unordered pairs that fall inside the same group.
        return sum(size * (size - 1) // 2 for size in group_sizes)

    # Pairs grouped together by both labellings (the original "S").
    together_in_both = _pairs_within(Counter(zip(true_labels, cluster_labels)).values())
    together_in_true = _pairs_within(Counter(true_labels).values())
    together_in_cluster = _pairs_within(Counter(cluster_labels).values())

    # Pairs separated by both labellings (the original "D"), by inclusion-exclusion.
    apart_in_both = total_pairs - together_in_true - together_in_cluster + together_in_both

    return (together_in_both + apart_in_both) / total_pairs


In [8]:
# Score each clustering against the Revenue-derived reference labels
ri_kmeans = rand_index(y, kmeans_labels)
ri_agnes = rand_index(y, agnes_labels)

print(f"Rand Index (KMeans): {ri_kmeans:.4f}")
print(f"Rand Index (Agglomerative - Complete Linkage): {ri_agnes:.4f}")

# Report the higher score; an exact tie is reported as such instead of being
# attributed to Agglomerative Clustering by a catch-all else branch.
if ri_kmeans > ri_agnes:
    print("✅ KMeans performed better based on Rand Index.")
elif ri_agnes > ri_kmeans:
    print("✅ Agglomerative Clustering performed better based on Rand Index.")
else:
    print("✅ Both methods achieved the same Rand Index.")


Rand Index (KMeans): 0.5792
Rand Index (Agglomerative - Complete Linkage): 0.7377
✅ Agglomerative Clustering performed better based on Rand Index.
