# **Data Import**

Importing the necessary libraries and data

In [10]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

In [11]:
# Load the data
sales_data = pd.read_csv('data/sales_data.csv')
outlet_data = pd.read_csv('data/outlets_data.csv')
product_data = pd.read_csv('data/product_data.csv')
week_data = pd.read_csv('data/week_data.csv')
freezer_data = pd.read_csv('data/freezer_data.csv')

Let's try to have a breif look at these data

In [12]:
sales_data.head()

Unnamed: 0,Outlet_ID,week,pid,product_name,no_units
0,ID7203,1,IP1,Vanilla Mini Cone,874.0
1,ID7203,1,IP2,Chocolate Petite Bar,105.0
2,ID7203,1,IP3,Strawberry Tiny Cup,1198.0
3,ID7203,1,IP4,Mint Bite-size Pop,502.0
4,ID7203,1,IP5,Butter Pecan Small Stick,710.0


In [14]:
outlet_data.head()

Unnamed: 0,Outlet_ID,area(sqft)
0,ID7203,5660
1,ID1878,4138
2,ID1740,5546
3,ID4366,5134
4,ID8358,4100


In [15]:
product_data.head()

Unnamed: 0,pid,volume,product_name,price
0,IP1,0.2,Vanilla Mini Cone,100
1,IP2,0.2,Chocolate Petite Bar,90
2,IP3,0.2,Strawberry Tiny Cup,110
3,IP4,0.2,Mint Bite-size Pop,100
4,IP5,0.2,Butter Pecan Small Stick,100


In [16]:
week_data.head()

Unnamed: 0,Week,Start Date,End Date
0,Week 1,1/2/2023,1/8/2023
1,Week 2,1/9/2023,1/15/2023
2,Week 3,1/16/2023,1/22/2023
3,Week 4,1/23/2023,1/29/2023
4,Week 5,1/30/2023,2/5/2023


In [18]:
freezer_data.head()

Unnamed: 0,Freezer Name,Model Number,Volume Capacity (Liters),Power and maitainance Cost (LKR) per 100 hours
0,ChillMaster,M001,55,2500
1,FreezeZone,M002,75,2800
2,CoolTech,M003,120,3200
3,mini CoolTech,M004,30,1800
4,IceBlast Pro,M005,150,3500


# **Preprocess the Data**

Let's enrich the outlets_data with 
1.   Outlet Size
2.   Outlet Space Availability
3.   Outlet Sales
4.   Outlet Location

In [13]:
# Data preprocessing
sales_data = sales_data.merge(product_data, on='pid', how='left')
sales_data['total_volume'] = sales_data['no_units'] * sales_data['volume']
outlet_sales = sales_data.groupby('Outlet_ID')['total_volume'].sum().reset_index()
outlet_sales = outlet_sales.merge(outlet_data, on='Outlet_ID', how='left')

# Perform store segmentation
X = outlet_sales[['total_volume', 'area(sqft)']]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Find the optimal number of clusters
scores = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    scores.append(silhouette_score(X_scaled, kmeans.labels_))

optimal_k = np.argmax(scores) + 2

# Fit the KMeans model with optimal number of clusters
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
kmeans.fit(X_scaled)
outlet_sales['cluster'] = kmeans.labels_

# Calculate metrics and recommend suitable freezer types
def calculate_roi_metrics(cluster_df):
    roi_metrics = []
    for _, row in freezer_data.iterrows():
        cluster_df['estimated_sales_volume'] = (cluster_df['total_volume'] * row['Volume Capacity (Liters)']) / cluster_df['area(sqft)']
        cluster_df['roi'] = cluster_df['estimated_sales_volume'] / (row['Power and maitainance Cost (LKR) per 100 hours'] + row['Volume Capacity (Liters)'])
        roi = cluster_df['roi'].mean()
        roi_metrics.append((row['Model Number'], roi))

    return sorted(roi_metrics, key=lambda x: x[1], reverse=True)[0]

recommended_freezers = outlet_sales.groupby('cluster').apply(calculate_roi_metrics).reset_index(name='recommended_freezer')
recommended_freezers = recommended_freezers.merge(freezer_data, left_on='recommended_freezer', right_on='Model Number', how='left')

# Evaluation metrics
inertia = kmeans.inertia_
silhouette = silhouette_score(X_scaled, kmeans.labels_)
davies_bouldin = davies_bouldin_score(X_scaled, kmeans.labels_)
calinski_harabasz = calinski_harabasz_score(X_scaled, kmeans.labels_)

print("Inertia:", inertia)
print("Silhouette Coefficient:", silhouette)
print("Davies-Bouldin Index:", davies_bouldin)
print("Calinski-Harabasz Index:", calinski_harabasz)
print("\nRecommended Freezers:")
print(recommended_freezers[['cluster', 'Freezer Name', 'Model Number']])




Inertia: 828.3149747453526
Silhouette Coefficient: 0.556478938441641
Davies-Bouldin Index: 0.7194786429426779
Calinski-Harabasz Index: 1366.2020332727245

Recommended Freezers:
   cluster Freezer Name Model Number
0        0          NaN          NaN
1        1          NaN          NaN


