In [1]:
import pandas as pd
import numpy as np
import datetime
import pickle
from datetime import datetime 
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the fitted artifacts used for scoring: the KMeans model, the one-hot
# encoder for `product_preference`, and the feature scaler.
#
# Paths use forward slashes: the previous 'Models\k...' / '\o...' / '\s...'
# literals relied on invalid escape sequences (a SyntaxWarning on recent
# Python versions) and only resolved to a directory on Windows.
#
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# artifacts produced by a trusted training run.
with open('Models/kmeans_model.pkl', 'rb') as file:
    model = pickle.load(file)

with open('Models/onehot_encoder_product_prefreence.pkl', 'rb') as file:
    onehot_encoder_product = pickle.load(file)

with open('Models/scaler.pkl', 'rb') as file:
    scaler = pickle.load(file)

In [95]:
def preprocessing(df, as_of=None):
    """Derive the customer-level features used for clustering.

    The input frame is not modified; all work happens on a copy, so the
    function can safely be called repeatedly on the same raw frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw customer data. The columns read here are ``customer_id``,
        ``membership_start_date``, ``last_purchase_date`` (both parsed
        day-first), ``product_preference``, ``product_segment``,
        ``store_visit_frequency``, ``purchase_frequency_per_month``,
        ``average_purchase_value`` and ``days_since_last_visit``.
    as_of : datetime-like, optional
        Date against which ``Recency`` is measured. Defaults to
        ``datetime.today()``; pass a fixed date for reproducible output.

    Returns
    -------
    pandas.DataFrame
        An independent frame with the columns ``customer_id``,
        ``product_preference``, ``store_visit_frequency``, ``Active_days``,
        ``days_since_last_visit``, ``Avg_purchase_gap_days``, ``Recency``,
        ``Monetary``, ``Frequency`` and ``Avg_monetary``.
    """
    # Work on a copy: adding/renaming columns on the caller's frame made a
    # second call fail with KeyError('average_purchase_value').
    df = df.copy()

    df['membership_start_date'] = pd.to_datetime(df['membership_start_date'], dayfirst=True)
    df['last_purchase_date'] = pd.to_datetime(df['last_purchase_date'], dayfirst=True)

    # Clean text columns
    df['product_preference'] = df['product_preference'].str.lower().str.strip()
    df['product_segment'] = df['product_segment'].str.lower().str.strip()

    # Reference date for Active_days: the latest purchase seen in the data
    reference_date = df['last_purchase_date'].max()

    # Days between membership start and the reference date
    df['Active_days'] = (reference_date - df['membership_start_date']).dt.days.round().astype(int)

    # Average gap between visits; rows whose visit count is not positive fall
    # back to Active_days (divisor of 1) instead of dividing by zero.
    visits = df['store_visit_frequency']
    df['Avg_purchase_gap_days'] = df['Active_days'] / visits.where(visits > 0, 1)

    # Estimated purchase count: monthly rate times membership length in
    # 30-day months (start date to last purchase).
    membership_months = (df['last_purchase_date'] - df['membership_start_date']).dt.days / 30.0
    df['Frequency'] = (df['purchase_frequency_per_month'] * membership_months).round().astype(int)

    # Total spend estimate
    df['Monetary'] = df['Frequency'] * df['average_purchase_value']

    # Days since last purchase, relative to `as_of` (today by default)
    recency_anchor = datetime.today() if as_of is None else pd.Timestamp(as_of)
    df['Recency'] = (recency_anchor - df['last_purchase_date']).dt.days

    df = df.rename(columns={'average_purchase_value': 'Avg_monetary', 'product_segment': 'Product_category'})

    output_columns = [
        'customer_id', 'product_preference', 'store_visit_frequency', 'Active_days',
        'days_since_last_visit', 'Avg_purchase_gap_days', 'Recency', 'Monetary',
        'Frequency', 'Avg_monetary',
    ]
    # Return an independent frame so callers can add columns (e.g. cluster
    # labels) without a SettingWithCopyWarning.
    return df[output_columns].copy()

# Read the raw customer export from the working directory and derive the
# clustering features with preprocessing(); the last line displays the result.
data = pd.read_csv('customer.csv')  
cleaned_data = preprocessing(data)

cleaned_data


Unnamed: 0,customer_id,product_preference,store_visit_frequency,Active_days,days_since_last_visit,Avg_purchase_gap_days,Recency,Monetary,Frequency,Avg_monetary
0,CUST1000,gym gear,17,1715,108,100.882353,578,37218.06,99,375.94
1,CUST1001,running shoes,20,110,107,5.500000,92,624.75,5,124.95
2,CUST1002,gym gear,10,491,55,49.100000,403,4298.84,28,153.53
3,CUST1003,supplements,1,1697,137,1697.000000,750,23879.66,118,202.37
4,CUST1004,training wear,18,694,17,38.555556,86,48731.58,171,284.98
...,...,...,...,...,...,...,...,...,...,...
995,CUST1995,running shoes,18,120,55,6.666667,39,2605.05,15,173.67
996,CUST1996,supplements,6,156,57,26.000000,11,1935.68,4,483.92
997,CUST1997,yoga accessories,9,1148,168,127.555556,774,46312.00,100,463.12
998,CUST1998,yoga accessories,12,1572,135,131.000000,195,41451.70,199,208.30


In [96]:
# One-hot encode `product_preference` with the encoder loaded from disk.
product_encoded = onehot_encoder_product.transform(cleaned_data[['product_preference']])

# Reuse cleaned_data's index: the later column-wise concat aligns on index, so
# a fresh RangeIndex would misalign rows (and introduce NaNs) whenever
# cleaned_data does not carry the default 0..n-1 index.
product_features = pd.DataFrame(
    product_encoded,
    columns=onehot_encoder_product.get_feature_names_out(['product_preference']),
    index=cleaned_data.index,
)
product_features

Unnamed: 0,product_preference_gym gear,product_preference_running shoes,product_preference_supplements,product_preference_training wear,product_preference_yoga accessories
0,1.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...
995,0.0,1.0,0.0,0.0,0.0
996,0.0,0.0,1.0,0.0,0.0
997,0.0,0.0,0.0,0.0,1.0
998,0.0,0.0,0.0,0.0,1.0


In [97]:
# Swap the raw `product_preference` text column for its one-hot columns,
# placing the encoded columns to the right of the remaining features.
data = pd.concat(
    [cleaned_data.drop(columns='product_preference'), product_features],
    axis='columns',
)
data

Unnamed: 0,customer_id,store_visit_frequency,Active_days,days_since_last_visit,Avg_purchase_gap_days,Recency,Monetary,Frequency,Avg_monetary,product_preference_gym gear,product_preference_running shoes,product_preference_supplements,product_preference_training wear,product_preference_yoga accessories
0,CUST1000,17,1715,108,100.882353,578,37218.06,99,375.94,1.0,0.0,0.0,0.0,0.0
1,CUST1001,20,110,107,5.500000,92,624.75,5,124.95,0.0,1.0,0.0,0.0,0.0
2,CUST1002,10,491,55,49.100000,403,4298.84,28,153.53,1.0,0.0,0.0,0.0,0.0
3,CUST1003,1,1697,137,1697.000000,750,23879.66,118,202.37,0.0,0.0,1.0,0.0,0.0
4,CUST1004,18,694,17,38.555556,86,48731.58,171,284.98,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,CUST1995,18,120,55,6.666667,39,2605.05,15,173.67,0.0,1.0,0.0,0.0,0.0
996,CUST1996,6,156,57,26.000000,11,1935.68,4,483.92,0.0,0.0,1.0,0.0,0.0
997,CUST1997,9,1148,168,127.555556,774,46312.00,100,463.12,0.0,0.0,0.0,0.0,1.0
998,CUST1998,12,1572,135,131.000000,195,41451.70,199,208.30,0.0,0.0,0.0,0.0,1.0


In [99]:
# Everything except the identifier is fed to the scaler and the model.
# NOTE(review): column order presumably has to match what the scaler/model
# were fitted on — confirm against the training notebook.
features = data.drop(columns=['customer_id'])

scaled_data = scaler.transform(features)

# Use the labels predicted for THIS data. `model.labels_` holds the labels of
# the training samples from fit time, so assigning it here gave stale (or
# length-mismatched) clusters whenever the scored data differs from the
# training data. `assign` returns a new frame, which also avoids writing into
# a possible slice of another frame.
cleaned_data = cleaned_data.assign(cluster=model.predict(scaled_data))
cleaned_data

Unnamed: 0,customer_id,product_preference,store_visit_frequency,Active_days,days_since_last_visit,Avg_purchase_gap_days,Recency,Monetary,Frequency,Avg_monetary,cluster
0,CUST1000,gym gear,17,1715,108,100.882353,578,37218.06,99,375.94,1
1,CUST1001,running shoes,20,110,107,5.500000,92,624.75,5,124.95,0
2,CUST1002,gym gear,10,491,55,49.100000,403,4298.84,28,153.53,1
3,CUST1003,supplements,1,1697,137,1697.000000,750,23879.66,118,202.37,3
4,CUST1004,training wear,18,694,17,38.555556,86,48731.58,171,284.98,1
...,...,...,...,...,...,...,...,...,...,...,...
995,CUST1995,running shoes,18,120,55,6.666667,39,2605.05,15,173.67,0
996,CUST1996,supplements,6,156,57,26.000000,11,1935.68,4,483.92,3
997,CUST1997,yoga accessories,9,1148,168,127.555556,774,46312.00,100,463.12,2
998,CUST1998,yoga accessories,12,1572,135,131.000000,195,41451.70,199,208.30,2


In [100]:
# Per-cluster totals of estimated purchase count and spend.
agg = (
    cleaned_data
    .groupby('cluster')[['Frequency', 'Monetary']]
    .sum()
    .reset_index()
)

# Average spend per purchase within each cluster.
agg['Unit Price'] = agg['Monetary'].div(agg['Frequency'])
agg


Unnamed: 0,cluster,Frequency,Monetary,Unit Price
0,0,13383,3354460.77,250.650883
1,1,32834,8578452.29,261.267354
2,2,16579,4255496.14,256.679905
3,3,18592,5008885.44,269.410792
