In [9]:
import pandas as pd
import os
import warnings
warnings.filterwarnings("ignore")

In [2]:
# The project root is taken to be the parent of the current working directory.
project_folder = os.path.dirname(os.getcwd())

# Input and output CSV files live in <project>/data.
data_folder = os.path.join(project_folder, 'data')
csv_file_path = os.path.join(data_folder, 'pre_processes_data.csv')

# Load the pre-processed table and preview its first rows.
df = pd.read_csv(csv_file_path)
df.head()


Unnamed: 0,ID,Year_Birth,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,...,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response,Education,Marital_Status,Dt_Customer
0,5524.0,1957.0,58138.0,0.0,0.0,58.0,635.0,88.0,546.0,172.0,...,0.0,0.0,0.0,0.0,3.0,11.0,1.0,2,4,2012-09-04
1,2174.0,1954.0,46344.0,1.0,1.0,38.0,11.0,1.0,6.0,2.0,...,0.0,0.0,0.0,0.0,3.0,11.0,0.0,2,4,2014-03-08
2,4141.0,1965.0,71613.0,0.0,0.0,26.0,426.0,49.0,127.0,111.0,...,0.0,0.0,0.0,0.0,3.0,11.0,0.0,2,5,2013-08-21
3,6182.0,1984.0,26646.0,1.0,0.0,26.0,11.0,4.0,20.0,10.0,...,0.0,0.0,0.0,0.0,3.0,11.0,0.0,2,5,2014-02-10
4,5324.0,1981.0,58293.0,1.0,0.0,94.0,173.0,43.0,118.0,46.0,...,0.0,0.0,0.0,0.0,3.0,11.0,0.0,4,3,2014-01-19


In [3]:
# Show the column labels of the loaded DataFrame
df.columns

Index(['ID', 'Year_Birth', 'Income', 'Kidhome', 'Teenhome', 'Recency',
       'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
       'MntSweetProducts', 'MntGoldProds', 'NumDealsPurchases',
       'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases',
       'NumWebVisitsMonth', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5',
       'AcceptedCmp1', 'AcceptedCmp2', 'Complain', 'Z_CostContact',
       'Z_Revenue', 'Response', 'Education', 'Marital_Status', 'Dt_Customer'],
      dtype='object')

# Customer Product Interest

In [5]:
# Features are chosen by domain knowledge: only the columns that record the
# amount spent per product category are used for the product-interest clustering.
# .copy() makes `features` an independent frame, so adding a column to it later
# does not rely on pandas view/copy semantics (no SettingWithCopyWarning).
features = df[['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
               'MntSweetProducts', 'MntGoldProds']].copy()
features.head()

Unnamed: 0,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds
0,635.0,88.0,546.0,172.0,88.0,88.0
1,11.0,1.0,6.0,2.0,1.0,6.0
2,426.0,49.0,127.0,111.0,21.0,42.0
3,11.0,4.0,20.0,10.0,3.0,5.0
4,173.0,43.0,118.0,46.0,27.0,15.0


In [6]:
#Scaling the Data
from sklearn.preprocessing import StandardScaler

# Standardise every spending column so that no single product category
# dominates the distances used by KMeans.
# A later cell adds a 'Group' label column to `features`; it is dropped here
# (when present) so that re-running this cell never feeds cluster labels back
# into the scaler. On a fresh run the column does not exist and nothing changes.
scaler = StandardScaler()
data = scaler.fit_transform(features.drop(columns=['Group'], errors='ignore'))


In [30]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Pick the number of clusters: fit KMeans once per candidate k and keep the k
# whose labelling has the highest mean silhouette score on the scaled data.
k_values = range(3, 5)  # candidates are k = 3 and k = 4
best_k = 0
max_silhouette_score = -1

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(data)
    silhouette_avg = silhouette_score(data, kmeans.labels_)

    # Strict comparison: on a tie the k seen first is kept.
    if silhouette_avg > max_silhouette_score:
        best_k, max_silhouette_score = k, silhouette_avg

print(f"Best k: {best_k}")
print(f"Max Silhouette Score: {max_silhouette_score}")

Best k: 4
Max Silhouette Score: 0.4716977141167002


In [31]:
# Fit the final KMeans model with the number of clusters selected above
model = KMeans(n_clusters=best_k, init='k-means++', random_state=42)
model.fit(data)

# Attach each row's cluster label. assign() returns a new frame carrying the
# label column instead of writing into `features` in place, which keeps the
# cell re-runnable and avoids pandas' SettingWithCopyWarning on frames that
# were derived from another DataFrame.
features = features.assign(Group=model.labels_)
print("Number of clusters: ", features['Group'].nunique())
features.head()

Number of clusters:  4


Unnamed: 0,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,Group
0,635.0,88.0,546.0,172.0,88.0,88.0,3
1,11.0,1.0,6.0,2.0,1.0,6.0,1
2,426.0,49.0,127.0,111.0,21.0,42.0,2
3,11.0,4.0,20.0,10.0,3.0,5.0,1
4,173.0,43.0,118.0,46.0,27.0,15.0,1


In [32]:
#Summary: mean of every feature column within each cluster label
features.groupby('Group').mean()

Unnamed: 0_level_0,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,472.949495,26.747475,180.712121,33.565657,23.868687,143.954545
1,96.674547,6.35855,35.626478,8.903861,6.361702,17.328605
2,724.558442,35.831169,360.145455,53.290909,34.81039,39.774026
3,511.900293,90.181818,430.923754,129.903226,97.428152,89.912023


## Conclusion
 ### Group 0:
  * Moderate spending on wines (472.95) and meat products (180.71).
  * Relatively low spending on fruits (26.75), fish products (33.57), and sweet products (23.87), but the highest spending on gold products (143.95) of all groups.
  * Overall, this group spends moderately, with a focus on wines and meat products and a distinct preference for gold products.

 ### Group 1:
  * Lowest spending across all product categories.
  * Minimal spending on wines (96.67), fruits (6.36), meat products (35.63), fish products (8.90), sweet products (6.36), and gold products (17.33).
  * Indicates a frugal and minimalistic spending behavior compared to other groups.
  
 ### Group 2:
  * Highest spending on wines (724.56) of all groups and high spending on meat products (360.15).
  * Moderate spending on fish products (53.29), fruits (35.83), sweet products (34.81), and gold products (39.77).
  * This group appears to be enthusiasts in terms of spending on wines and meat products.

 ### Group 3:
  * Significant spending on wines (511.90), meat products (430.92), and fish products (129.90).
  * High spending on fruits (90.18), sweet products (97.43), and gold products (89.91).
  * This group represents customers with a diverse and relatively high spending pattern across all product categories.

Overall Summary:

* Group 2 stands out as the highest spenders on wines, with high spending on meat products as well.
* Group 3 focuses on high spending on meat and fish products, with notable spending on fruits and sweets.
* Group 0 represents customers with moderate spending on wines and meat products and the highest spending on gold products.
* Group 1 consists of low spenders across all categories.


In [33]:
# Write the product-interest features (including the 'Group' label column) to the data folder
csv_file_path = os.path.join(data_folder, 'product_intrest.csv')
features.to_csv(csv_file_path, index=False)

# Predicting the Behaviour Group

In [45]:
# Feature selection: purchase counts per channel plus monthly web visits.
# .copy() makes `data` an independent frame, so adding a column to it later
# does not rely on pandas view/copy semantics (no SettingWithCopyWarning).
data = df[['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
           'NumStorePurchases', 'NumWebVisitsMonth']].copy()
data.head()

Unnamed: 0,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth
0,3.0,8.0,10.0,4.0,7.0
1,2.0,1.0,1.0,2.0,5.0
2,1.0,8.0,2.0,10.0,4.0
3,2.0,2.0,0.0,4.0,6.0
4,5.0,5.0,3.0,6.0,5.0


In [46]:
#Scaling the Data
from sklearn.preprocessing import StandardScaler

# Standardise every purchase-behaviour column so that no single column
# dominates the distances used by KMeans.
# A later cell adds a 'Group' label column to `data`; it is dropped here (when
# present) so that re-running this cell never feeds cluster labels back into
# the scaler. On a fresh run the column does not exist and nothing changes.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data.drop(columns=['Group'], errors='ignore'))

In [47]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Pick the number of clusters: fit KMeans once per candidate k and keep the k
# whose labelling has the highest mean silhouette score on the scaled data.
k_values = range(3, 5)  # candidates are k = 3 and k = 4
best_k = 0
max_silhouette_score = -1

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(scaled_data)
    silhouette_avg = silhouette_score(scaled_data, kmeans.labels_)

    # Strict comparison: on a tie the k seen first is kept.
    if silhouette_avg > max_silhouette_score:
        best_k, max_silhouette_score = k, silhouette_avg

print(f"Best k: {best_k}")
print(f"Max Silhouette Score: {max_silhouette_score}")

Best k: 3
Max Silhouette Score: 0.3806667159909972


In [48]:
# Fit the final KMeans model with the number of clusters selected above
model = KMeans(n_clusters=best_k, init='k-means++', random_state=42)
model.fit(scaled_data)

# Attach each row's cluster label. assign() returns a new frame carrying the
# label column instead of writing into `data` in place, which keeps the cell
# re-runnable and avoids pandas' SettingWithCopyWarning on frames that were
# derived from another DataFrame.
data = data.assign(Group=model.labels_)
print("Number of clusters: ", data['Group'].nunique())
data.head()

Number of clusters:  3


Unnamed: 0,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Group
0,3.0,8.0,10.0,4.0,7.0,2
1,2.0,1.0,1.0,2.0,5.0,1
2,1.0,8.0,2.0,10.0,4.0,0
3,2.0,2.0,0.0,4.0,6.0,1
4,5.0,5.0,3.0,6.0,5.0,2


In [49]:
#Summary: mean of every feature column within each cluster label
data.groupby('Group').mean()

Unnamed: 0_level_0,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1.410587,4.88269,5.619456,8.44206,2.799714
1,1.881579,2.024291,0.557692,3.185223,6.406883
2,4.468379,7.0,2.733202,7.280632,6.644269


## Conclusion
  ### Group 0:
   * Lowest number of deals purchases (1.41) and highest number of store purchases (8.44) among all groups.
   * Relatively high number of web purchases (4.88) and the highest number of catalog purchases (5.62), but the lowest number of web visits per month (2.80).
   * Best purchase behavior includes a balanced mix of online (web and catalog) and in-store purchases, with a focus on store purchases.

  ### Group 1:
   * High number of web visits per month (6.41), close to the highest value (6.64, Group 2).
   * Low number of deals purchases (1.88), web purchases (2.02), and store purchases (3.19).
   * Lowest engagement in catalog purchases (0.56).
   * Best purchase behavior involves a strong emphasis on online activities, particularly web visits, indicating a preference for online exploration and less reliance on in-store and catalog purchases.

  ### Group 2:
   * Highest number of deals purchases (4.47) and web purchases (7.00) among all groups.
   * High engagement in store purchases (7.28) and moderate catalog purchases (2.73).
   * Highest number of web visits per month (6.64) among all groups.
   * Best purchase behavior includes a diverse mix of purchases across all categories, with a notable emphasis on deals and web-based transactions.

In summary, each customer group exhibits distinct purchase behaviors:

* Group 0: Balanced mix of online and in-store purchases, with an emphasis on store purchases.
* Group 1: Strong focus on online activities, especially web visits, with fewer catalog purchases.
* Group 2: Diverse mix of purchases, with a strong emphasis on deals and web transactions.

In [51]:
# Write the purchase-behaviour features (including the 'Group' label column) to the data folder
csv_file_path = os.path.join(data_folder, 'purchase_behaviour.csv')
data.to_csv(csv_file_path, index=False)