In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

In [30]:
# Load the dataset
file_path = "../data/shopping_trends.csv"  # Change to your file path
df = pd.read_csv(file_path)

In [31]:
df.head()

Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Payment Method,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Preferred Payment Method,Frequency of Purchases
0,1,55,Male,Blouse,Clothing,53,Kentucky,L,Gray,Winter,3.1,Yes,Credit Card,Express,Yes,Yes,14,Venmo,Fortnightly
1,2,19,Male,Sweater,Clothing,64,Maine,L,Maroon,Winter,3.1,Yes,Bank Transfer,Express,Yes,Yes,2,Cash,Fortnightly
2,3,50,Male,Jeans,Clothing,73,Massachusetts,S,Maroon,Spring,3.1,Yes,Cash,Free Shipping,Yes,Yes,23,Credit Card,Weekly
3,4,21,Male,Sandals,Footwear,90,Rhode Island,M,Maroon,Spring,3.5,Yes,PayPal,Next Day Air,Yes,Yes,49,PayPal,Weekly
4,5,45,Male,Blouse,Clothing,49,Oregon,M,Turquoise,Spring,2.7,Yes,Cash,Free Shipping,Yes,Yes,31,PayPal,Annually


In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3900 entries, 0 to 3899
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Customer ID               3900 non-null   int64  
 1   Age                       3900 non-null   int64  
 2   Gender                    3900 non-null   object 
 3   Item Purchased            3900 non-null   object 
 4   Category                  3900 non-null   object 
 5   Purchase Amount (USD)     3900 non-null   int64  
 6   Location                  3900 non-null   object 
 7   Size                      3900 non-null   object 
 8   Color                     3900 non-null   object 
 9   Season                    3900 non-null   object 
 10  Review Rating             3900 non-null   float64
 11  Subscription Status       3900 non-null   object 
 12  Payment Method            3900 non-null   object 
 13  Shipping Type             3900 non-null   object 
 14  Discount

In [33]:
# Drop irrelevant column
df.drop(columns="Customer ID", axis=1, inplace=True)

In [34]:
df["Subscription Status"].value_counts()

Subscription Status
No     2847
Yes    1053
Name: count, dtype: int64

## Preprocessing

## Numerical Columns

In [35]:
numerical_columns = [col for col in df.columns if df[col].dtype in ["int64", "float64"]]
numerical_columns

['Age', 'Purchase Amount (USD)', 'Review Rating', 'Previous Purchases']

In [36]:
# #unique values in cat_col...
# for col in numerical_columns:
#     print(f"Unique values in '{col}':")
#     print(df[col].unique())
#     print("-" * 75)

In [37]:
categorical_columns = [col for col in df.columns if df[col].dtype not in ["int64", "float64"]]
categorical_columns

['Gender',
 'Item Purchased',
 'Category',
 'Location',
 'Size',
 'Color',
 'Season',
 'Subscription Status',
 'Payment Method',
 'Shipping Type',
 'Discount Applied',
 'Promo Code Used',
 'Preferred Payment Method',
 'Frequency of Purchases']

In [38]:
# #unique values in cat_col...
# for col in categorical_columns:
#     print(f"Unique values in '{col}':")
#     print(df[col].unique())
#     print("-" * 75)

## Scaling Numerical Data..
reasons:
- Avoiding Feature Domination
- its imp for Gradient-based models, Distance-based models, and Models that use regularization(L1, L2)

In [39]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [40]:
df[numerical_columns] = sc.fit_transform(df[numerical_columns])
df[numerical_columns]

Unnamed: 0,Age,Purchase Amount (USD),Review Rating,Previous Purchases
0,0.718913,-0.285629,-0.907584,-0.785831
1,-1.648629,0.178852,-0.907584,-1.616552
2,0.390088,0.558882,-0.907584,-0.162789
3,-1.517099,1.276716,-0.349027,1.637107
4,0.061263,-0.454531,-1.466141,0.391025
...,...,...,...,...
3895,-0.267563,-1.341267,0.628448,0.460252
3896,0.521618,-0.454531,1.047366,1.083293
3897,0.127028,-1.130139,-1.186862,-0.093563
3898,-0.004502,0.727784,0.069891,-0.093563


## Categorical Data Encoding
- converting categorical data into numerical..
  

In [41]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [42]:
le = LabelEncoder()
ohe = OneHotEncoder(drop='first') #to remove multicoliniarity 

In [43]:
#Applying Label Encoding...
df['Gender'] = le.fit_transform(df['Gender'])
df['Subscription Status'] = le.fit_transform(df['Subscription Status'])
df['Promo Code Used'] = le.fit_transform(df['Promo Code Used'])
df['Discount Applied'] = le.fit_transform(df['Discount Applied'])

In [44]:
# Applying OneHotEncoding..
encoded_columns = ohe.fit_transform(df[['Item Purchased', 'Category', 'Location', 'Size', 'Color', 
                                        'Season', 'Payment Method', 'Shipping Type', 
                                        'Preferred Payment Method','Frequency of Purchases']]).toarray()

# Converting the encoded columns into a DataFrame...
encoded_df = pd.DataFrame(encoded_columns, columns=ohe.get_feature_names_out())
transformed_df = pd.concat([df, encoded_df], axis=1)

In [45]:
transformed_df

Unnamed: 0,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,...,Preferred Payment Method_Credit Card,Preferred Payment Method_Debit Card,Preferred Payment Method_PayPal,Preferred Payment Method_Venmo,Frequency of Purchases_Bi-Weekly,Frequency of Purchases_Every 3 Months,Frequency of Purchases_Fortnightly,Frequency of Purchases_Monthly,Frequency of Purchases_Quarterly,Frequency of Purchases_Weekly
0,0.718913,1,Blouse,Clothing,-0.285629,Kentucky,L,Gray,Winter,-0.907584,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-1.648629,1,Sweater,Clothing,0.178852,Maine,L,Maroon,Winter,-0.907584,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.390088,1,Jeans,Clothing,0.558882,Massachusetts,S,Maroon,Spring,-0.907584,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-1.517099,1,Sandals,Footwear,1.276716,Rhode Island,M,Maroon,Spring,-0.349027,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.061263,1,Blouse,Clothing,-0.454531,Oregon,M,Turquoise,Spring,-1.466141,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3895,-0.267563,0,Hoodie,Clothing,-1.341267,Virginia,L,Turquoise,Summer,0.628448,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3896,0.521618,0,Backpack,Accessories,-0.454531,Iowa,L,White,Spring,1.047366,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3897,0.127028,0,Belt,Accessories,-1.130139,New Jersey,L,Green,Spring,-1.186862,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3898,-0.004502,0,Shoes,Footwear,0.727784,Minnesota,S,Brown,Summer,0.069891,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [46]:
transformed_df.drop(columns=['Item Purchased', 'Category', 'Location', 'Size', 'Color','Season', 'Payment Method', 'Shipping Type','Preferred Payment Method','Frequency of Purchases'], axis=1, inplace=True)

In [47]:
transformed_df

Unnamed: 0,Age,Gender,Purchase Amount (USD),Review Rating,Subscription Status,Discount Applied,Promo Code Used,Previous Purchases,Item Purchased_Belt,Item Purchased_Blouse,...,Preferred Payment Method_Credit Card,Preferred Payment Method_Debit Card,Preferred Payment Method_PayPal,Preferred Payment Method_Venmo,Frequency of Purchases_Bi-Weekly,Frequency of Purchases_Every 3 Months,Frequency of Purchases_Fortnightly,Frequency of Purchases_Monthly,Frequency of Purchases_Quarterly,Frequency of Purchases_Weekly
0,0.718913,1,-0.285629,-0.907584,1,1,1,-0.785831,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-1.648629,1,0.178852,-0.907584,1,1,1,-1.616552,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.390088,1,0.558882,-0.907584,1,1,1,-0.162789,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-1.517099,1,1.276716,-0.349027,1,1,1,1.637107,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.061263,1,-0.454531,-1.466141,1,1,1,0.391025,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3895,-0.267563,0,-1.341267,0.628448,0,0,0,0.460252,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3896,0.521618,0,-0.454531,1.047366,0,0,0,1.083293,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3897,0.127028,0,-1.130139,-1.186862,0,0,0,-0.093563,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3898,-0.004502,0,0.727784,0.069891,0,0,0,-0.093563,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [48]:
# input features and target...
X = transformed_df.drop(columns="Subscription Status", axis=1)
y = df["Subscription Status"]

In [49]:
# Correlation analysis...for reducing multicolinearity...
corr_matrix = X.corr()
#print(corr_matrix)
threshold = 0.5
highly_correlated_features = [
    corr_matrix.columns[j] 
    for i in range(len(corr_matrix.columns)) 
    for j in range(i + 1, len(corr_matrix.columns)) 
    if abs(corr_matrix.iloc[i, j]) > threshold
]
#highly_correlated_features = list(set(highly_correlated_features))
print(highly_correlated_features)

# highly_correlated_features = [corr_matrix.columns[j] for i in range(len(corr_matrix.columns)) for j in range(i + 1, len(corr_matrix.columns)) if abs(corr_matrix.iloc[i, j]) > threshold]
# print(len(highly_correlated_features))

# Droping highly correlated features....
X.drop(columns=highly_correlated_features, inplace=True, errors='ignore')



['Discount Applied', 'Promo Code Used', 'Promo Code Used', 'Category_Outerwear', 'Category_Outerwear']


In [50]:
# Split data...
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [52]:
# models...
models = {
    "Random Forest": RandomForestClassifier(class_weight='balanced'),
    "Decision Tree": DecisionTreeClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Logistic Regression": LogisticRegression(class_weight='balanced'),
    "AdaBoost Classifier": AdaBoostClassifier()
}



In [63]:
# Training and evaluation...
results = {}
cm = {}
for name, model in models.items():
    model.fit(X_train, y_train)  
    y_pred = model.predict(X_test)  
    accuracy = accuracy_score(y_test, y_pred)
    # cm = confusion_matrix(y_test, y_pred)
    # print(f"{name} = \n {cm}")
    results[name] = accuracy



In [56]:
# Display results
results_df = pd.DataFrame(list(results.items()), columns=["Model", "Accuracy"])
results_df.sort_values(by="Accuracy", ascending=False, inplace=True)
print(results_df)

                 Model  Accuracy
4  AdaBoost Classifier  0.729487
2    Gradient Boosting  0.717949
0        Random Forest  0.716667
1        Decision Tree  0.685897
3  Logistic Regression  0.602564


In [57]:
cm

array([[569,   0],
       [211,   0]])