In [43]:
import pandas as pd 
import numpy as np
import plotly.express as px
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from category_encoders import BinaryEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.metrics import r2_score,mean_squared_error,make_scorer
from sklearn.feature_selection import SequentialFeatureSelector, SelectKBest, f_regression,RFE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

In [2]:
df=pd.read_csv('black_friday.csv',index_col=[0])

# EDA

In [3]:
df.sample(20)

Unnamed: 0_level_0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1004979,P00313142,M,36-45,2,B,1,1,8,,,7944
1002436,P00127242,M,26-35,7,B,2,0,1,16.0,,15255
1002947,P00337942,F,26-35,0,A,1,1,5,,,5257
1003724,P00113742,M,36-45,6,B,4+,0,5,14.0,,8676
1000303,P00051442,M,26-35,7,C,1,1,8,17.0,,10071
1002659,P00202242,M,36-45,14,B,4+,1,10,16.0,,19300
1002685,P00119742,M,46-50,7,C,2,1,5,8.0,12.0,8636
1000839,P00127942,M,26-35,0,A,2,0,1,2.0,5.0,15297
1002223,P00033042,M,26-35,2,B,0,1,5,14.0,,7172
1001451,P00044342,M,36-45,20,B,0,0,8,,,9796


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 550068 entries, 1000001 to 1006039
Data columns (total 11 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Product_ID                  550068 non-null  object 
 1   Gender                      550068 non-null  object 
 2   Age                         550068 non-null  object 
 3   Occupation                  550068 non-null  int64  
 4   City_Category               550068 non-null  object 
 5   Stay_In_Current_City_Years  550068 non-null  object 
 6   Marital_Status              550068 non-null  int64  
 7   Product_Category_1          550068 non-null  int64  
 8   Product_Category_2          376430 non-null  float64
 9   Product_Category_3          166821 non-null  float64
 10  Purchase                    550068 non-null  int64  
dtypes: float64(2), int64(4), object(5)
memory usage: 50.4+ MB


In [5]:
df['Stay_In_Current_City_Years'].value_counts()

1     193821
2     101838
3      95285
4+     84726
0      74398
Name: Stay_In_Current_City_Years, dtype: int64

In [6]:
df['Product_ID'].unique()

array(['P00069042', 'P00248942', 'P00087842', ..., 'P00370293',
       'P00371644', 'P00370853'], dtype=object)

In [7]:
df['Gender'].unique()

array(['F', 'M'], dtype=object)

In [8]:
df['Occupation'].unique()

array([10, 16, 15,  7, 20,  9,  1, 12, 17,  0,  3,  4, 11,  8, 19,  2, 18,
        5, 14, 13,  6], dtype=int64)

In [9]:
df['City_Category'].unique()

array(['A', 'C', 'B'], dtype=object)

In [10]:
df['Marital_Status'].unique()

array([0, 1], dtype=int64)

In [11]:
df['Product_Category_1'].unique()

array([ 3,  1, 12,  8,  5,  4,  2,  6, 14, 11, 13, 15,  7, 16, 18, 10, 17,
        9, 20, 19], dtype=int64)

In [12]:
df['Product_Category_2'].unique()

array([nan,  6., 14.,  2.,  8., 15., 16., 11.,  5.,  3.,  4., 12.,  9.,
       10., 17., 13.,  7., 18.])

In [13]:
df['Product_Category_3'].unique()

array([nan, 14., 17.,  5.,  4., 16., 15.,  8.,  9., 13.,  6., 12.,  3.,
       18., 11., 10.])

In [14]:
df['Purchase'].unique()

array([ 8370, 15200,  1422, ...,   135,   123,   613], dtype=int64)

# Cleaning Data:

## Fix Product ID

In [15]:
df['Product_ID']=df['Product_ID'].apply(lambda x :  x[1:])

In [16]:
df['Product_ID']=pd.to_numeric(df['Product_ID'],errors='coerce')

In [17]:
df

Unnamed: 0_level_0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1000001,69042,F,0-17,10,A,2,0,3,,,8370
1000001,248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
1000001,87842,F,0-17,10,A,2,0,12,,,1422
1000001,85442,F,0-17,10,A,2,0,12,14.0,,1057
1000002,285442,M,55+,16,C,4+,0,8,,,7969
...,...,...,...,...,...,...,...,...,...,...,...
1006033,372445,M,51-55,13,B,1,1,20,,,368
1006035,375436,F,26-35,1,C,3,0,20,,,371
1006036,375436,F,26-35,15,B,4+,1,20,,,137
1006038,375436,F,55+,1,C,2,0,20,,,365


## Fix stay in current city years 

In [18]:
df['Stay_In_Current_City_Years'].replace('4+',4,inplace=True)

In [19]:
df['Stay_In_Current_City_Years'].value_counts()

1    193821
2    101838
3     95285
4     84726
0     74398
Name: Stay_In_Current_City_Years, dtype: int64

## Fix Age

In [20]:
df['Age'].unique()

array(['0-17', '55+', '26-35', '46-50', '51-55', '36-45', '18-25'],
      dtype=object)

In [21]:
def age_converter(age):
    if age == '55+':
        return 60
    else:
        return eval(age.replace('-','+'))/2

In [22]:
df['Age']=df['Age'].apply(age_converter)

In [23]:
df['Age'].value_counts()

30.5    219587
40.5    110013
21.5     99660
48.0     45701
53.0     38501
60.0     21504
8.5      15102
Name: Age, dtype: int64

In [24]:
df['Stay_In_Current_City_Years']=pd.to_numeric(df['Stay_In_Current_City_Years'],errors='coerce')

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 550068 entries, 1000001 to 1006039
Data columns (total 11 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Product_ID                  550068 non-null  int64  
 1   Gender                      550068 non-null  object 
 2   Age                         550068 non-null  float64
 3   Occupation                  550068 non-null  int64  
 4   City_Category               550068 non-null  object 
 5   Stay_In_Current_City_Years  550068 non-null  int64  
 6   Marital_Status              550068 non-null  int64  
 7   Product_Category_1          550068 non-null  int64  
 8   Product_Category_2          376430 non-null  float64
 9   Product_Category_3          166821 non-null  float64
 10  Purchase                    550068 non-null  int64  
dtypes: float64(3), int64(6), object(2)
memory usage: 50.4+ MB


In [26]:
df.sample(18)

Unnamed: 0_level_0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1000131,299242,M,21.5,4,C,4,0,8,,,7908
1000934,26842,F,40.5,7,A,0,0,5,,,1749
1000629,316842,F,8.5,10,C,2,0,4,5.0,,2174
1004422,257342,M,30.5,5,A,1,1,8,,,5839
1001825,213742,M,30.5,7,C,4,1,8,,,9759
1001650,64242,M,30.5,19,C,1,0,5,,,5397
1004740,85842,F,30.5,3,C,3,0,12,,,1380
1004155,127742,M,53.0,17,C,1,0,1,2.0,15.0,15784
1001733,171342,M,21.5,14,B,0,1,13,16.0,,564
1001354,317142,F,30.5,2,B,3,0,2,15.0,,6776


# Handling missing Values

In [27]:
df[df['Product_ID']=='00050242']

Unnamed: 0_level_0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1


In [28]:
df[df['Product_Category_2'].isna()]['Product_ID']

User_ID
1000001     69042
1000001     87842
1000002    285442
1000005    274942
1000005     14542
            ...  
1006033    372445
1006035    375436
1006036    375436
1006038    375436
1006039    371644
Name: Product_ID, Length: 173638, dtype: int64

## Handle Product 2,3

In [29]:
pip install knnimpute

Note: you may need to restart the kernel to use updated packages.


In [30]:
IDS=1
#for ID in df['Product_ID'].unique():
    #df['Product_ID']=df['Product_ID'].replace(ID,IDS)
    #IDS += 1

In [31]:
df['Product_ID'].value_counts()

265242    1880
25442     1615
110742    1612
112142    1562
57642     1470
          ... 
314842       1
298842       1
231642       1
204442       1
66342        1
Name: Product_ID, Length: 3631, dtype: int64

In [32]:
#from sklearn.impute import KNNImputer
#imputer=KNNImputer(n_neighbors=1)

In [33]:
#nan_pro=['Product_Category_2', 'Product_Category_3']
#for col in nan_pro:
    #imputer.fit(df[['Product_ID',col]])
    #df[['Product_ID',col]]=imputer.fit_transform(df[['Product_ID',col]])

In [34]:
df['Product_Category_2'].fillna(0,inplace=True)
df['Product_Category_3'].fillna(0,inplace=True)

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 550068 entries, 1000001 to 1006039
Data columns (total 11 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Product_ID                  550068 non-null  int64  
 1   Gender                      550068 non-null  object 
 2   Age                         550068 non-null  float64
 3   Occupation                  550068 non-null  int64  
 4   City_Category               550068 non-null  object 
 5   Stay_In_Current_City_Years  550068 non-null  int64  
 6   Marital_Status              550068 non-null  int64  
 7   Product_Category_1          550068 non-null  int64  
 8   Product_Category_2          550068 non-null  float64
 9   Product_Category_3          550068 non-null  float64
 10  Purchase                    550068 non-null  int64  
dtypes: float64(3), int64(6), object(2)
memory usage: 50.4+ MB


In [36]:
df.sample(15)

Unnamed: 0_level_0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1000166,72942,M,21.5,4,B,1,1,8,0.0,0.0,10017
1001059,34342,F,30.5,0,B,3,0,5,0.0,0.0,5405
1002211,57442,M,48.0,6,C,2,1,5,14.0,16.0,7126
1001503,210042,M,30.5,12,A,2,0,8,14.0,0.0,6150
1001941,254842,M,40.5,17,A,1,0,5,13.0,0.0,5373
1000344,277642,M,40.5,14,C,1,1,2,3.0,10.0,6615
1003387,270242,M,21.5,1,B,3,1,8,0.0,0.0,9867
1004887,52842,M,21.5,2,B,1,1,10,15.0,0.0,14316
1005371,289042,M,30.5,11,A,0,0,8,0.0,0.0,7822
1004085,214442,F,30.5,6,A,1,0,8,0.0,0.0,9905


# Feature Engineering

## Handle City Category

In [37]:
C_C={'C':1,'B':2,'A':1}
df['City_Category']=df['City_Category'].map(C_C)

## Handle Gender

In [38]:
Gen={'M':1,'F':2}
df['Gender']=df['Gender'].map(Gen)

In [39]:
df.sample(20)

Unnamed: 0_level_0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1002453,144642,1,30.5,7,2,0,0,1,15.0,16.0,11922
1001457,46242,1,30.5,1,2,1,1,8,0.0,0.0,9998
1000157,71442,1,40.5,16,1,1,0,15,17.0,0.0,21186
1004016,127242,1,53.0,0,2,1,0,1,16.0,0.0,11688
1000149,32542,1,30.5,1,2,2,1,8,0.0,0.0,3885
1002038,109542,2,48.0,1,2,3,1,8,14.0,17.0,7943
1002380,135742,1,30.5,4,1,0,1,6,8.0,0.0,12134
1001699,105642,2,30.5,19,1,0,0,8,0.0,0.0,8124
1002127,370853,1,21.5,4,1,0,1,19,0.0,0.0,37
1004482,128242,2,40.5,1,1,1,0,5,12.0,14.0,6904


In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 550068 entries, 1000001 to 1006039
Data columns (total 11 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Product_ID                  550068 non-null  int64  
 1   Gender                      550068 non-null  int64  
 2   Age                         550068 non-null  float64
 3   Occupation                  550068 non-null  int64  
 4   City_Category               550068 non-null  int64  
 5   Stay_In_Current_City_Years  550068 non-null  int64  
 6   Marital_Status              550068 non-null  int64  
 7   Product_Category_1          550068 non-null  int64  
 8   Product_Category_2          550068 non-null  float64
 9   Product_Category_3          550068 non-null  float64
 10  Purchase                    550068 non-null  int64  
dtypes: float64(3), int64(8)
memory usage: 50.4 MB


In [42]:
## split it into x--> data,  y--> target
X = df.drop('Purchase', axis =1)
y = df['Purchase']

In [44]:
## Models 
models = [
    ('Linear Regression', LinearRegression()), 
    ('DTR', DecisionTreeRegressor()), 
    ('RFR', RandomForestRegressor()), 
    ('ADA', AdaBoostRegressor()), 
    ('GBR', GradientBoostingRegressor()), 
    ('XGBR', XGBRegressor())
]

In [45]:
# Define the numerical column
numerical_cols = X.select_dtypes(include='number').columns


# Initialize RFE feature selector
RFE_selector = RFE(XGBRegressor(), n_features_to_select=6) 

# Create the ColumnTransformer
preprocessor = ColumnTransformer(transformers=[('num', RobustScaler(), numerical_cols), 
                                              ],remainder='passthrough')


for model_name, model in models:
    # Print model name
    print(f"For {model_name}:")

    # Steps Creation
    steps = list()
    steps.append(('preprocessor', preprocessor))

    # Choose the feature selector based on your preference
    steps.append(('feature_selector', RFE_selector))  # RFE feature selection
    
    steps.append((model_name, model))

    # Create the pipeline
    pipeline = Pipeline(steps=steps)

    ## Scoring
    # Perform cross-validation
    # Example evaluation metrics
    cv_results = cross_validate(pipeline, X, y, scoring='r2', cv=5, return_train_score=True)  # X and y are your input data and targets



    # Access the results
    print("test :", cv_results['test_score'].mean())
    print("train:", cv_results['train_score'].mean())
    print('*' * 50)

For Linear Regression:
test : 0.1497402087021043
train: 0.15247339230024112
**************************************************
For DTR:
test : 0.6829282204185866
train: 0.7585531363204305
**************************************************
For RFR:
test : 0.6974521515877096
train: 0.7562397617139697
**************************************************
For ADA:
test : 0.4294075160363732
train: 0.4432304448949843
**************************************************
For GBR:
test : 0.6585020529312718
train: 0.6617020377036302
**************************************************
For XGBR:
test : 0.7099445614802885
train: 0.7178330924357222
**************************************************


In [48]:
# define the parameter grid
param_grid = {
    'XGBR__max_depth': [2, 4, 6],
    'XGBR__learning_rate': [0.1, 0.01],
    'XGBR__n_estimators': [50, 100, 200]
}

In [49]:
steps=[]
steps.append(('preprocessor', preprocessor))
steps.append(('feature_selector', RFE_selector))
steps.append(('XGBR',XGBRegressor()))
pipeline=Pipeline(steps=steps)

In [51]:
# Create GridSearchCV instance
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='r2', n_jobs=-1, return_train_score=True)

# Fit the pipeline with GridSearch to the data
grid_search.fit(X, y)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

Best Parameters: {'XGBR__learning_rate': 0.1, 'XGBR__max_depth': 6, 'XGBR__n_estimators': 200}
Best Score: 0.7045915671415646


In [52]:
# Get the mean test score and mean train score for the best estimator
mean_test_score = grid_search.cv_results_['mean_test_score'][grid_search.best_index_]
mean_train_score = grid_search.cv_results_['mean_train_score'][grid_search.best_index_]

print("Mean Test Score:", mean_test_score)
print("Mean Train Score:", mean_train_score)

Mean Test Score: 0.7045915671415646
Mean Train Score: 0.7110016501890843


In [53]:
final_model=grid_search.best_estimator_
final_model