In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,KFold
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,ExtraTreesRegressor
from catboost import CatBoostRegressor
import xgboost as xgb
import pickle


In [3]:
df = pd.read_csv('walmart_without_outliers.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 539096 entries, 0 to 539095
Data columns (total 20 columns):
 #   Column                         Non-Null Count   Dtype
---  ------                         --------------   -----
 0   Gender                         539096 non-null  int64
 1   Occupation                     539096 non-null  int64
 2   Marital_Status                 539096 non-null  int64
 3   Product_Category               539096 non-null  int64
 4   Purchase                       539096 non-null  int64
 5   Age_0-17                       539096 non-null  bool 
 6   Age_18-25                      539096 non-null  bool 
 7   Age_26-35                      539096 non-null  bool 
 8   Age_36-45                      539096 non-null  bool 
 9   Age_46-50                      539096 non-null  bool 
 10  Age_51-55                      539096 non-null  bool 
 11  Age_55+                        539096 non-null  bool 
 12  City_Category_A                539096 non-null  bool 
 13 

In [4]:
X = df.drop(['Purchase'], axis=1)
Y = df['Purchase']

In [4]:
#linear regression:
scalar = StandardScaler()
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train_scaled = scalar.fit_transform(X_train)
X_test_scaled = scalar.transform(X_test)

In [15]:
model = LinearRegression()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
r2 = r2_score(y_test, y_pred)
print(f"R2 Score: {r2}")

R2 Score: 0.14629639214130863


In [19]:
tree_model = DecisionTreeRegressor(random_state=42)
tree_model.fit(X_train, y_train)
y_pred = tree_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f"R2 Score: {r2}")

R2 Score: 0.6325737107071696


In [20]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R² Score: {r2}')


MSE: 8542203.532658158
RMSE: 2922.7048316000296
MAE: 2158.403213797119
R² Score: 0.6325737107071696


In [22]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R² Score: {r2}')



MSE: 8395046.21726044
RMSE: 2897.4206144880727
MAE: 2148.98326354973
R² Score: 0.6389033967339848


In [23]:
rf = RandomForestRegressor(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R² Score: {r2}')



MSE: 8392462.65109153
RMSE: 2896.9747411897692
MAE: 2148.713407266875
R² Score: 0.639014523813427


In [24]:
result = rf.score(X_train,y_train)
print(result)

0.6727717678747155


In [25]:
result = rf.score(X_test,y_test)
print(result)

0.639014523813427


In [43]:
#setting hyperparameter:
depth = 10
number_of_trees = 1000
treeStructure = DecisionTreeRegressor(max_depth=depth)
adaRegressor = AdaBoostRegressor(estimator=treeStructure, n_estimators=number_of_trees, random_state=42)
adaRegressor.fit(X_train, y_train)


In [45]:
y_pred = adaRegressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R² Score: {r2}')


MSE: 9396437.692774916
RMSE: 3065.3609400484825
MAE: 2350.962578873093
R² Score: 0.595830487903017


In [50]:
xgbModel = xgb.XGBRegressor(n_estimators=1000, max_depth=10, learning_rate=0.001, random_state=42)
xgbModel.fit(X_train,y_train)
y_pred = xgbModel.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R² Score: {r2}')


MSE: 10658286.141915003
RMSE: 3264.7030710180984
MAE: 2490.585360565497
R² Score: 0.541554528363432


In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 539096 entries, 0 to 539095
Data columns (total 20 columns):
 #   Column                         Non-Null Count   Dtype
---  ------                         --------------   -----
 0   Gender                         539096 non-null  int64
 1   Occupation                     539096 non-null  int64
 2   Marital_Status                 539096 non-null  int64
 3   Product_Category               539096 non-null  int64
 4   Purchase                       539096 non-null  int64
 5   Age_0-17                       539096 non-null  bool 
 6   Age_18-25                      539096 non-null  bool 
 7   Age_26-35                      539096 non-null  bool 
 8   Age_36-45                      539096 non-null  bool 
 9   Age_46-50                      539096 non-null  bool 
 10  Age_51-55                      539096 non-null  bool 
 11  Age_55+                        539096 non-null  bool 
 12  City_Category_A                539096 non-null  bool 
 13 

In [16]:
#checking whether it will work or not
model = CatBoostRegressor(iterations=1000, learning_rate=0.1, depth=10, random_seed=42, verbose=100)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R² Score: {r2}')


0:	learn: 4535.3809488	total: 259ms	remaining: 4m 18s
100:	learn: 2917.6666884	total: 5.37s	remaining: 47.8s
200:	learn: 2878.6698153	total: 10.7s	remaining: 42.5s
300:	learn: 2856.3347066	total: 17.4s	remaining: 40.3s
400:	learn: 2842.1625652	total: 26.1s	remaining: 38.9s
500:	learn: 2832.1803022	total: 35s	remaining: 34.9s
600:	learn: 2824.5081165	total: 43.8s	remaining: 29.1s
700:	learn: 2818.2819183	total: 53s	remaining: 22.6s
800:	learn: 2812.8832663	total: 1m 2s	remaining: 15.4s
900:	learn: 2808.8750099	total: 1m 20s	remaining: 8.81s
999:	learn: 2805.3439639	total: 1m 28s	remaining: 0us
MSE: 8231036.166154045
RMSE: 2868.978244280365
MAE: 2143.4507211361492
R² Score: 0.6459579704460676


In [29]:
weight = [0,0,0,0,0,0]

In [30]:
tree_model = DecisionTreeRegressor(random_state=42)
tree_model.fit(X_train, y_train)
weight[0] = tree_model.score(X_train, y_train)
print("Done")
depth = 10
number_of_trees = 200
treeStructure = DecisionTreeRegressor(max_depth=depth)
adaRegressor = AdaBoostRegressor(estimator=treeStructure, n_estimators=number_of_trees, random_state=42)
adaRegressor.fit(X_train, y_train)
weight[1] = adaRegressor.score(X_train,y_train)
print("Done")
xgbModel = xgb.XGBRegressor(n_estimators=number_of_trees, max_depth=depth, learning_rate=0.01, random_state=42)
xgbModel.fit(X_train,y_train)
weight[2] = xgbModel.score(X_train,y_train)
print("Done")
randomForest = RandomForestRegressor(n_estimators=number_of_trees, random_state=42)
randomForest.fit(X_train, y_train)
weight[3] = randomForest.score(X_train,y_train)
print("Done")
extraTreesModel = ExtraTreesRegressor(n_estimators=number_of_trees, random_state=42)
extraTreesModel.fit(X_train, y_train)
weight[4] = extraTreesModel.score(X_train,y_train)
print("Done")
catboost_model = CatBoostRegressor(iterations=number_of_trees, learning_rate=0.1, depth=depth, random_seed=42)
catboost_model.fit(X_train, y_train)
weight[5] = catboost_model.score(X_train, y_train)
print("Done")


Done
Done
Done
Done
Done
0:	learn: 4535.3809488	total: 61.2ms	remaining: 12.2s
1:	learn: 4290.3894697	total: 98.7ms	remaining: 9.77s
2:	learn: 4080.9305429	total: 132ms	remaining: 8.7s
3:	learn: 3905.2962907	total: 165ms	remaining: 8.06s
4:	learn: 3755.8135102	total: 199ms	remaining: 7.78s
5:	learn: 3626.6740985	total: 243ms	remaining: 7.85s
6:	learn: 3518.5942366	total: 297ms	remaining: 8.18s
7:	learn: 3428.9399679	total: 368ms	remaining: 8.82s
8:	learn: 3355.2324936	total: 420ms	remaining: 8.92s
9:	learn: 3292.1965001	total: 474ms	remaining: 9s
10:	learn: 3239.9548100	total: 523ms	remaining: 8.98s
11:	learn: 3195.4591909	total: 572ms	remaining: 8.97s
12:	learn: 3158.3488702	total: 619ms	remaining: 8.9s
13:	learn: 3128.6296493	total: 665ms	remaining: 8.84s
14:	learn: 3104.3646095	total: 712ms	remaining: 8.78s
15:	learn: 3084.6913488	total: 767ms	remaining: 8.82s
16:	learn: 3067.3608664	total: 815ms	remaining: 8.78s
17:	learn: 3053.7088837	total: 864ms	remaining: 8.74s
18:	learn: 3044.

In [31]:
weight = [0,0,0,0,0,0]
weight[0] = tree_model.score(X_train, y_train)
weight[1] = adaRegressor.score(X_train,y_train)
weight[2] = xgbModel.score(X_train,y_train)
weight[3] = randomForest.score(X_train,y_train)
weight[4] = extraTreesModel.score(X_train,y_train)
weight[5] = catboost_model.score(X_train, y_train)


In [32]:
print(weight)

[0.6738113454347346, 0.5991527701497026, 0.6220598438804594, 0.6727717678747155, 0.6738113453990435, 0.6433270969217255]


In [33]:
total = np.sum(weight)
normalised_weight = weight
for i in range(len(weight)):
    normalised_weight[i] = weight[i] /total 
    
print(np.sum(normalised_weight))

1.0000000000000002


Saving all the files so no need to rerun

In [34]:
with open('decision_tree_model.pkl', 'wb') as f:
    pickle.dump(tree_model, f)

with open('adaboost_model.pkl', 'wb') as f:
    pickle.dump(adaRegressor, f)

with open('xgboost_model.pkl', 'wb') as f:
    pickle.dump(xgbModel, f)

with open('random_forest_model.pkl', 'wb') as f:
    pickle.dump(randomForest, f)

with open('normalized_weights.pkl', 'wb') as f:
    pickle.dump(weight, f)

with open('extraTreeRegressor.pkl', 'wb') as f:
    pickle.dump(extraTreesModel, f)

with open('CatBoostRegressor.pkl', 'wb') as f:
    pickle.dump(catboost_model, f)
      
    
print("Models saved successfully using pickle!")

Models saved successfully using pickle!


In [35]:
with open('decision_tree_model.pkl', 'rb') as f:
    tree_model_loaded = pickle.load(f)

with open('adaboost_model.pkl', 'rb') as f:
    adaRegressor_loaded = pickle.load(f)

with open('xgboost_model.pkl', 'rb') as f:
    xgbModel_loaded = pickle.load(f)

with open('random_forest_model.pkl', 'rb') as f:
    randomForest_loaded = pickle.load(f)

with open('normalized_weights.pkl', 'rb') as f:
    weights_loaded = pickle.load(f)

with open('extraTreeRegressor.pkl', 'rb') as f:
    extraTrees_loaded = pickle.load(f)
    
with open('CatBoostRegressor.pkl', 'rb') as f:
    catBoost_loaded = pickle.load(f)
    
print("Models loaded successfully using pickle!")


Models loaded successfully using pickle!


Evaluating the model

In [38]:
def predict(X_test):
    y_pred_0 = tree_model_loaded.predict(X_test) * weights_loaded[0]
    y_pred_1 = adaRegressor_loaded.predict(X_test) * weights_loaded[1]
    y_pred_2 = xgbModel_loaded.predict(X_test) * weights_loaded[2]
    y_pred_3 = randomForest_loaded.predict(X_test) * weights_loaded[3]
    y_pred_4 = extraTrees_loaded.predict(X_test) * weights_loaded[4]
    y_pred_5 = catBoost_loaded.predict(X_test) * weights_loaded[5]
    y_pred = y_pred_0+ y_pred_1 + y_pred_2 + y_pred_3 + y_pred_4 + y_pred_5
    return y_pred


In [39]:
y_pred = predict(X_test= X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R² Score: {r2}')


MSE: 8314520.158440423
RMSE: 2883.4909672895496
MAE: 2171.2751017270566
R² Score: 0.6423670687093128


Applying K-Fold


In [7]:
k = 5

kf = KFold(n_splits=k, shuffle=True, random_state=42)

mse_list, rmse_list, mae_list, r2_list = [], [], [], []
i = 1
for train_index, test_index in kf.split(X):
    
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]


    tree_model = DecisionTreeRegressor(random_state=42)
    tree_model.fit(X_train, y_train)
    adaRegressor = AdaBoostRegressor(estimator=tree_model, n_estimators=200, random_state=42)
    adaRegressor.fit(X_train, y_train)
    xgbModel = xgb.XGBRegressor(n_estimators=200, max_depth=10, learning_rate=0.01, random_state=42)
    xgbModel.fit(X_train, y_train)
    randomForest = RandomForestRegressor(n_estimators=200, random_state=42)
    randomForest.fit(X_train, y_train)
    extraTreesModel = ExtraTreesRegressor(n_estimators=200, random_state=42)
    extraTreesModel.fit(X_train, y_train)
    catboost_model = CatBoostRegressor(iterations=200, learning_rate=0.1, depth=10, random_seed=42, verbose=0)
    catboost_model.fit(X_train, y_train)
    print(f'Model Training _ done .. {i}' )
    weight = [
        tree_model.score(X_train, y_train),
        adaRegressor.score(X_train, y_train),
        xgbModel.score(X_train, y_train),
        randomForest.score(X_train, y_train),
        extraTreesModel.score(X_train, y_train),
        catboost_model.score(X_train, y_train)
    ]
    
    weight_sum = sum(weight)
    normalized_weights = [w / weight_sum for w in weight]

    y_pred = (
        tree_model.predict(X_test) * normalized_weights[0] +
        adaRegressor.predict(X_test) * normalized_weights[1] +
        xgbModel.predict(X_test) * normalized_weights[2] +
        randomForest.predict(X_test) * normalized_weights[3] +
        extraTreesModel.predict(X_test) * normalized_weights[4] +
        catboost_model.predict(X_test) * normalized_weights[5]
    )
    
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    mse_list.append(mse)
    rmse_list.append(rmse)
    mae_list.append(mae)
    r2_list.append(r2)
    i+=1

avg_mse = np.mean(mse_list)
avg_rmse = np.mean(rmse_list)
avg_mae = np.mean(mae_list)
avg_r2 = np.mean(r2_list)

print(f'Average MSE across {k} folds: {avg_mse}')
print(f'Average RMSE across {k} folds: {avg_rmse}')
print(f'Average MAE across {k} folds: {avg_mae}')
print(f'Average R² Score across {k} folds: {avg_r2}')

Model Training _ done .. 1
Model Training _ done .. 2
Model Training _ done .. 3
Model Training _ done .. 4
Model Training _ done .. 5
Average MSE across 5 folds: 8370061.140811068
Average RMSE across 5 folds: 2893.098770172287
Average MAE across 5 folds: 2178.8290147352245
Average R² Score across 5 folds: 0.6398542842263811
