In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBRegressor
from sklearn import metrics
from sklearn.metrics import mean_squared_error

In [2]:
sales_data = pd.read_csv('Train.csv')

In [3]:
sales_data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [4]:
sales_data.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [5]:
sales_data.shape

(8523, 12)

In [6]:
sales_data['Outlet_Size'].value_counts()

Medium    2793
Small     2388
High       932
Name: Outlet_Size, dtype: int64

In [7]:
sales_data['Item_Fat_Content'].value_counts()

Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: Item_Fat_Content, dtype: int64

##### Imputating Item_Weight column to replace missing null values with mean

In [8]:
#Item weight has a lot of null values
#imputing the average item weights of the entire column and add that into the item weight column for missing records
sales_data['Item_Weight'] = sales_data['Item_Weight'].fillna(sales_data['Item_Weight'].mean()) 

In [9]:
sales_data.isnull().sum()

Item_Identifier                 0
Item_Weight                     0
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

##### Imputating Outlet_Size column to replace missing null values with mode

In [10]:
mode_of_outlet_size = sales_data.pivot_table(values='Outlet_Size', columns = 'Outlet_Type', aggfunc=(lambda x: x.mode()[0]))

In [11]:
mode_of_outlet_size #what this is showing is if outlet type is grocery store, most of the outlet sizes are small

Outlet_Type,Grocery Store,Supermarket Type1,Supermarket Type2,Supermarket Type3
Outlet_Size,Small,Small,Medium,Medium


In [12]:
missing_values = sales_data['Outlet_Size'].isnull()

In [13]:
missing_values

0       False
1       False
2       False
3        True
4       False
        ...  
8518    False
8519     True
8520    False
8521    False
8522    False
Name: Outlet_Size, Length: 8523, dtype: bool

In [14]:
sales_data.loc[missing_values, 'Outlet_Size'] = sales_data.loc[missing_values, 'Outlet_Type'].apply(lambda x: mode_of_outlet_size)

In [15]:
sales_data.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

In [16]:
sales_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                8523 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                8523 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [17]:
sales_data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Outlet_Type Grocery Store Supermarket Type1 Su...,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


##### Replacing Categorial data with numbers

In [18]:
sales_data.replace({'Item_Fat_Content':{'low fat':'Low Fat','LF':'Low Fat','reg':'Regular'}}, inplace = True)

In [19]:
sales_data['Item_Identifier'].dtype

dtype('O')

In [20]:
sales_data['Item_Identifier'] = sales_data['Item_Identifier'].astype(str)
sales_data['Item_Fat_Content'] = sales_data['Item_Fat_Content'].astype(str)
sales_data['Item_Type'] = sales_data['Item_Type'].astype(str)
sales_data['Outlet_Identifier'] = sales_data['Outlet_Identifier'].astype(str)
sales_data['Outlet_Size'] = sales_data['Outlet_Size'].astype(str)
sales_data['Outlet_Location_Type'] = sales_data['Outlet_Location_Type'].astype(str)
sales_data['Outlet_Type'] = sales_data['Outlet_Type'].astype(str)

In [21]:
encoder = LabelEncoder()

In [22]:
sales_data['Item_Fat_Content'].value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

In [23]:
sales_data['Item_Identifier'] = encoder.fit_transform(sales_data['Item_Identifier'])
sales_data['Item_Fat_Content'] = encoder.fit_transform(sales_data['Item_Fat_Content'])
sales_data['Item_Type'] = encoder.fit_transform(sales_data['Item_Type'])
sales_data['Outlet_Identifier'] = encoder.fit_transform(sales_data['Outlet_Identifier'])
sales_data['Outlet_Size'] = encoder.fit_transform(sales_data['Outlet_Size'])
sales_data['Outlet_Location_Type'] = encoder.fit_transform(sales_data['Outlet_Location_Type'])
sales_data['Outlet_Type'] = encoder.fit_transform(sales_data['Outlet_Type'])

In [24]:
sales_data['Item_Identifier'].value_counts()

1077    10
413     10
1542     9
301      9
35       9
        ..
1200     1
819      1
713      1
46       1
251      1
Name: Item_Identifier, Length: 1559, dtype: int64

In [25]:
sales_data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,156,9.3,0,0.016047,4,249.8092,9,1999,1,0,1,3735.138
1,8,5.92,1,0.019278,14,48.2692,3,2009,1,2,2,443.4228
2,662,17.5,0,0.01676,10,141.618,9,1999,1,0,1,2097.27
3,1121,19.2,1,0.0,6,182.095,0,1998,2,2,0,732.38
4,1297,8.93,0,0.0,9,53.8614,1,1987,0,2,1,994.7052


In [26]:
sales_data.shape

(8523, 12)

##### Splitting features and Target

In [27]:
X = sales_data.drop(columns = 'Item_Outlet_Sales', axis = 1)
Y = sales_data['Item_Outlet_Sales']
print(X,Y)

      Item_Identifier  Item_Weight  Item_Fat_Content  Item_Visibility  \
0                 156        9.300                 0         0.016047   
1                   8        5.920                 1         0.019278   
2                 662       17.500                 0         0.016760   
3                1121       19.200                 1         0.000000   
4                1297        8.930                 0         0.000000   
...               ...          ...               ...              ...   
8518              370        6.865                 0         0.056783   
8519              897        8.380                 1         0.046982   
8520             1357       10.600                 0         0.035186   
8521              681        7.210                 1         0.145221   
8522               50       14.800                 0         0.044878   

      Item_Type  Item_MRP  Outlet_Identifier  Outlet_Establishment_Year  \
0             4  249.8092                  9    

##### Standardizing data

In [28]:
scaler = StandardScaler()  
scaler.fit(X)
standardized_data = scaler.transform(X)

In [29]:
X = standardized_data
Y = sales_data['Item_Outlet_Sales']

In [30]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2)

###### RandomSearchCV and GridSearchCV

In [31]:
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Define parameter grids for randomized search (coarse search)
linear_param_grid_coarse = {
    'fit_intercept': [True, False],
    'normalize': [True, False]
}

xgboost_param_grid_coarse = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.5, 0.8, 1.0],
    'colsample_bytree': [0.5, 0.8, 1.0]
}

svm_param_grid_coarse = {
    'C': [0.1, 1, 10],
    'epsilon': [0.01, 0.1, 1],
    'kernel': ['linear', 'rbf']
}

# RandomizedSearchCV for each model (coarse search)
linear_random_search_coarse = RandomizedSearchCV(LinearRegression(), linear_param_grid_coarse, n_iter=10, cv=5, n_jobs=-1)
xgboost_random_search_coarse = RandomizedSearchCV(XGBRegressor(), xgboost_param_grid_coarse, n_iter=30, cv=5, n_jobs=-1)
svm_random_search_coarse = RandomizedSearchCV(SVR(), svm_param_grid_coarse, n_iter=30, cv=5, n_jobs=-1)

# Fit models using RandomizedSearchCV (coarse search)
linear_random_search_coarse.fit(X_train, Y_train)
xgboost_random_search_coarse.fit(X_train, Y_train)
svm_random_search_coarse.fit(X_train, Y_train)

# Get best hyperparameters from RandomizedSearchCV (coarse search)
best_linear_params_coarse = linear_random_search_coarse.best_params_
best_xgboost_params_coarse = xgboost_random_search_coarse.best_params_
best_svm_params_coarse = svm_random_search_coarse.best_params_

# Define parameter grids for GridSearchCV (fine search)
linear_param_grid_fine = {
    'fit_intercept': [best_linear_params_coarse['fit_intercept']],
    'normalize': [best_linear_params_coarse['normalize']]
}

xgboost_param_grid_fine = {
    'learning_rate': [best_xgboost_params_coarse['learning_rate'] * i for i in [0.5, 1, 2]],
    'n_estimators': [best_xgboost_params_coarse['n_estimators']],
    'max_depth': [best_xgboost_params_coarse['max_depth']],
    'min_child_weight': [best_xgboost_params_coarse['min_child_weight']],
    'subsample': [best_xgboost_params_coarse['subsample']],
    'colsample_bytree': [best_xgboost_params_coarse['colsample_bytree']]
}

svm_param_grid_fine = {
    'C': [best_svm_params_coarse['C'] * i for i in [0.1, 1, 10]],
    'epsilon': [best_svm_params_coarse['epsilon'] * i for i in [0.1, 1, 10]],
    'kernel': [best_svm_params_coarse['kernel']]
}

# GridSearchCV for each model (fine search)
linear_grid_search_fine = GridSearchCV(LinearRegression(), param_grid=linear_param_grid_fine, cv=5, n_jobs=-1)
xgboost_grid_search_fine = GridSearchCV(XGBRegressor(), param_grid=xgboost_param_grid_fine, cv=5, n_jobs=-1)
svm_grid_search_fine = GridSearchCV(SVR(), param_grid=svm_param_grid_fine, cv=5, n_jobs=-1)

# Fit models using GridSearchCV (fine search)
linear_grid_search_fine.fit(X_train, Y_train)
xgboost_grid_search_fine.fit(X_train, Y_train)
svm_grid_search_fine.fit(X_train, Y_train)

# Print best hyperparameters from GridSearchCV (fine search)
print("Linear Regression Best Parameters (Fine Search):", linear_grid_search_fine.best_params_)
print("XGBoost Regressor Best Parameters (Fine Search):", xgboost_grid_search_fine.best_params_)
print("SVR Best Parameters (Fine Search):", svm_grid_search_fine.best_params_)

# Compare cross-validated scores of each model
linear_cv_score_fine = linear_grid_search_fine.best_score_
xgboost_cv_score_fine = xgboost_grid_search_fine.best_score_
svm_cv_score_fine = svm_grid_search_fine.best_score_

# Select the best model based on cross-validated scores
best_model_fine = None
if linear_cv_score_fine >= xgboost_cv_score_fine and linear_cv_score_fine >= svm_cv_score_fine:
    best_model_fine = linear_grid_search_fine.best_estimator_
elif xgboost_cv_score_fine >= linear_cv_score_fine and xgboost_cv_score_fine >= svm_cv_score_fine:
    best_model_fine = xgboost_grid_search_fine.best_estimator_
else:
    best_model_fine = svm_grid_search_fine.best_estimator_

# Evaluate the best model on the test set
train_accuracy_fine = best_model_fine.score(X_train, Y_train)
print("Best Model Train Accuracy (Fine Search):", train_accuracy_fine)
test_accuracy_fine = best_model_fine.score(X_test, Y_test)
print("Best Model Test Accuracy (Fine Search):", test_accuracy_fine)



If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)




Linear Regression Best Parameters (Fine Search): {'fit_intercept': True, 'normalize': True}
XGBoost Regressor Best Parameters (Fine Search): {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 3, 'n_estimators': 50, 'subsample': 0.8}
SVR Best Parameters (Fine Search): {'C': 100, 'epsilon': 0.001, 'kernel': 'linear'}
Best Model Train Accuracy (Fine Search): 0.6131106859018131
Best Model Test Accuracy (Fine Search): 0.6200779623084078


In [32]:
test_pred = best_model_fine.predict(X_test)
r_square_test = metrics.r2_score(Y_test, test_pred)
mse_test = mean_squared_error(Y_test, test_pred)
print("R_Squared is:", r_square_test)
print("Mean Squared Error:", mse_test)

R_Squared is: 0.6200779623084078
Mean Squared Error: 1128098.7884280866


##### Best model training using GridSearchCV only on XGBRegressor

In [29]:
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X,Y,test_size = .1)

In [30]:
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0.1, 0.2, 0.3],
    'n_estimators': [100, 200, 300]
}

In [31]:
xgb_model = XGBRegressor()
grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_Train, Y_Train)

GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    callbacks=None, colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, device=None,
                                    early_stopping_rounds=None,
                                    enable_categorical=False, eval_metric=None,
                                    feature_types=None, gamma=None,
                                    grow_policy=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, m...
                                    monotone_constraints=None,
                                    multi_strategy=None, n_estimators=None,
                                    n_jobs=None, num_parallel_tree=None,
                                    random_state=None, ...),
             n_jobs

In [32]:
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'colsample_bytree': 1.0, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 1.0}


In [36]:
best_model = grid_search.best_estimator_

In [39]:
train_pred = best_model.predict(X_Train)
r_square_train = metrics.r2_score(Y_Train, train_pred)
mse_train = mean_squared_error(Y_Train, train_pred)
print("R_Squared is:", r_square_train)
print("Mean Squared Error:", mse_train)

R_Squared is: 0.6247059761109411
Mean Squared Error: 1095453.648730159


In [34]:
y_pred = best_model.predict(X_Test)

In [35]:
r_square = metrics.r2_score(Y_Test, y_pred)
mse = mean_squared_error(Y_Test, y_pred)
print("R_Squared is:", r_square)
print("Mean Squared Error:", mse)

R_Squared is: 0.6335460148786344
Mean Squared Error: 1042530.1649205281
