### Importing the libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestRegressor
import datetime as dt
from lightgbm import LGBMRegressor
import joblib

import warnings
warnings.filterwarnings('ignore')

### Load and preprocess the data

In [2]:
data=pd.read_csv('BigMart Sales Data.csv')

In [3]:
# gives the first 5 rows of the dataset
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [4]:
data.shape

(8523, 12)

In [5]:
# gives statistical information about numerical data
data.describe()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,7060.0,8523.0,8523.0,8523.0,8523.0
mean,12.857645,0.066132,140.992782,1997.831867,2181.288914
std,4.643456,0.051598,62.275067,8.37176,1706.499616
min,4.555,0.0,31.29,1985.0,33.29
25%,8.77375,0.026989,93.8265,1987.0,834.2474
50%,12.6,0.053931,143.0128,1999.0,1794.331
75%,16.85,0.094585,185.6437,2004.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,13086.9648


In [6]:
# Gives information about the entire dataet
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [7]:
# Returns number of missing/ null values in each column
data.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [8]:
data.duplicated().sum()

0

In [9]:
# handle missing value using mean and mode
data['Item_Weight'].mean(), data['Outlet_Size'].mode()

(12.857645184135976,
 0    Medium
 Name: Outlet_Size, dtype: object)

In [10]:
# Replacing missing values with mean in Item_Weight column
data['Item_Weight'].fillna(data['Item_Weight'].mean(), inplace=True)

In [11]:
# Replacing missing values with mode in Outlet_size based on Outlet_Type 
pivot = data.pivot_table( values= 'Outlet_Size', columns= 'Outlet_Type', aggfunc=(lambda x:x.mode()[0]))

In [12]:
pivot

Outlet_Type,Grocery Store,Supermarket Type1,Supermarket Type2,Supermarket Type3
Outlet_Size,Small,Small,Medium,Medium


In [13]:
missing_val = data['Outlet_Size'].isnull()

In [14]:
missing_val

0       False
1       False
2       False
3        True
4       False
        ...  
8518    False
8519     True
8520    False
8521    False
8522    False
Name: Outlet_Size, Length: 8523, dtype: bool

In [15]:
data.loc[missing_val, 'Outlet_Size'] = data.loc[missing_val, 'Outlet_Type'].apply(lambda x:pivot[x])

In [16]:
data.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

In [17]:
categorical_columns=['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
unique_values = {col: data[col].value_counts() for col in categorical_columns}
unique_values

{'Item_Fat_Content': Item_Fat_Content
 Low Fat    5089
 Regular    2889
 LF          316
 reg         117
 low fat     112
 Name: count, dtype: int64,
 'Item_Type': Item_Type
 Fruits and Vegetables    1232
 Snack Foods              1200
 Household                 910
 Frozen Foods              856
 Dairy                     682
 Canned                    649
 Baking Goods              648
 Health and Hygiene        520
 Soft Drinks               445
 Meat                      425
 Breads                    251
 Hard Drinks               214
 Others                    169
 Starchy Foods             148
 Breakfast                 110
 Seafood                    64
 Name: count, dtype: int64,
 'Outlet_Size': Outlet_Size
 Small     4798
 Medium    2793
 High       932
 Name: count, dtype: int64,
 'Outlet_Location_Type': Outlet_Location_Type
 Tier 3    3350
 Tier 2    2785
 Tier 1    2388
 Name: count, dtype: int64,
 'Outlet_Type': Outlet_Type
 Supermarket Type1    5577
 Grocery Store      

In [18]:
data.replace({'Item_Fat_Content': {'low fat':'Low Fat', 'LF':'Low Fat', 'reg':'Regular'}}, inplace=True)

In [19]:
data['Item_Fat_Content'].value_counts()

Item_Fat_Content
Low Fat    5517
Regular    3006
Name: count, dtype: int64

In [20]:
data['Item_Visibility'].value_counts()

Item_Visibility
0.000000    526
0.076975      3
0.162462      2
0.076841      2
0.073562      2
           ... 
0.013957      1
0.110460      1
0.124646      1
0.054142      1
0.044878      1
Name: count, Length: 7880, dtype: int64

In [21]:
# Replace zeros with the mean visibility for each item
data['Item_Visibility'].replace(0, np.nan, inplace=True)
data['Item_Visibility'] = data.groupby('Item_Identifier')['Item_Visibility'].transform(lambda x: x.fillna(x.mean()))

# If any zeros remain (for items with no visibility history), replace them with the overall average visibility
data['Item_Visibility'].fillna(data['Item_Visibility'].mean(), inplace=True)


In [22]:
data['Item_Visibility'].value_counts()

Item_Visibility
0.121880    4
0.027777    3
0.029511    3
0.081428    3
0.016164    3
           ..
0.046559    1
0.073525    1
0.014568    1
0.070646    1
0.044878    1
Name: count, Length: 8322, dtype: int64

In [23]:
curr_year = dt.datetime.today().year
curr_year

2024

In [24]:
data['Outlet_Age'] = curr_year - data['Outlet_Establishment_Year']
data['Outlet_Age']

0       25
1       15
2       25
3       26
4       37
        ..
8518    37
8519    22
8520    20
8521    15
8522    27
Name: Outlet_Age, Length: 8523, dtype: int64

In [25]:
data = data.drop('Outlet_Establishment_Year', axis=1)
data

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Outlet_Age
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,Medium,Tier 1,Supermarket Type1,3735.1380,25
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,Medium,Tier 3,Supermarket Type2,443.4228,15
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,Medium,Tier 1,Supermarket Type1,2097.2700,25
3,FDX07,19.200,Regular,0.022911,Fruits and Vegetables,182.0950,OUT010,Small,Tier 3,Grocery Store,732.3800,26
4,NCD19,8.930,Low Fat,0.016164,Household,53.8614,OUT013,High,Tier 3,Supermarket Type1,994.7052,37
...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,High,Tier 3,Supermarket Type1,2778.3834,37
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,Small,Tier 2,Supermarket Type1,549.2850,22
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,Small,Tier 2,Supermarket Type1,1193.1136,20
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,Medium,Tier 3,Supermarket Type2,1845.5976,15


In [26]:
#One-Hot Encoding for categorical variables
categorical_columns = ['Item_Type', 'Item_Fat_Content']

# Perform One-Hot Encoding without dropping any category
data = pd.get_dummies(data, columns=categorical_columns, drop_first=False)

In [27]:
outlet_size_mapping = {'Small': 0,'Medium': 1,'High': 2}
data['Outlet_Size'] = data['Outlet_Size'].map(outlet_size_mapping)

location_type_mapping = {'Tier 1': 0,'Tier 2': 1,'Tier 3': 2}
data['Outlet_Location_Type'] = data['Outlet_Location_Type'].map(location_type_mapping)

outlet_type_mapping = {'Grocery Store': 0,'Supermarket Type1': 1,'Supermarket Type2': 2,'Supermarket Type3': 3}
data['Outlet_Type'] = data['Outlet_Type'].map(outlet_type_mapping)

In [28]:
data

Unnamed: 0,Item_Identifier,Item_Weight,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Outlet_Age,...,Item_Type_Health and Hygiene,Item_Type_Household,Item_Type_Meat,Item_Type_Others,Item_Type_Seafood,Item_Type_Snack Foods,Item_Type_Soft Drinks,Item_Type_Starchy Foods,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular
0,FDA15,9.300,0.016047,249.8092,OUT049,1,0,1,3735.1380,25,...,False,False,False,False,False,False,False,False,True,False
1,DRC01,5.920,0.019278,48.2692,OUT018,1,2,2,443.4228,15,...,False,False,False,False,False,False,True,False,False,True
2,FDN15,17.500,0.016760,141.6180,OUT049,1,0,1,2097.2700,25,...,False,False,True,False,False,False,False,False,True,False
3,FDX07,19.200,0.022911,182.0950,OUT010,0,2,0,732.3800,26,...,False,False,False,False,False,False,False,False,False,True
4,NCD19,8.930,0.016164,53.8614,OUT013,2,2,1,994.7052,37,...,False,True,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,0.056783,214.5218,OUT013,2,2,1,2778.3834,37,...,False,False,False,False,False,True,False,False,True,False
8519,FDS36,8.380,0.046982,108.1570,OUT045,0,1,1,549.2850,22,...,False,False,False,False,False,False,False,False,False,True
8520,NCJ29,10.600,0.035186,85.1224,OUT035,0,1,1,1193.1136,20,...,True,False,False,False,False,False,False,False,True,False
8521,FDN46,7.210,0.145221,103.1332,OUT018,1,2,2,1845.5976,15,...,False,False,False,False,False,True,False,False,False,True


In [29]:
data.drop(['Item_Identifier', 'Outlet_Identifier'], axis=1, inplace=True)

### Train Test Split

In [30]:
x=data.drop(columns='Item_Outlet_Sales', axis=1)
x

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Type,Outlet_Age,Item_Type_Baking Goods,Item_Type_Breads,Item_Type_Breakfast,...,Item_Type_Health and Hygiene,Item_Type_Household,Item_Type_Meat,Item_Type_Others,Item_Type_Seafood,Item_Type_Snack Foods,Item_Type_Soft Drinks,Item_Type_Starchy Foods,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular
0,9.300,0.016047,249.8092,1,0,1,25,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1,5.920,0.019278,48.2692,1,2,2,15,False,False,False,...,False,False,False,False,False,False,True,False,False,True
2,17.500,0.016760,141.6180,1,0,1,25,False,False,False,...,False,False,True,False,False,False,False,False,True,False
3,19.200,0.022911,182.0950,0,2,0,26,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,8.930,0.016164,53.8614,2,2,1,37,False,False,False,...,False,True,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,6.865,0.056783,214.5218,2,2,1,37,False,False,False,...,False,False,False,False,False,True,False,False,True,False
8519,8.380,0.046982,108.1570,0,1,1,22,True,False,False,...,False,False,False,False,False,False,False,False,False,True
8520,10.600,0.035186,85.1224,0,1,1,20,False,False,False,...,True,False,False,False,False,False,False,False,True,False
8521,7.210,0.145221,103.1332,1,2,2,15,False,False,False,...,False,False,False,False,False,True,False,False,False,True


In [31]:
y=data['Item_Outlet_Sales']
y

0       3735.1380
1        443.4228
2       2097.2700
3        732.3800
4        994.7052
          ...    
8518    2778.3834
8519     549.2850
8520    1193.1136
8521    1845.5976
8522     765.6700
Name: Item_Outlet_Sales, Length: 8523, dtype: float64

In [32]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [33]:
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(6818, 25) (1705, 25) (6818,) (1705,)


### Random Forest 

In [34]:
# Initialize RandomForestRegressor
rf = RandomForestRegressor(random_state=42)

In [35]:
# Define a parameter grid for RandomForest
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [36]:
# RandomizedSearch for RandomForest
random_search_rf = RandomizedSearchCV(
                                        estimator=rf, 
                                        param_distributions=param_grid_rf, 
                                        n_iter=10,             # Number of combinations to try
                                        cv=5,                  
                                        scoring='neg_mean_absolute_error', 
                                        n_jobs=-1, 
                                        verbose=2, 
                                        random_state=42
)

random_search_rf.fit(x_train, y_train)
print("Best parameters for RandomForest:", random_search_rf.best_params_)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters for RandomForest: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 10}


In [37]:
# Retrain RandomForestRegressor with the best parameters
rf_optimized = RandomForestRegressor(
    n_estimators=200, 
    min_samples_split=5, 
    min_samples_leaf=4, 
    max_depth=10, 
    random_state=42
)

In [38]:
# Fit the optimized model on training data
rf_optimized.fit(x_train, y_train)

In [39]:
# Make predictions
y_pred_train_rf = rf_optimized.predict(x_train)
y_pred_test_rf = rf_optimized.predict(x_test)

In [40]:
# Evaluate the model (Mean Absolute Error & R2 Score)
print(f"Train MAE: {mean_absolute_error(y_train, y_pred_train_rf)}")
print(f"Train R2: {r2_score(y_train, y_pred_train_rf)}")
print(f"Test MAE: {mean_absolute_error(y_test, y_pred_test_rf)}")
print(f"Test R2: {r2_score(y_test, y_pred_test_rf)}")

Train MAE: 666.1531871649547
Train R2: 0.6950559867917658
Test MAE: 723.3465750922933
Test R2: 0.6064242509085416


### Light Gradient Boosting

In [41]:
# Initialize LGBMRegressor
lgb = LGBMRegressor(random_state=42)

In [42]:
# Define parameter grid for LightGBM
param_grid_lgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 15, 20],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [31, 50, 70],           # Controls complexity of the model
    'subsample': [0.7, 0.8, 0.9],         # Subsampling data for diversity
    'colsample_bytree': [0.7, 0.8, 0.9]   # Controls the features used in each tree
}

In [43]:
# RandomizedSearch for LightGBM Regressor
random_search_lgb = RandomizedSearchCV(
    estimator=lgb,
    param_distributions=param_grid_lgb,
    n_iter=20,                     # Number of parameter combinations to try
    scoring='neg_mean_absolute_error',
    cv=5,                          # 5-fold cross-validation
    n_jobs=-1,                     # Use all CPU cores
    verbose=2,
    random_state=42
)

random_search_lgb.fit(x_train, y_train)
print("Best parameters for LightGBM:", random_search_lgb.best_params_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000976 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 796
[LightGBM] [Info] Number of data points in the train set: 6818, number of used features: 25
[LightGBM] [Info] Start training from score 2202.365232
Best parameters for LightGBM: {'subsample': 0.7, 'num_leaves': 31, 'n_estimators': 100, 'max_depth': 10, 'learning_rate': 0.05, 'colsample_bytree': 0.7}


In [44]:
# Retrain LGBMRegressor with the best parameters
lgb_optimized = LGBMRegressor(
    subsample = 0.7, 
    num_leaves = 31, 
    n_estimators = 100, 
    max_depth = 10,
    learning_rate = 0.05,
    colsample_bytree = 0.7,
    random_state=42
)

In [45]:
# Fit the optimized model on training data
lgb_optimized.fit(x_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000526 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 796
[LightGBM] [Info] Number of data points in the train set: 6818, number of used features: 25
[LightGBM] [Info] Start training from score 2202.365232


In [46]:
# Make Predictions
y_pred_train_lgb = lgb_optimized.predict(x_train)
y_pred_test_lgb = lgb_optimized.predict(x_test)



In [47]:
# Evaluate the model (Mean Absolute Error & R2 Score)
print(f"Train MAE: {mean_absolute_error(y_train, y_pred_train_lgb)}")
print(f"Train R2: {r2_score(y_train, y_pred_train_lgb)}")
print(f"Test MAE: {mean_absolute_error(y_test, y_pred_test_lgb)}")
print(f"Test R2: {r2_score(y_test, y_pred_test_lgb)}")

Train MAE: 703.5634761498673
Train R2: 0.6701324916104383
Test MAE: 724.3134621796888
Test R2: 0.607247532712745


### Feature selection

In [48]:
# Set up cross-validation folds
cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [49]:
# Perform RFECV
rfecv = RFECV(estimator=lgb_optimized, step=1, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
rfecv.fit(x_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000190 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 796
[LightGBM] [Info] Number of data points in the train set: 6818, number of used features: 25
[LightGBM] [Info] Start training from score 2202.365232
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000895 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 794
[LightGBM] [Info] Number of data points in the train set: 6818, number of used features: 24
[LightGBM] [Info] Start training from score 2202.365232
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000127 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total 

In [50]:
# Select the features chosen by RFECV
x_train_rfecv = x_train.loc[:, rfecv.support_]
x_test_rfecv = x_test.loc[:, rfecv.support_]

print("Optimal number of features after RFECV:", rfecv.n_features_)
print("Selected features:", x_train_rfecv.columns)

Optimal number of features after RFECV: 12
Selected features: Index(['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Size',
       'Outlet_Location_Type', 'Outlet_Type', 'Outlet_Age', 'Item_Type_Dairy',
       'Item_Type_Fruits and Vegetables', 'Item_Type_Snack Foods',
       'Item_Type_Soft Drinks', 'Item_Fat_Content_Low Fat'],
      dtype='object')


In [51]:
# Filter training and testing sets to only include the selected features
x_train_rfecv = x_train[['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Size','Outlet_Location_Type', 
                         'Outlet_Type', 'Outlet_Age', 'Item_Type_Dairy', 'Item_Type_Fruits and Vegetables', 
                         'Item_Type_Snack Foods', 'Item_Type_Soft Drinks', 'Item_Fat_Content_Low Fat']]
x_test_rfecv = x_test[x_train_rfecv.columns]

In [52]:
lgb_final = LGBMRegressor(
    subsample = 0.7, 
    num_leaves = 45, 
    n_estimators = 100, 
    max_depth = 10,
    learning_rate = 0.05,
    colsample_bytree = 0.7,
    random_state=42
)
lgb_final.fit(x_train_rfecv, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000366 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 770
[LightGBM] [Info] Number of data points in the train set: 6818, number of used features: 12
[LightGBM] [Info] Start training from score 2202.365232


In [53]:
# Predictions
y_pred_train_rfecv = lgb_final.predict(x_train_rfecv)
y_pred_test_rfecv = lgb_final.predict(x_test_rfecv)

In [54]:
print(f"Train MAE with Selected Features: {mean_absolute_error(y_train, y_pred_train_rfecv)}")
print(f"Train R² with Selected Features: {r2_score(y_train, y_pred_train_rfecv)}")
print(f"Test MAE with Selected Features: {mean_absolute_error(y_test, y_pred_test_rfecv)}")
print(f"Test R² with Selected Features: {r2_score(y_test, y_pred_test_rfecv)}")

Train MAE with Selected Features: 687.4682519145123
Train R² with Selected Features: 0.6873375141455436
Test MAE with Selected Features: 722.3502475889497
Test R² with Selected Features: 0.604471822543424


### Prediction on unseen data

In [55]:
# selecting lgb_final as the best model and using it to make predictions on unseen data
pred = lgb_final.predict(np.array([[12.3,0.05,100.45,1,1,1,20,1,2,2,1,0]]))
print(f"Sales value is between {pred[0]-722.35:.2f} and {pred[0]+722.35:.2f}")

Sales value is between 826.42 and 2271.12


In [56]:
# Save the model
joblib.dump(lgb_final, 'lightgbm_model.pkl')
print("Model saved as 'lightgbm_model.pkl'")

Model saved as 'lightgbm_model.pkl'
