In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import joblib

In [2]:
!pip install xgboost




[notice] A new release of pip is available: 23.0.1 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Read the labelled training table and the unlabelled test table from the
# data directory one level above the notebook.
df_train, df_test = (
    pd.read_csv('../data/train.csv'),
    pd.read_csv('../data/test.csv'),
)

In [4]:
# Peek at the first five raw training rows.
df_train.head(n=5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [5]:
# Count missing values per column before any imputation.
df_train.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [6]:
# Impute missing item weights with the column mean. The result is assigned
# back to the frame instead of calling fillna(inplace=True) on the selected
# column: that chained in-place form is deprecated and leaves df_train
# unchanged once pandas Copy-on-Write is active.
df_train['Item_Weight'] = df_train['Item_Weight'].fillna(df_train['Item_Weight'].mean())

In [7]:
# Impute missing outlet sizes with the most frequent category. mode() returns
# a Series (there can be ties), so [0] picks the first modal value. The result
# is assigned back rather than using a chained fillna(inplace=True), which is
# deprecated and does not modify df_train under pandas Copy-on-Write.
df_train['Outlet_Size'] = df_train['Outlet_Size'].fillna(df_train['Outlet_Size'].mode()[0])

In [8]:
# Re-count missing values per column to confirm the imputation left none.
df_train.isna().sum()


Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

In [9]:
# Fold the alternative spellings of the fat-content label into the two
# canonical values 'Low Fat' and 'Regular'; other values are left untouched.
df_train['Item_Fat_Content'] = df_train['Item_Fat_Content'].replace(
    to_replace={'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
)


In [10]:
# Outlet age in years, measured against the hard-coded reference year 2024
# (rsub computes 2024 - establishment_year element-wise).
df_train['Outlet_Age'] = df_train['Outlet_Establishment_Year'].rsub(2024)

# The establishment year is now fully captured by Outlet_Age, so remove it.
# Note: this mutates df_train in place, so re-running the cell raises KeyError.
df_train.drop(columns='Outlet_Establishment_Year', inplace=True)

In [11]:
# Categorical interaction features: the location tier joined with the outlet
# type and with the item type, using "*" as the separator.
df_train['Outlet_Location_Type*Outlet_Type'] = df_train['Outlet_Location_Type'].str.cat(
    df_train['Outlet_Type'], sep="*"
)
df_train['Outlet_Location_Type*Item_Type'] = df_train['Outlet_Location_Type'].str.cat(
    df_train['Item_Type'], sep="*"
)

In [12]:
# Replace zeros with mean visibility of that product.
# A visibility of exactly 0 is treated as a missing measurement and replaced
# by the mean visibility of the same Item_Identifier across the training rows.
# NOTE(review): the per-product mean is computed before the replacement, so the
# zero rows themselves are included in that average; a product whose rows are
# all zero therefore stays at zero.
zero_visibility = df_train['Item_Visibility'] == 0
visibility_avg = df_train.pivot_table(values='Item_Visibility', index='Item_Identifier')
# Vectorised lookup of each affected row's product mean (no per-row lambda).
df_train.loc[zero_visibility, 'Item_Visibility'] = (
    df_train.loc[zero_visibility, 'Item_Identifier'].map(visibility_avg['Item_Visibility'])
)

In [13]:
# Inspect the first five rows after cleaning and feature engineering.
df_train.head(n=5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Outlet_Age,Outlet_Location_Type*Outlet_Type,Outlet_Location_Type*Item_Type
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,Medium,Tier 1,Supermarket Type1,3735.138,25,Tier 1*Supermarket Type1,Tier 1*Dairy
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,Medium,Tier 3,Supermarket Type2,443.4228,15,Tier 3*Supermarket Type2,Tier 3*Soft Drinks
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,Medium,Tier 1,Supermarket Type1,2097.27,25,Tier 1*Supermarket Type1,Tier 1*Meat
3,FDX07,19.2,Regular,0.015274,Fruits and Vegetables,182.095,OUT010,Medium,Tier 3,Grocery Store,732.38,26,Tier 3*Grocery Store,Tier 3*Fruits and Vegetables
4,NCD19,8.93,Low Fat,0.008082,Household,53.8614,OUT013,High,Tier 3,Supermarket Type1,994.7052,37,Tier 3*Supermarket Type1,Tier 3*Household


In [14]:
# Continuous inputs, routed to the 'num' branch of the preprocessor.
numeric_features = [
    'Item_Weight',
    'Item_Visibility',
    'Item_MRP',
    'Outlet_Age',
]

# Categorical inputs (raw columns first, then the two "*"-joined interaction
# columns), routed to the 'cat' branch of the preprocessor.
categorical_features = [
    'Item_Fat_Content',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
    'Item_Type',
    'Outlet_Location_Type*Outlet_Type',
    'Outlet_Location_Type*Item_Type',
]


In [15]:
# Per-column preprocessing: standardise the numeric columns and one-hot encode
# the categorical ones. handle_unknown='ignore' makes categories that were not
# seen during fit encode as all zeros instead of raising at predict time.
# Columns not listed in either group are not passed through.
column_steps = [
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [16]:
# Render the column transformer's repr as the cell output so its structure
# (the 'num' and 'cat' branches) can be checked visually.
preprocessor

In [17]:
from sklearn.feature_selection import SelectKBest, f_regression

# End-to-end model:
#   1. preprocessor      - scale numeric columns, one-hot encode categoricals
#   2. feature_selection - keep the 20 encoded features with the highest
#                          univariate f_regression score against the target
#   3. regressor         - gradient-boosted trees on the selected features
# XGBRegressor comes from the notebook's top import cell; the duplicate import
# and the unused RandomForestRegressor import were removed from this cell.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_regression, k=20)),
    ('regressor', XGBRegressor(
        objective='reg:squarederror',
        n_estimators=500,
        learning_rate=0.01,
        max_depth=3,
        subsample=0.9,
        colsample_bytree=0.9,
        random_state=0,  # fixed seed so the row/column subsampling is repeatable
    )),
])

In [18]:
# Render the assembled pipeline's repr as the cell output to verify the order
# of its steps.
pipeline

In [19]:
# Split the frame into predictors and the sales target, then hold out 20% of
# the rows for evaluation. random_state=0 keeps the split reproducible.
X = df_train.drop(columns='Item_Outlet_Sales')
y = df_train['Item_Outlet_Sales']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


In [20]:
# Fit the whole pipeline on the training split, then evaluate on the held-out
# split. For a regressor, score() returns the coefficient of determination
# (R^2), not a classification accuracy, so the message is labelled accordingly.
pipeline.fit(X_train, y_train)
print("Model R^2 on test set:", pipeline.score(X_test, y_test))

Model accuracy on test set: 0.5970036412900512


In [21]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the 'regressor' step of the pipeline
# (3 x 3 x 3 = 27 combinations). The "regressor__" prefix routes each
# setting to that step.
parameters = {
    'regressor__n_estimators': [100, 300, 500],
    'regressor__learning_rate': [0.01, 0.05, 0.1],
    'regressor__max_depth': [3, 5, 7]
}

# Exhaustive search with 5-fold cross-validation on the training split only,
# ranking candidates by R^2.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='r2',
    cv=5,
)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

Best parameters: {'regressor__learning_rate': 0.01, 'regressor__max_depth': 3, 'regressor__n_estimators': 500}
Best cross-validation score: 0.60


In [22]:
# Persist the tuned model. With GridSearchCV's default refit=True, the best
# parameter combination is refitted on all of X_train and exposed as
# best_estimator_. Dumping `pipeline` here would save the un-tuned pipeline
# and silently discard the result of the search.
joblib.dump(grid_search.best_estimator_, 'advanced_sales_prediction_pipeline.pkl')

['advanced_sales_prediction_pipeline.pkl']

In [23]:
# Outlet age in years against the hard-coded reference year 2024, matching the
# feature built for the training frame (rsub computes 2024 - year).
df_test['Outlet_Age'] = df_test['Outlet_Establishment_Year'].rsub(2024)

In [24]:
# Remove the establishment year from the test frame in place; the model's
# feature lists use Outlet_Age instead.
df_test.drop(columns=['Outlet_Establishment_Year'], inplace=True)

In [26]:
# Count missing values per column in the test frame before imputation.
df_test.isna().sum()

Item_Identifier            0
Item_Weight              976
Item_Fat_Content           0
Item_Visibility            0
Item_Type                  0
Item_MRP                   0
Outlet_Identifier          0
Outlet_Size             1606
Outlet_Location_Type       0
Outlet_Type                0
Outlet_Age                 0
dtype: int64

In [27]:
# Impute the test frame's missing values: mean for Item_Weight, most frequent
# category for Outlet_Size (mode() returns a Series, [0] is its first value).
# Results are assigned back instead of using a chained fillna(inplace=True),
# which is deprecated and leaves df_test unchanged under pandas Copy-on-Write.
# NOTE(review): the fill values are computed from df_test itself rather than
# from the training data - confirm that is intended.
df_test['Item_Weight'] = df_test['Item_Weight'].fillna(df_test['Item_Weight'].mean())
df_test['Outlet_Size'] = df_test['Outlet_Size'].fillna(df_test['Outlet_Size'].mode()[0])


In [28]:
# Re-count missing values per column to confirm the test frame is complete.
df_test.isna().sum()

Item_Identifier         0
Item_Weight             0
Item_Fat_Content        0
Item_Visibility         0
Item_Type               0
Item_MRP                0
Outlet_Identifier       0
Outlet_Size             0
Outlet_Location_Type    0
Outlet_Type             0
Outlet_Age              0
dtype: int64

In [30]:
# Apply the same label clean-up as for the training frame so the one-hot
# encoder sees only the canonical 'Low Fat' / 'Regular' spellings.
df_test['Item_Fat_Content'] = df_test['Item_Fat_Content'].replace(
    to_replace={'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
)

In [31]:
# Rebuild the two "*"-joined interaction columns on the test frame, with the
# same column names and separator the pipeline was trained on.
df_test['Outlet_Location_Type*Outlet_Type'] = df_test['Outlet_Location_Type'].str.cat(
    df_test['Outlet_Type'], sep="*"
)
df_test['Outlet_Location_Type*Item_Type'] = df_test['Outlet_Location_Type'].str.cat(
    df_test['Item_Type'], sep="*"
)

In [32]:
# Per-product mean visibility over the test rows (pivot_table aggregates with
# the mean by default), used to overwrite visibilities recorded as exactly 0.
visibility_avg = df_test.pivot_table(values='Item_Visibility', index='Item_Identifier')
df_test.loc[df_test['Item_Visibility'].eq(0), 'Item_Visibility'] = (
    df_test.loc[df_test['Item_Visibility'].eq(0), 'Item_Identifier']
    .map(visibility_avg['Item_Visibility'])
)

In [33]:
# Inspect the first five test rows after cleaning and feature engineering.
df_test.head(n=5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type,Outlet_Age,Outlet_Location_Type*Outlet_Type,Outlet_Location_Type*Item_Type
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,Medium,Tier 1,Supermarket Type1,25,Tier 1*Supermarket Type1,Tier 1*Snack Foods
1,FDW14,8.3,Regular,0.038428,Dairy,87.3198,OUT017,Medium,Tier 2,Supermarket Type1,17,Tier 2*Supermarket Type1,Tier 2*Dairy
2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,Medium,Tier 3,Grocery Store,26,Tier 3*Grocery Store,Tier 3*Others
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,Medium,Tier 2,Supermarket Type1,17,Tier 2*Supermarket Type1,Tier 2*Snack Foods
4,FDY38,12.695633,Regular,0.118599,Dairy,234.23,OUT027,Medium,Tier 3,Supermarket Type3,39,Tier 3*Supermarket Type3,Tier 3*Dairy


In [34]:
# Reload the persisted pipeline from disk, rebinding `pipeline` to the loaded
# object. joblib.load unpickles the file, so it should only be used on files
# from a trusted source (here: the file written earlier in this notebook).
pipeline = joblib.load('advanced_sales_prediction_pipeline.pkl')

In [35]:
# Predict sales for every test row. The pipeline's column transformer selects
# the feature columns it was configured with by name from df_test.
predictions = pipeline.predict(df_test)

In [39]:
# Store the predictions next to their rows; `predictions` is in df_test row
# order because it was produced from df_test directly.
df_test['Item_Outlet_Sales'] = predictions


In [None]:
# Keep only the item/outlet identifiers and the predicted sales, then write
# them to an Excel workbook without the DataFrame index.
result = df_test.loc[:, ['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales']]
result.to_excel(excel_writer='predicted_sales.xlsx', index=False)