In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import joblib

In [3]:
# Show which scikit-learn version this kernel is running.
import sklearn
print(sklearn.__version__)

1.2.1


In [None]:
!pip install xgboost

: 

In [None]:
# Read the training and test CSV files (paths are relative to the notebook).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

In [None]:
# Preview the first five training rows.
df_train.head(n=5)

In [None]:
# Count missing values per training column.
df_train.isna().sum()

In [None]:
# Fill missing item weights with the column mean.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selection: that form is chained assignment, which
# pandas warns about and which does not update df_train once Copy-on-Write is active.
df_train['Item_Weight'] = df_train['Item_Weight'].fillna(df_train['Item_Weight'].mean())

In [None]:
# Fill missing outlet sizes with the most frequent value (first entry of mode()).
# Assigned back to the column instead of fillna(inplace=True) on the selection,
# which is chained assignment and leaves df_train unchanged under Copy-on-Write.
df_train['Outlet_Size'] = df_train['Outlet_Size'].fillna(df_train['Outlet_Size'].mode()[0])

In [None]:
# Re-check the per-column missing-value counts after imputation.
df_train.isna().sum()


In [None]:
# Collapse the alternative spellings of the fat-content labels onto
# 'Low Fat' and 'Regular'.
df_train['Item_Fat_Content'] = df_train['Item_Fat_Content'].replace(
    to_replace={
        'low fat': 'Low Fat',
        'LF': 'Low Fat',
        'reg': 'Regular',
    }
)


In [None]:
# Outlet age in years, measured against the fixed reference year 2024
# (rsub computes 2024 - year).
df_train['Outlet_Age'] = df_train['Outlet_Establishment_Year'].rsub(2024)

# The establishment year is now fully represented by Outlet_Age, so remove it.
df_train.drop(columns=['Outlet_Establishment_Year'], inplace=True)

In [None]:
# Interaction features: join the location type with the outlet type and with
# the item type, separated by "*".
df_train['Outlet_Location_Type*Outlet_Type'] = df_train['Outlet_Location_Type'].str.cat(
    df_train['Outlet_Type'], sep="*"
)
df_train['Outlet_Location_Type*Item_Type'] = df_train['Outlet_Location_Type'].str.cat(
    df_train['Item_Type'], sep="*"
)

In [None]:
# Replace zero Item_Visibility values with the mean visibility of the same product.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()  # not used by this cell; kept so the name stays defined

zero_visibility = df_train['Item_Visibility'] == 0

# Per-product mean over the non-zero readings only. Including the zeros that
# are about to be replaced would drag every mean towards zero.
visibility_avg = (
    df_train.loc[~zero_visibility]
    .groupby('Item_Identifier')['Item_Visibility']
    .mean()
    .to_frame()
)

# Look up each zero row's product mean (vectorised; rows are matched by index).
replacement = df_train.loc[zero_visibility, 'Item_Identifier'].map(visibility_avg['Item_Visibility'])

# A product whose every reading is zero has no per-product mean; fall back to
# the overall mean of the non-zero readings so no zero is left behind.
replacement = replacement.fillna(df_train.loc[~zero_visibility, 'Item_Visibility'].mean())

df_train.loc[zero_visibility, 'Item_Visibility'] = replacement

In [None]:
# Preview the training frame after cleaning and feature engineering.
df_train.head(n=5)

In [None]:
# Columns that are scaled as numbers.
numeric_features = [
    'Item_Weight',
    'Item_Visibility',
    'Item_MRP',
    'Outlet_Age',
]

# Columns that are one-hot encoded, including the two "*" interaction columns.
categorical_features = [
    'Item_Fat_Content',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
    'Item_Type',
    'Outlet_Location_Type*Outlet_Type',
    'Outlet_Location_Type*Item_Type',
]


In [None]:
# Standardise the numeric columns and one-hot encode the categorical ones.
# handle_unknown='ignore' lets categories that were not seen during fit pass
# through transform without raising.
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [None]:
# Display the (not yet fitted) column transformer.
preprocessor

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Three stages: column preprocessing, selection of the 20 features with the
# best f_regression score, then an XGBoost regressor.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_regression, k=20)),
    (
        'regressor',
        XGBRegressor(
            objective='reg:squarederror',
            n_estimators=500,
            learning_rate=0.01,
            max_depth=3,
            subsample=0.9,
            colsample_bytree=0.9,
            random_state=0,
        ),
    ),
])

In [None]:
# Display the assembled pipeline.
pipeline

In [None]:
# Separate the target column from the predictors.
X = df_train.drop(columns='Item_Outlet_Sales')
y = df_train['Item_Outlet_Sales']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


In [None]:
# Fit the full pipeline on the training split.
pipeline.fit(X_train, y_train)

# Pipeline.score delegates to the final regressor's score, which is the
# coefficient of determination (R^2), not a classification accuracy, so the
# printed label says so.
print("R^2 on hold-out set:", pipeline.score(X_test, y_test))

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters. The "regressor__" prefix routes each entry to the
# pipeline step named 'regressor'.
parameters = dict(
    regressor__n_estimators=[100, 300, 500],
    regressor__learning_rate=[0.01, 0.05, 0.1],
    regressor__max_depth=[3, 5, 7],
)

# Exhaustive search over the grid with 5-fold cross-validation, ranked by R^2.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='r2',
    cv=5,
)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

In [None]:
# Persist the tuned model. GridSearchCV fits clones of `pipeline`, so `pipeline`
# itself still carries the initial hyper-parameters; the estimator refit with
# the best parameters found by the search is `grid_search.best_estimator_`.
joblib.dump(grid_search.best_estimator_, 'advanced_sales_prediction_pipeline.pkl')

In [None]:
# Outlet age for the test rows, using the same reference year (2024) as the
# training data (rsub computes 2024 - year).
df_test['Outlet_Age'] = df_test['Outlet_Establishment_Year'].rsub(2024)

In [None]:
# Remove the establishment year from the test frame.
df_test.drop(columns=['Outlet_Establishment_Year'], inplace=True)

In [None]:
# Count missing values per test column.
df_test.isna().sum()

In [None]:
# Fill missing item weights with the column mean and missing outlet sizes with
# the most frequent value (both statistics are computed on df_test itself).
# The results are assigned back to the columns rather than calling
# fillna(inplace=True) on the selection: that form is chained assignment, which
# pandas warns about and which does not update df_test once Copy-on-Write is active.
df_test['Item_Weight'] = df_test['Item_Weight'].fillna(df_test['Item_Weight'].mean())
df_test['Outlet_Size'] = df_test['Outlet_Size'].fillna(df_test['Outlet_Size'].mode()[0])


In [None]:
# Re-check the per-column missing-value counts in the test frame after imputation.
df_test.isna().sum()

In [None]:
# Collapse the alternative spellings of the fat-content labels onto
# 'Low Fat' and 'Regular'.
df_test['Item_Fat_Content'] = df_test['Item_Fat_Content'].replace(
    to_replace={
        'low fat': 'Low Fat',
        'LF': 'Low Fat',
        'reg': 'Regular',
    }
)

In [None]:
# Interaction features for the test rows: location type joined with outlet type
# and with item type, separated by "*".
df_test['Outlet_Location_Type*Outlet_Type'] = df_test['Outlet_Location_Type'].str.cat(
    df_test['Outlet_Type'], sep="*"
)
df_test['Outlet_Location_Type*Item_Type'] = df_test['Outlet_Location_Type'].str.cat(
    df_test['Item_Type'], sep="*"
)

In [None]:
# Replace zero Item_Visibility values in the test frame with the mean
# visibility of the same product.
zero_visibility = df_test['Item_Visibility'] == 0

# Per-product mean over the non-zero readings only. Including the zeros that
# are about to be replaced would drag every mean towards zero.
visibility_avg = (
    df_test.loc[~zero_visibility]
    .groupby('Item_Identifier')['Item_Visibility']
    .mean()
    .to_frame()
)

# Look up each zero row's product mean (vectorised; rows are matched by index).
replacement = df_test.loc[zero_visibility, 'Item_Identifier'].map(visibility_avg['Item_Visibility'])

# A product whose every reading is zero has no per-product mean; fall back to
# the overall mean of the non-zero readings so no zero is left behind.
replacement = replacement.fillna(df_test.loc[~zero_visibility, 'Item_Visibility'].mean())

df_test.loc[zero_visibility, 'Item_Visibility'] = replacement

In [None]:
# Preview the prepared test frame.
df_test.head(n=5)

In [None]:
# Reload the saved pipeline from disk, rebinding `pipeline` to the loaded object.
pipeline = joblib.load(filename='advanced_sales_prediction_pipeline.pkl')

In [None]:
# Run the test frame through the pipeline to get the predicted sales.
predictions = pipeline.predict(X=df_test)

In [None]:
# Store the predictions on the test frame as the Item_Outlet_Sales column.
df_test['Item_Outlet_Sales'] = predictions


In [None]:
# Keep the two identifier columns plus the predicted sales and write them to an
# Excel workbook without the index column.
result = df_test.loc[:, ['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales']]
result.to_excel(excel_writer='predicted_sales.xlsx', index=False)