In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the training data; the relative path is resolved against the working directory.
df = pd.read_csv("Train.csv")

In [3]:
# Preview the first rows of the freshly loaded data.
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(8523, 12)

In [5]:
# Impute missing item weights with the column mean.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: the in-place form mutates an intermediate Series, which
# pandas warns about and which does not update df under Copy-on-Write.
df['Item_Weight'] = df['Item_Weight'].fillna(df['Item_Weight'].mean())

In [6]:
# Keep item weights at two decimal places.
df = df.assign(Item_Weight=df['Item_Weight'].round(2))

In [7]:
def _most_common(values):
    """Return the first mode pandas reports for ``values``."""
    return values.mode()[0]


# One-row table: for every Outlet_Type, the most frequent Outlet_Size.
mode_of_Outlet_size = df.pivot_table(
    values='Outlet_Size',
    columns='Outlet_Type',
    aggfunc=_most_common,
)

In [8]:
# Boolean mask of the rows whose outlet size is missing.
miss_values = df['Outlet_Size'].isna()

In [9]:
# Fill each missing outlet size with the most frequent size of its outlet type.
# The lookup returns a scalar per row, so the right-hand side is a plain Series
# aligned on the row index (indexing the table by column alone would hand back
# a one-element Series per row and turn the result into a DataFrame).
df.loc[miss_values, 'Outlet_Size'] = df.loc[miss_values, 'Outlet_Type'].apply(
    lambda outlet_type: mode_of_Outlet_size.loc['Outlet_Size', outlet_type]
)

In [10]:
# Collapse the spelling variants of the fat-content labels into the two canonical ones.
fat_content_aliases = {
    'lf': 'Low Fat',
    'LF': 'Low Fat',
    'low fat': 'Low Fat',
    'reg': 'Regular',
}
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(fat_content_aliases)

In [11]:
# Snap visibility to one decimal place ahead of binning.
df = df.assign(Item_Visibility=df['Item_Visibility'].round(1))

In [12]:
# Turn the rounded visibility into three labels: 0.0 -> Low, 0.1 -> Medium,
# above 0.1 -> High.
# An explicit string default is required: np.select's built-in default is the
# integer 0, which has no common dtype with the string choices (TypeError on
# recent NumPy, the literal string '0' on older releases). Rows matching no
# condition (e.g. NaN) are labelled 'Unknown'.
df['Item_Visibility'] = np.select(
    [df['Item_Visibility'] == 0.0, df['Item_Visibility'] == 0.1, df['Item_Visibility'] > 0.1],
    ['Low', 'Medium', 'High'],
    default='Unknown')

In [13]:
# The identifier columns are removed from the frame used for modelling.
df = df.drop(columns=['Item_Identifier', 'Outlet_Identifier'])

In [14]:
# Re-inspect the frame after imputation, visibility binning and dropping the identifier columns.
df.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,9.3,Low Fat,Low,Dairy,249.8092,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,5.92,Regular,Low,Soft Drinks,48.2692,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,17.5,Low Fat,Low,Meat,141.618,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,19.2,Regular,Low,Fruits and Vegetables,182.095,1998,Small,Tier 3,Grocery Store,732.38
4,8.93,Low Fat,Low,Household,53.8614,1987,High,Tier 3,Supermarket Type1,994.7052


In [15]:
from sklearn.preprocessing import LabelEncoder

# Single encoder instance; every fit_transform call refits it on the column it is given.
label_encoder = LabelEncoder()

In [16]:
# Replace each of these categorical columns with integer codes. The shared
# encoder is refit per column, so afterwards it only holds the classes of the
# last column processed.
for column in ('Item_Visibility', 'Outlet_Location_Type', 'Outlet_Size'):
    df[column] = label_encoder.fit_transform(df[column])

In [17]:
# Check the integer codes written by the label encoding.
df.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,9.3,Low Fat,1,Dairy,249.8092,1999,1,0,Supermarket Type1,3735.138
1,5.92,Regular,1,Soft Drinks,48.2692,2009,1,2,Supermarket Type2,443.4228
2,17.5,Low Fat,1,Meat,141.618,1999,1,0,Supermarket Type1,2097.27
3,19.2,Regular,1,Fruits and Vegetables,182.095,1998,2,2,Grocery Store,732.38
4,8.93,Low Fat,1,Household,53.8614,1987,0,2,Supermarket Type1,994.7052


In [18]:
# Target: natural log of the sales figure. Features: every remaining column.
y = np.log(df['Item_Outlet_Sales'])
X = df.drop('Item_Outlet_Sales', axis=1)

In [19]:
from sklearn.model_selection import train_test_split
# Hold out 15% of the rows for the final evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=2,
)

In [20]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import r2_score,mean_absolute_error,accuracy_score

In [21]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor,ExtraTreesRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [22]:
from sklearn.model_selection import GridSearchCV

In [23]:
from sklearn.model_selection import RandomizedSearchCV,RepeatedKFold,KFold
from scipy.stats import uniform, randint

In [24]:
# --- Preprocessing ----------------------------------------------------------
# Columns are addressed by their position in X:
#   [1, 3, 8] (Item_Fat_Content, Item_Type, Outlet_Type) -> one-hot, first level dropped
#   [0, 4]    (Item_Weight, Item_MRP)                    -> standardised
#   every other column is passed through unchanged.
# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the current
# name first and fall back for older installations.
try:
    one_hot = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    one_hot = OneHotEncoder(sparse=False, drop='first')

step1 = ColumnTransformer(transformers=[
    ('col_tnf', one_hot, [1, 3, 8]),
    ('scaler', StandardScaler(), [0, 4]),
], remainder='passthrough')

step2 = XGBRegressor()

pipe = Pipeline([
    ('step1', step1),
    ('step2', step2)
])

# --- Search space -----------------------------------------------------------
# All tree/boosting settings are pinned to a single value; only the two
# regularisation strengths are sampled.
param_distributions = {
    'step2__learning_rate': [0.1],
    'step2__n_estimators': [300],
    'step2__max_depth': [2],
    'step2__min_child_weight': [10],
    'step2__subsample': [0.8],
    'step2__colsample_bytree': [0.3],
    'step2__alpha': uniform(loc=0, scale=0.5),   # sampled uniformly from [0, 0.5]
    'step2__lambda': uniform(loc=0, scale=0.5)   # sampled uniformly from [0, 0.5]
}
n_splits = 5

# Shuffled K-Fold with a fixed seed so the folds are reproducible.
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Randomized search: 10 sampled candidates, each scored by cross-validated
# negative MAE over the folds above.
random_search = RandomizedSearchCV(estimator=pipe, param_distributions=param_distributions,
                                   n_iter=10, scoring='neg_mean_absolute_error', cv=kf, verbose=1, random_state=42)

random_search.fit(X_train, y_train)

# Print the best hyperparameters
best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the best pipeline found by the randomized search
best_model = random_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model performance.
# NOTE: the target was log-transformed before the split, so both metrics below
# are measured on the log scale, not in sales units.
mae = mean_absolute_error(y_test, y_pred)

print('R2 score',r2_score(y_test,y_pred))
print(f'Mean Absolute Error on Test Set: {mae}')
print('\n' + '-'*40 + '\n')

Fitting 5 folds for each of 10 candidates, totalling 50 fits






Best Hyperparameters: {'step2__alpha': 0.41622132040021087, 'step2__colsample_bytree': 0.3, 'step2__lambda': 0.10616955533913808, 'step2__learning_rate': 0.1, 'step2__max_depth': 2, 'step2__min_child_weight': 10, 'step2__n_estimators': 300, 'step2__subsample': 0.8}
R2 score 0.7233794781815059
Mean Absolute Error on Test Set: 0.41624504199413215

----------------------------------------



In [25]:
import pickle

# Save the preprocessed frame. The context manager closes the file handle as
# soon as the dump finishes instead of leaving it to garbage collection.
with open('df.pkl', 'wb') as file:
    pickle.dump(df, file)


In [26]:
# Persist the tuned pipeline (preprocessing + regressor) for later reuse.
with open('best_model.pkl', 'wb') as model_file:
    model_file.write(pickle.dumps(best_model))