In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
import numpy as np

In [12]:
# Load data
data = pd.read_csv('final_data_for_hypothesis.csv')

In [13]:
data

Unnamed: 0,Date,ID,Store_id,Store_Type,Location_Type,Region_Code,Holiday,Discount,Order,Sales,Month,Weekday,Year
0,2018-01-01,T1000001,1,S1,L3,R1,1,Yes,9,7011.84,1,0,2018
1,2018-01-01,T1000002,253,S4,L2,R1,1,Yes,60,51789.12,1,0,2018
2,2018-01-01,T1000003,252,S3,L2,R1,1,Yes,42,36868.20,1,0,2018
3,2018-01-01,T1000004,251,S2,L3,R1,1,Yes,23,19715.16,1,0,2018
4,2018-01-01,T1000005,250,S2,L3,R4,1,Yes,62,45614.52,1,0,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...
186452,2019-05-31,T1188336,149,S2,L3,R2,1,Yes,51,37272.00,5,4,2019
186453,2019-05-31,T1188337,153,S4,L2,R1,1,No,90,54572.64,5,4,2019
186454,2019-05-31,T1188338,154,S1,L3,R2,1,No,56,31624.56,5,4,2019
186455,2019-05-31,T1188339,155,S3,L1,R2,1,Yes,70,49162.41,5,4,2019


In [17]:
data['Discount'] = data['Discount'].apply(lambda x: 1 if x == 'Yes' else 0)

In [18]:
# Feature Engineering: Add time-based features
data['Date'] = pd.to_datetime(data['Date'])
data['Month'] = data['Date'].dt.month
data['Week'] = data['Date'].dt.isocalendar().week
data['Day'] = data['Date'].dt.day
data['DayOfWeek'] = data['Date'].dt.dayofweek

In [25]:
store_type_mean_target = round(data.groupby('Store_Type')['Sales'].mean(),2)
location_type_mean_target = round(data.groupby('Location_Type')['Sales'].mean(),2)
region_type_mean_target = round(data.groupby('Region_Code')['Sales'].mean(),2)

data['Store_Type_Encoded'] = data['Store_Type'].map(store_type_mean_target)
data['Region_Code_Encoded'] = data['Region_Code'].map(region_type_mean_target)
data['Location_Type_Encoded'] = data['Location_Type'].map(location_type_mean_target)

In [27]:
data_copy1=data.copy()

In [28]:
data.drop(columns=['Store_Type','Location_Type','Region_Code'],inplace=True)

In [32]:
data.drop(columns=['ID','Date'],inplace=True)

In [33]:
data

Unnamed: 0,Store_id,Holiday,Discount,Order,Sales,Month,Weekday,Year,Week,Day,DayOfWeek,Store_Type_Encoded,Region_Code_Encoded,Location_Type_Encoded
0,1,1,0,9,7011.84,1,0,2018,1,1,0,37638.07,45317.28,33064.27
1,253,1,0,60,51789.12,1,0,2018,1,1,0,57605.41,45317.28,56969.87
2,252,1,0,42,36868.20,1,0,2018,1,1,0,46822.39,45317.28,56969.87
3,251,1,0,23,19715.16,1,0,2018,1,1,0,27530.83,45317.28,33064.27
4,250,1,0,62,45614.52,1,0,2018,1,1,0,27530.83,39428.18,33064.27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186452,149,1,0,51,37272.00,5,4,2019,22,31,4,27530.83,39652.14,33064.27
186453,153,1,0,90,54572.64,5,4,2019,22,31,4,57605.41,45317.28,56969.87
186454,154,1,0,56,31624.56,5,4,2019,22,31,4,37638.07,39652.14,33064.27
186455,155,1,0,70,49162.41,5,4,2019,22,31,4,46822.39,39652.14,41358.31


In [39]:
# Initialize StandardScaler
scaler = StandardScaler()

# Fit and transform the data
scaled_data = pd.DataFrame(scaler.fit_transform(data),columns=data.columns)

In [41]:
X=scaled_data.drop(columns='Sales')
y=scaled_data[['Sales']]

Unnamed: 0,Sales
0,-2.083227
1,0.581478
2,-0.306469
3,-1.327250
4,0.214027
...,...
186452,-0.282438
186453,0.747126
186454,-0.618519
186455,0.425162


In [44]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [45]:
model= XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
model.fit(X_train,y_train)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.1, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=100, n_jobs=None,
             num_parallel_tree=None, random_state=42, ...)

In [46]:
# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'MAPE: {mape}')


MAE: 0.11069360263152286
MSE: 0.022495284885267742
RMSE: 0.1499842821273874
MAPE: 2.850406779323467
