In [2]:
import pandas as pd
import numpy as np

In [3]:
sales = pd.read_csv("sales.csv")
sales

Unnamed: 0,True_index,Store_ID,Day_of_week,Date,Nb_customers_on_day,Open,Promotion,State_holiday,School_holiday,Sales
0,0,625,3,2013-11-06,641,1,1,0,0,7293
1,1,293,2,2013-07-16,877,1,1,0,1,7060
2,2,39,4,2014-01-23,561,1,1,0,0,4565
3,3,676,4,2013-09-26,1584,1,1,0,0,6380
4,4,709,3,2014-01-22,1477,1,1,0,0,11647
...,...,...,...,...,...,...,...,...,...,...
640835,712040,674,6,2014-09-20,611,1,0,0,0,4702
640836,712041,1014,4,2015-01-15,1267,1,1,0,0,12545
640837,712042,135,6,2015-06-20,595,1,0,0,0,5823
640838,712043,810,1,2014-08-18,599,1,1,0,1,7986


In [4]:
sales.dtypes

True_index              int64
Store_ID                int64
Day_of_week             int64
Date                   object
Nb_customers_on_day     int64
Open                    int64
Promotion               int64
State_holiday          object
School_holiday          int64
Sales                   int64
dtype: object

In [5]:
import datetime

def convert_date(x):
    d = datetime.datetime.strptime(x, "%Y-%m-%d")
    return d.toordinal()

sales["Date"] = sales ["Date"].apply(convert_date)

In [6]:
sales

Unnamed: 0,True_index,Store_ID,Day_of_week,Date,Nb_customers_on_day,Open,Promotion,State_holiday,School_holiday,Sales
0,0,625,3,735178,641,1,1,0,0,7293
1,1,293,2,735065,877,1,1,0,1,7060
2,2,39,4,735256,561,1,1,0,0,4565
3,3,676,4,735137,1584,1,1,0,0,6380
4,4,709,3,735255,1477,1,1,0,0,11647
...,...,...,...,...,...,...,...,...,...,...
640835,712040,674,6,735496,611,1,0,0,0,4702
640836,712041,1014,4,735613,1267,1,1,0,0,12545
640837,712042,135,6,735769,595,1,0,0,0,5823
640838,712043,810,1,735463,599,1,1,0,1,7986


In [7]:
sales.dtypes

True_index              int64
Store_ID                int64
Day_of_week             int64
Date                    int64
Nb_customers_on_day     int64
Open                    int64
Promotion               int64
State_holiday          object
School_holiday          int64
Sales                   int64
dtype: object

In [8]:
sales["State_holiday"].unique()

array(['0', 'c', 'a', 'b'], dtype=object)

In [9]:
sales["State_holiday"].value_counts()

State_holiday
0    621160
a     12842
b      4214
c      2624
Name: count, dtype: int64

In [11]:
state_holiday = ['0', 'a', 'b', 'c']
for item in state_holiday:
    sales[item] = 0

In [12]:
for item2 in state_holiday: 
    sales[item2] = sales['State_holiday'].str.contains(item2, regex= False).astype(int)

In [13]:
sales

Unnamed: 0,True_index,Store_ID,Day_of_week,Date,Nb_customers_on_day,Open,Promotion,State_holiday,School_holiday,Sales,0,a,b,c
0,0,625,3,735178,641,1,1,0,0,7293,1,0,0,0
1,1,293,2,735065,877,1,1,0,1,7060,1,0,0,0
2,2,39,4,735256,561,1,1,0,0,4565,1,0,0,0
3,3,676,4,735137,1584,1,1,0,0,6380,1,0,0,0
4,4,709,3,735255,1477,1,1,0,0,11647,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
640835,712040,674,6,735496,611,1,0,0,0,4702,1,0,0,0
640836,712041,1014,4,735613,1267,1,1,0,0,12545,1,0,0,0
640837,712042,135,6,735769,595,1,0,0,0,5823,1,0,0,0
640838,712043,810,1,735463,599,1,1,0,1,7986,1,0,0,0


In [14]:
sales_clean = sales.drop("State_holiday", axis=1)

In [15]:
sales_clean.head()

Unnamed: 0,True_index,Store_ID,Day_of_week,Date,Nb_customers_on_day,Open,Promotion,School_holiday,Sales,0,a,b,c
0,0,625,3,735178,641,1,1,0,7293,1,0,0,0
1,1,293,2,735065,877,1,1,1,7060,1,0,0,0
2,2,39,4,735256,561,1,1,0,4565,1,0,0,0
3,3,676,4,735137,1584,1,1,0,6380,1,0,0,0
4,4,709,3,735255,1477,1,1,0,11647,1,0,0,0


In [16]:
from sklearn.model_selection import train_test_split

In [17]:
features = sales_clean.drop(["True_index", "Sales"], axis=1)
target = sales_clean["Sales"]

X_train, X_test, y_train, y_test = train_test_split(features, target)

In [18]:
import xgboost

In [19]:
xgb_reg = xgboost.XGBRegressor(max_depth=5,
                              n_estimators=100)

xgb_reg.fit(X_train, y_train)
print(xgb_reg.score(X_test,y_test))
print(xgb_reg.score(X_train,y_train))

0.9481285747316704
0.9494726577493174


In [20]:
from sklearn.metrics import mean_squared_error

pred = xgb_reg.predict(X_test)

np.sqrt(mean_squared_error(y_test,pred))

877.1872004221112

In [23]:
from sklearn.model_selection import RandomizedSearchCV

In [21]:
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth
              }

In [24]:
xgb_reg = xgboost.XGBRegressor()
search = RandomizedSearchCV(estimator = xgb_reg, param_distributions = random_grid, n_iter = 10, cv = 5, n_jobs = 10)

In [25]:
search.fit(X_train,y_train)

46 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
41 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Bruno Santos\AppData\Roaming\Python\Python311\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Bruno Santos\AppData\Local\Programs\Python\Python311\Lib\site-packages\xgboost\core.py", line 620, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "c:\Users\Bruno Santos\AppData\Local\Programs\Python\Python311\Lib\site-packages\xgboost\sklearn.py", line 1025, in fit
    self._Booster = train(
                    ^^^^^^
  File "c:\Users\Bruno Santos\

In [26]:
search.best_params_

{'n_estimators': 1000, 'max_depth': 20}

In [27]:
xgb_reg = xgboost.XGBRegressor(max_depth=20,
                              n_estimators=1000)

In [28]:
xgb_reg.fit(X_train,y_train)

In [29]:
print(xgb_reg.score(X_test,y_test))
print(xgb_reg.score(X_train,y_train))

0.9745381920743242
0.9999999988028787


In [32]:
test = pd.concat([X_test, y_test], axis=1)

In [34]:
test = test[test["Open"]==1]

In [35]:
x_test_2 = test.drop(["Sales"], axis = 1)
y_test_2 = test["Sales"]

In [37]:
xgb_reg.score(x_test_2,y_test_2)

0.9530497030237627

In [36]:
pred = xgb_reg.predict(x_test_2)

np.sqrt(mean_squared_error(y_test_2,pred))

674.1196186650574