In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import xgboost as xgb
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV,TimeSeriesSplit

In [2]:
def rmspe(y_true,y_pred):
    """Root mean squared percentage error, expressed in percent.

    Rows whose true value is exactly zero are excluded, because the
    relative error ``(y_true - y_pred) / y_true`` is undefined there.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned element-wise with ``y_true``.

    Returns
    -------
    float
        ``sqrt(mean(((y_true - y_pred) / y_true) ** 2)) * 100`` over the
        rows with a non-zero true value, or ``inf`` when no such row exists.
    """
    # Cast to float so unsigned/integer inputs cannot wrap on subtraction.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Compare against 0, not True: `!= True` only filtered out values equal
    # to 1 and let zero denominators through.
    nonzero = y_true != 0
    if not nonzero.any():
        return float('inf')

    relative_error = (y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero]
    return np.sqrt(np.mean(np.square(relative_error))) * 100

In [3]:

# Walk the Kaggle input directory and sort every file into one of three
# buckets: the store metadata file (path remembered separately), the sample
# submission (ignored), and everything else (printed and queued for loading).
paths = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        if filename == 'store.csv':
            store_data_path = Path(full_path)
        elif filename != 'sample_submission.csv':
            print(full_path)
            paths.append(Path(full_path))

/kaggle/input/rossmann-store-sales/train.csv
/kaggle/input/rossmann-store-sales/test.csv


In [4]:
# Read every discovered CSV into a DataFrame, keyed by the file name without
# its extension (the recorded output lists train.csv and test.csv).
dat = {}
for csv_path in paths:
    with open(csv_path, 'r') as handle:
        frame = pd.read_csv(handle)
    dat[csv_path.stem] = frame

  dat[i.stem] = pd.read_csv(f)


In [5]:
# Load the store metadata table from the path recorded while walking the
# input directory.
with open(store_data_path, 'r') as store_file:
    store_data = pd.read_csv(store_file)

In [6]:
# Replace the raw date with numeric calendar parts (year, month, day of month,
# weekday) on every loaded frame, then remove the Date and StateHoliday
# columns. The frames are modified in place.
for frame in dat.values():
    parsed_dates = pd.to_datetime(frame['Date'])
    frame["Year"] = parsed_dates.dt.year
    frame["Month"] = parsed_dates.dt.month
    frame["Day"] = parsed_dates.dt.day
    frame["Weekday"] = parsed_dates.dt.weekday
    for discarded in ("Date", "StateHoliday"):
        frame.drop(columns=discarded, inplace=True)

In [7]:
# Impute missing numeric store attributes with their column means, discard the
# PromoInterval column, and integer-encode the two categorical columns.
numeric_means = store_data.mean(numeric_only=True)
store_data = store_data.fillna(numeric_means).drop(columns=['PromoInterval'])
for categorical in ('StoreType', 'Assortment'):
    codes, _ = pd.factorize(store_data[categorical])
    store_data[categorical] = codes

In [8]:
# Attach the store attributes to every training row; a left join keeps all
# training rows even when a store has no metadata entry.
dat['train'] = dat['train'].merge(store_data, how="left", on="Store")

In [9]:
# Separate the target (Sales) from the predictors. Customers is removed from
# the predictors as well.
train_frame = dat['train']
y = train_frame['Sales']
X = train_frame.drop(columns=["Sales", 'Customers'])

# Hold out 20% of the rows for validation with a fixed seed.
# NOTE(review): train_test_split shuffles rows by default, while the CV
# splitter created further down is a TimeSeriesSplit — confirm that the
# shuffled row order is intended.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
def _rmse(y_true, y_pred):
    """Return the root mean squared error of ``y_pred`` against ``y_true``.

    Computed as ``sqrt(mean_squared_error(...))`` instead of passing
    ``squared=False``: that keyword was deprecated in scikit-learn 1.4 and
    later removed, so forwarding it through ``make_scorer`` breaks scoring
    on newer releases.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


# greater_is_better=False makes the scorer report the negated RMSE, so that
# model-selection utilities (which maximise the score) minimise the error.
rmse_scorer = make_scorer(_rmse, greater_is_better=False)

# Default-parameter regressor; the search below builds its own estimator.
model = xgb.XGBRegressor()

# Five-fold splitter whose validation indices always come after the
# corresponding training indices.
tscv = TimeSeriesSplit(n_splits=5)

In [11]:
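# Wider search space, kept for reference; presumably the single-value grid
# below pins one combination drawn from it (TODO confirm).
# param_grid = {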
#     'n_estimators': [100, 300, 500],
#     'max_depth': [3, 6, 10],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'subsample': [0.6, 0.8, 1.0],
#     'colsample_bytree': [0.6, 0.8, 1.0],
#     'gamma': [0, 1, 5],
#     'reg_alpha': [0, 0.1, 1],
#     'reg_lambda': [1, 2, 5],
# }




# Hyperparameter grid handed to the randomized search. Every entry currently
# lists a single value, so the grid describes exactly one configuration.
param_grid = {
    'subsample': [0.6],
    'reg_lambda': [5],
    'reg_alpha': [0],
    'n_estimators': [500],
    'max_depth': [10],
    'learning_rate': [0.1],
    'gamma': [1],
    'colsample_bytree': [1.0],
}

# Number of distinct combinations in the grid. Requesting more iterations
# than this only makes scikit-learn warn and clamp, so cap n_iter up front;
# the sampled candidates are the same either way.
n_candidates = int(np.prod([len(values) for values in param_grid.values()]))

# NOTE(review): n_jobs=-1 runs the CV fits in parallel while the estimator
# targets device='cuda' — confirm the GPU can hold the concurrent fits.
search = RandomizedSearchCV(
    estimator=xgb.XGBRegressor(device='cuda'),
    param_distributions=param_grid,
    n_iter=min(100, n_candidates),
    scoring=rmse_scorer,
    cv=tscv,
    verbose=0,
    n_jobs=-1,
    random_state=42
)

In [12]:
# Columns fed to the model, in the order they are passed to fit/predict.
best_features = [
    # per-row sales context
    'Store',
    'Open',
    'Promo',
    'SchoolHoliday',
    # calendar parts derived from the date
    'Year',
    'Month',
    'Day',
    'Weekday',
    # store metadata
    'StoreType',
    'Assortment',
    'CompetitionDistance',
    'CompetitionOpenSinceMonth',
    'CompetitionOpenSinceYear',
    'Promo2',
    'Promo2SinceWeek',
    'Promo2SinceYear',
]


In [13]:
# Attach the store attributes to every test row, using the same left join on
# Store that is applied to the training frame.
dat['test'] = dat['test'].merge(store_data, how="left", on="Store")

In [14]:
# Run the cross-validated search on the selected feature columns only.
X_train_selected = X_train[best_features]
search.fit(X_train_selected, y_train)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [15]:
# Predict sales for the test rows, floor negative predictions at zero, and
# write the Id/Sales pairs to submission.csv.
# NOTE(review): the recorded output shows XGBoost's device-mismatch hint
# ("Set the device for booster before call to inplace_predict"); presumably
# the CUDA-trained booster is predicting on CPU-resident data — confirm.
X_test = dat['test']

raw_predictions = search.predict(X_test[best_features])
submission = pd.DataFrame(
    {
        'Id': X_test['Id'],
        'Sales': raw_predictions.clip(0),
    }
)

submission.to_csv("submission.csv", index=False)


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


