In [1]:
import sys
sys.path.insert(1, '../Src')
import numpy as np
import pandas as pd
import os
from utils.feature_engineering import *
from utils.preprocessing import *

# Location of the raw challenge data. The commented-out line is the alternative
# (parent) folder that was used previously.
#PATH_DATASET = '../Dataset/'
PATH_DATASET = '../Dataset/data-h-m1-challenge-final'
# Root folder for derived artefacts: this notebook reads cached pickles from
# PATH_RESULTS/dataset and reads/writes CSV files under PATH_RESULTS/submissions.
PATH_RESULTS = '../Results/'

# Loading dataset

In [2]:
# Load the cached tables from PATH_RESULTS/dataset.
# NOTE(review): unpickling can execute arbitrary code; only load pickles that
# were produced by this project.
dataset_dir = os.path.join(PATH_RESULTS, 'dataset')

df_sales  = pd.read_pickle(os.path.join(dataset_dir, 'sales.pkl'))
df_items  = pd.read_pickle(os.path.join(dataset_dir, 'item.pkl'))
df_stores = pd.read_pickle(os.path.join(dataset_dir, 'store.pkl'))

df_submission_sample = pd.read_pickle(os.path.join(dataset_dir, 'submission_sample.pkl'))

# Row counts before enrichment; used below to detect rows lost or duplicated by the merges.
N_submission = df_submission_sample.shape[0]
N_sales      = df_sales.shape[0]

# Sales keep the original inner joins (rows without a matching item/store are
# dropped), but the effect is now reported instead of passing silently.
df_sales = df_sales.merge(df_items, on=['I100'])
df_sales = df_sales.merge(df_stores, on=['S100'])
if df_sales.shape[0] != N_sales:
    print('WARNING: item/store merge changed the number of sales rows:',
          N_sales, '->', df_sales.shape[0])

# Every submission row needs a prediction, so use left joins: a row whose
# item/store is missing from the lookup tables is kept (its attributes become
# NaN and are zero-filled below) instead of being dropped from the submission.
df_submission_sample = df_submission_sample.merge(df_items, on=['I100'], how='left')
df_submission_sample = df_submission_sample.merge(df_stores, on=['S100'], how='left')
assert df_submission_sample.shape[0] == N_submission, (
    'item/store merge changed the number of submission rows '
    '(duplicate I100/S100 keys in the lookup tables?)'
)

# Replace non-finite and missing values with 0 in both frames.
df_sales = df_sales.replace([np.inf, -np.inf, np.nan], 0)
df_submission_sample = df_submission_sample.replace([np.inf, -np.inf, np.nan], 0)

# Model inputs = every column except the target (QTT), the split key (DATE) and
# the identifier columns. Kept as a list in DataFrame column order: a set has no
# stable order and is not accepted as a DataFrame indexer by pandas >= 2.0.
NON_FEATURE_COLUMNS = {'QTT', 'DATE', 'item_id', 'ID'}
features_names = [column for column in df_sales.columns if column not in NON_FEATURE_COLUMNS]
print(features_names)

KeyboardInterrupt: 

# Feature engineering

In [None]:
# Run the project's date feature-engineering helper on the training frame and
# then on the submission frame, rebinding both names to the results.
# NOTE(review): fe_dates comes from a star-import of utils.*; its exact output
# columns are not visible here.
df_sales, df_submission_sample = (
    fe_dates(frame) for frame in (df_sales, df_submission_sample)
)

In [None]:
# Convert both frames with the project's string_to_categorical helper and keep
# an independent copy of each result (training frame first, then submission).
# NOTE(review): presumably this encodes string columns as categoricals; confirm
# against utils.*.
df_sales, df_submission_sample = (
    string_to_categorical(frame).copy()
    for frame in (df_sales, df_submission_sample)
)

In [None]:
# Display the (earliest, latest) DATE covered by the submission rows.
submission_dates = df_submission_sample['DATE']
(submission_dates.min(), submission_dates.max())

In [None]:
#df_sales2 = df_sales[df_sales['QTT']<=15].copy()

In [None]:
#df_submission_sample = df_submission_sample[df_submission_sample['S100']==0]

In [None]:
# Model inputs = every df_sales column except the target (QTT), the split key
# (DATE) and the identifier columns.
# Built as a list in DataFrame column order rather than a set:
#   * pandas >= 2.0 raises TypeError when a set is used as a DataFrame indexer
#     (as in `X_train[features_names]`);
#   * set iteration order for strings changes between interpreter runs, which
#     made the feature column order (and thus training) non-reproducible.
NON_FEATURE_COLUMNS = {'QTT', 'DATE', 'item_id', 'ID'}
features_names = [column for column in df_sales.columns if column not in NON_FEATURE_COLUMNS]
print(features_names)

In [None]:
from sklearn.model_selection import train_test_split

# Split strategy switch (replaces the previous hard-coded `if False:`).
#   True  -> calendar split: train on 2019, validate on January 2020,
#            test on 2021-01-01 .. 2022-01-02.
#   False -> train on 2017-2018 plus 2021-01 .. 2021-07, test on rows dated
#            2021-08-01 or later (upper bound 2022-10-03, exclusive);
#            validation is a random 30% hold-out of the training rows.
USE_CALENDAR_VALIDATION = False


def _sales_between(start, end):
    """Return a copy of the df_sales rows whose DATE satisfies start <= DATE < end."""
    in_window = (df_sales['DATE'] >= start) & (df_sales['DATE'] < end)
    return df_sales[in_window].copy()


# pandas >= 2.0 rejects a set as a DataFrame indexer, so index with a list.
# list() keeps the iteration order of `features_names`, so the column order is
# the same one any other `df[features_names]` selection in this session gets.
feature_columns = list(features_names)

if USE_CALENDAR_VALIDATION:
    X_train      = _sales_between('2019-01-01', '2020-01-01')
    X_validation = _sales_between('2020-01-01', '2020-02-01')
    X_test       = _sales_between('2021-01-01', '2022-01-03')

    y_train = X_train['QTT']
    X_train = X_train[feature_columns]

    y_validation = X_validation['QTT']
    X_validation = X_validation[feature_columns]

    y_test = X_test['QTT']
    X_test = X_test[feature_columns]

else:
    X_train = pd.concat([
        _sales_between('2017-01-01', '2019-01-01'),
        _sales_between('2021-01-01', '2021-08-01'),
    ])
    X_test = _sales_between('2021-08-01', '2022-10-03')

    y_train = X_train['QTT']
    X_train = X_train[feature_columns]

    y_test = X_test['QTT']
    X_test = X_test[feature_columns]

    # NOTE(review): a random split mixes validation rows in time with training
    # rows, so the validation score may be optimistic compared with the
    # date-based test score; confirm this is intended.
    X_train, X_validation, y_train, y_validation = train_test_split(
        X_train, y_train, test_size=0.3, random_state=42
    )

In [None]:
# Display the most recent DATE present in the sales data.
sales_dates = df_sales['DATE']
sales_dates.max()

In [None]:
# QTT column of the submission frame as loaded (before it is overwritten with
# model predictions further down).
y_submission = df_submission_sample['QTT']
# Feature matrix for the submission rows. list(...) is required because
# pandas >= 2.0 raises TypeError for a set indexer; it keeps the same column
# order that iterating `features_names` gives everywhere else in this session.
X_submission = df_submission_sample[list(features_names)]

In [None]:
# Histogram of the validation target. Title and axis labels are added so the
# figure can be read on its own; the trailing `;` hides the Axes repr.
ax = y_validation.hist(bins=100)
ax.set(title='Validation target (QTT) distribution', xlabel='QTT', ylabel='rows');


In [None]:
# Histogram of the test target. Title and axis labels are added so the figure
# can be read on its own; the trailing `;` hides the Axes repr.
ax = y_test.hist(bins=100)
ax.set(title='Test target (QTT) distribution', xlabel='QTT', ylabel='rows');


In [None]:
#X_test[y_test.values>25]['S100'].value_counts()

In [None]:
import lightgbm as lgb

# LightGBM training configuration, passed unchanged to lgb.train.
hyper_params = dict(
    # --- runtime / data handling ---
    device='gpu',            # NOTE(review): needs a GPU-enabled LightGBM build
    # NOTE(review): per the LightGBM docs is_unbalance only applies to
    # binary / multiclassova objectives; confirm it is wanted for regression.
    is_unbalance=True,
    is_enable_sparse=True,
    # NOTE(review): `task` looks like a CLI-only setting; confirm it matters here.
    task='train',
    # --- learning task ---
    boosting_type='gbdt',
    objective='regression',
    metric=['rmse'],
    verbose=-1,
    # --- optimisation / sampling ---
    learning_rate=0.1,
    feature_fraction=0.9,
    bagging_fraction=0.7,
    bagging_freq=10,
    # --- tree shape ---
    max_depth=8,
    num_leaves=128,
    max_bin=128,
    # NOTE(review): LightGBM gives this value precedence over any
    # num_boost_round argument passed to lgb.train.
    num_iterations=800,
)

In [None]:
# Wrap the training and validation splits as LightGBM datasets
# (features passed as `data`, target passed as `label`).
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_valid = lgb.Dataset(data=X_validation, label=y_validation)

In [None]:
# Train the booster, evaluating on both the training and the validation set.
#
# * The `early_stopping_rounds=` / `verbose_eval=` keyword arguments were
#   removed from lgb.train in LightGBM 4.0; the equivalent callbacks below
#   behave the same and also work on 3.3.x.
# * The former positional `3000` (num_boost_round) is dropped: LightGBM uses
#   `num_iterations` from hyper_params instead whenever it is present, so the
#   3000 never took effect. The number of rounds is governed by
#   hyper_params['num_iterations'] exactly as before.
gbm = lgb.train(
    hyper_params,
    lgb_train,
    valid_sets=[lgb_train, lgb_valid],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),  # stop after 50 rounds without improvement
        lgb.log_evaluation(period=50),           # print the metrics every 50 rounds
    ],
)

In [None]:
# Score every split with the trained booster, in the order
# train -> validation -> test -> submission.
# No num_iteration is passed, so LightGBM's default iteration selection applies.
# NOTE(review): per the LightGBM docs that default is the best iteration when
# early stopping recorded one; confirm.
X_train_pred, X_validation_pred, X_test_pred, X_submission_pred = (
    gbm.predict(features)
    for features in (X_train, X_validation, X_test, X_submission)
)

In [None]:
from sklearn.metrics import mean_squared_error


def _rmse(y_true, y_pred):
    """Return the root mean squared error between y_true and y_pred.

    Computed as sqrt(MSE) rather than `mean_squared_error(..., squared=False)`:
    the `squared` flag was deprecated in scikit-learn 1.4 and later removed,
    while this form works on every scikit-learn version.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


# Clip negative predictions to zero (the regression output is unbounded).
X_train_pred      = np.maximum(X_train_pred, 0)
X_validation_pred = np.maximum(X_validation_pred, 0)
X_test_pred       = np.maximum(X_test_pred, 0)
X_submission_pred = np.maximum(X_submission_pred, 0)

df_train_rmse = _rmse(y_train, X_train_pred)
df_val_rmse   = _rmse(y_validation, X_validation_pred)
df_test_rmse  = _rmse(y_test, X_test_pred)

# The reported values are RMSE, so the header says so (it previously read
# "mean_squared_error").
print('Final score root_mean_squared_error')
print('Score train:',df_train_rmse)
print('Score val  :',df_val_rmse)
print('Score test :',df_test_rmse)

In [None]:
#X_test_pred[0] = 200

In [None]:
import matplotlib.pyplot as plt

# Overlay the clipped test predictions (blue) on the true test targets (red),
# both plotted against row position.
fig, ax = plt.subplots(figsize=(20, 8))
ax.plot(X_test_pred, 'b', alpha=0.7)
ax.plot(y_test.values, 'r', alpha=0.4)
ax.legend(["prediction", "real"], loc="upper right")

plt.show()

In [None]:
# Renumber the submission rows 0..n-1 and overwrite QTT with the model's
# predictions, then preview the first rows.
df_submission_sample = (
    df_submission_sample
    .reset_index(drop=True)
    .assign(QTT=X_submission_pred)
)
df_submission_sample.head()

In [None]:
# File name embeds the local test RMSE rounded to 4 decimals.
filename_submission = 'submission_baseline01_' + str(np.round(df_test_rmse, 4)) + '_local.csv'
save = True  # set to False to skip writing the CSV
if save:
    submissions_dir = os.path.join(PATH_RESULTS, 'submissions')
    # Create the output folder on first use; to_csv does not create missing
    # directories and would fail on a fresh checkout.
    os.makedirs(submissions_dir, exist_ok=True)
    print('saving..')
    print(filename_submission)
    # Only the ID and predicted QTT columns are written, without the row index.
    df_submission_sample[['ID', 'QTT']].to_csv(
        os.path.join(submissions_dir, filename_submission), index=False
    )

In [None]:
# Load a previously saved submission CSV to compare the current predictions against.
reference_submission_path = os.path.join(
    PATH_RESULTS,
    'submissions',
    'submission_baseline01_2.8266_local_4.19559_kaggle.csv',
)
result = pd.read_csv(reference_submission_path)

In [None]:
# Put the reference submission's QTT next to the current predictions.
# NOTE(review): the assignment aligns on the row index, not on ID, so it assumes
# both frames list the same rows in the same order; confirm.
reference_qtt = result['QTT'].copy()
df_submission_sample['QTT_reference'] = reference_qtt
# Preview the last 20 rows of both columns.
df_submission_sample[['QTT_reference', 'QTT']].tail(20)

In [None]:
# Overlay the current submission (blue) on the reference submission (red).
# The red series is an earlier submission file, not ground truth, so the legend
# now says "reference" (it previously said "real"). Title and axis labels are
# added so the figure can be read on its own.
fig, ax = plt.subplots(figsize=(20, 8))
ax.plot(df_submission_sample['QTT'], 'b', alpha=0.7)
ax.plot(result['QTT'], 'r', alpha=0.4)
ax.legend(["prediction", "reference"], loc="upper right")
ax.set(title='Current vs reference submission', xlabel='submission row', ylabel='QTT')
plt.show()

In [None]:
# RMSE between the current predictions and the reference submission.
# Computed as sqrt(MSE) because `mean_squared_error(..., squared=False)` was
# deprecated in scikit-learn 1.4 and later removed.
# NOTE(review): rows are compared by position, not by ID; confirm both frames
# are in the same order.
difference = np.sqrt(mean_squared_error(df_submission_sample['QTT'], result['QTT']))

# Labels corrected: this is a submission-to-submission difference, not a
# training score, and the value is an RMSE.
print('RMSE between current and reference submission')
print('Difference:',difference)