## Проект: прогнозирование количества продаж мерча каггла, в кагглском магазине, стране и в дате

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

In [3]:
train = pd.read_csv('Downloads/tabular-playground-series-jan-2022/train.csv')

In [4]:
# Посмотрим на то, как выглядит наш временной ряд
train

Unnamed: 0,row_id,date,country,store,product,num_sold
0,0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329
1,1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520
2,2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146
3,3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572
4,4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911
...,...,...,...,...,...,...
26293,26293,2018-12-31,Sweden,KaggleMart,Kaggle Hat,823
26294,26294,2018-12-31,Sweden,KaggleMart,Kaggle Sticker,250
26295,26295,2018-12-31,Sweden,KaggleRama,Kaggle Mug,1004
26296,26296,2018-12-31,Sweden,KaggleRama,Kaggle Hat,1441


In [5]:
valid = pd.read_csv('Downloads/tabular-playground-series-jan-2022/test.csv')

In [7]:
valid

Unnamed: 0,row_id,date,country,store,product
0,26298,2019-01-01,Finland,KaggleMart,Kaggle Mug
1,26299,2019-01-01,Finland,KaggleMart,Kaggle Hat
2,26300,2019-01-01,Finland,KaggleMart,Kaggle Sticker
3,26301,2019-01-01,Finland,KaggleRama,Kaggle Mug
4,26302,2019-01-01,Finland,KaggleRama,Kaggle Hat
...,...,...,...,...,...
6565,32863,2019-12-31,Sweden,KaggleMart,Kaggle Hat
6566,32864,2019-12-31,Sweden,KaggleMart,Kaggle Sticker
6567,32865,2019-12-31,Sweden,KaggleRama,Kaggle Mug
6568,32866,2019-12-31,Sweden,KaggleRama,Kaggle Hat


In [8]:
# Посмотрим какие страны содержит датасет
train.country.unique()

array(['Finland', 'Norway', 'Sweden'], dtype=object)

In [9]:
# Посмотрим магазины каггла
train.store.unique()

array(['KaggleMart', 'KaggleRama'], dtype=object)

In [10]:
# Какие мерчы были проданы у каггла
train['product'].unique()

array(['Kaggle Mug', 'Kaggle Hat', 'Kaggle Sticker'], dtype=object)

In [11]:
# Размернось train и valid
train.shape, valid.shape

((26298, 6), (6570, 5))

In [12]:
# Переводим дата и object в datetime
train.date=pd.to_datetime(train.date)
valid.date=pd.to_datetime(valid.date)

### 

#### Проведем небольшой EDA

In [13]:
train[train.country == 'Finland']

Unnamed: 0,row_id,date,country,store,product,num_sold
0,0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329
1,1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520
2,2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146
3,3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572
4,4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911
...,...,...,...,...,...,...
26281,26281,2018-12-31,Finland,KaggleMart,Kaggle Hat,822
26282,26282,2018-12-31,Finland,KaggleMart,Kaggle Sticker,238
26283,26283,2018-12-31,Finland,KaggleRama,Kaggle Mug,831
26284,26284,2018-12-31,Finland,KaggleRama,Kaggle Hat,1231


In [14]:
train[(train.date.dt.month == 2) & (train.date.dt.day == 29)].index

Int64Index([7632, 7633, 7634, 7635, 7636, 7637, 7638, 7639, 7640, 7641, 7642,
            7643, 7644, 7645, 7646, 7647, 7648, 7649],
           dtype='int64')

In [15]:
train[(train.date.dt.month == 2) & (train.date.dt.day == 29)]

Unnamed: 0,row_id,date,country,store,product,num_sold
7632,7632,2016-02-29,Finland,KaggleMart,Kaggle Mug,178
7633,7633,2016-02-29,Finland,KaggleMart,Kaggle Hat,345
7634,7634,2016-02-29,Finland,KaggleMart,Kaggle Sticker,85
7635,7635,2016-02-29,Finland,KaggleRama,Kaggle Mug,297
7636,7636,2016-02-29,Finland,KaggleRama,Kaggle Hat,650
7637,7637,2016-02-29,Finland,KaggleRama,Kaggle Sticker,152
7638,7638,2016-02-29,Norway,KaggleMart,Kaggle Mug,315
7639,7639,2016-02-29,Norway,KaggleMart,Kaggle Hat,554
7640,7640,2016-02-29,Norway,KaggleMart,Kaggle Sticker,130
7641,7641,2016-02-29,Norway,KaggleRama,Kaggle Mug,524


#### Данная функция делит дату на 6 фичей: год, месяц и т.д.

In [16]:
def preprop(train):
    train['Year'] = train.date.dt.year
    train['Month'] = train.date.dt.month
    train['Day'] = train.date.dt.day
    train['day_of_year'] = train.date.dt.day_of_year
    train['day_of_week'] = train.date.dt.day_of_week
    train['day_of_month'] = train.date.dt.days_in_month
    return train

In [17]:
train.country.unique()

array(['Finland', 'Norway', 'Sweden'], dtype=object)

### Построим модель

In [18]:
# Предобработаем данные
train = preprop(train)
valid = preprop(valid)

In [553]:
train['product'].unique()

array(['Kaggle Mug', 'Kaggle Hat', 'Kaggle Sticker'], dtype=object)

In [554]:
# Преобразуем все категориальные признаки в числовые
train = pd.get_dummies(train)

In [555]:
valid = pd.get_dummies(valid)

In [556]:
train.drop({'store_KaggleRama'}, axis = 1, inplace = True)
valid.drop({'store_KaggleRama'}, axis = 1, inplace = True)

In [557]:
train

Unnamed: 0,row_id,date,num_sold,Year,Month,Day,day_of_year,day_of_week,day_of_month,country_Finland,country_Norway,country_Sweden,store_KaggleMart,product_Kaggle Hat,product_Kaggle Mug,product_Kaggle Sticker
0,0,2015-01-01,329,2015,1,1,1,3,31,1,0,0,1,0,1,0
1,1,2015-01-01,520,2015,1,1,1,3,31,1,0,0,1,1,0,0
2,2,2015-01-01,146,2015,1,1,1,3,31,1,0,0,1,0,0,1
3,3,2015-01-01,572,2015,1,1,1,3,31,1,0,0,0,0,1,0
4,4,2015-01-01,911,2015,1,1,1,3,31,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26293,26293,2018-12-31,823,2018,12,31,365,0,31,0,0,1,1,1,0,0
26294,26294,2018-12-31,250,2018,12,31,365,0,31,0,0,1,1,0,0,1
26295,26295,2018-12-31,1004,2018,12,31,365,0,31,0,0,1,0,0,1,0
26296,26296,2018-12-31,1441,2018,12,31,365,0,31,0,0,1,0,1,0,0


In [559]:
# Отсортируем по дате, чтоб потом разделить на train и тест
train = train.sort_values(by = ['date'])

In [560]:
train

Unnamed: 0,row_id,date,num_sold,Year,Month,Day,day_of_year,day_of_week,day_of_month,country_Finland,country_Norway,country_Sweden,store_KaggleMart,product_Kaggle Hat,product_Kaggle Mug,product_Kaggle Sticker
0,0,2015-01-01,329,2015,1,1,1,3,31,1,0,0,1,0,1,0
17,17,2015-01-01,324,2015,1,1,1,3,31,0,0,1,0,0,0,1
16,16,2015-01-01,1195,2015,1,1,1,3,31,0,0,1,0,1,0,0
15,15,2015-01-01,706,2015,1,1,1,3,31,0,0,1,0,0,1,0
14,14,2015-01-01,175,2015,1,1,1,3,31,0,0,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26281,26281,2018-12-31,822,2018,12,31,365,0,31,1,0,0,1,1,0,0
26280,26280,2018-12-31,469,2018,12,31,365,0,31,1,0,0,1,0,1,0
26296,26296,2018-12-31,1441,2018,12,31,365,0,31,0,0,1,0,1,0,0
26287,26287,2018-12-31,1124,2018,12,31,365,0,31,0,1,0,1,1,0,0


In [561]:
# 70 проц. на train и 30 процентов на тест
# Так как у нас временные ряды, делим на train и test именно таким образом. (Так мы можем обучим модель на предыдущих датах, чтоб он научился строить прогноз на будущих датах)
X_test = train[-int(train.shape[0] * 0.3):]
X_train = train[:-int(train.shape[0] * 0.3)]

In [562]:
X_test

Unnamed: 0,row_id,date,num_sold,Year,Month,Day,day_of_year,day_of_week,day_of_month,country_Finland,country_Norway,country_Sweden,store_KaggleMart,product_Kaggle Hat,product_Kaggle Mug,product_Kaggle Sticker
18397,18397,2017-10-19,273,2017,10,19,292,3,31,1,0,0,1,1,0,0
18398,18398,2017-10-19,90,2017,10,19,292,3,31,1,0,0,1,0,0,1
18399,18399,2017-10-19,347,2017,10,19,292,3,31,1,0,0,0,0,1,0
18400,18400,2017-10-19,444,2017,10,19,292,3,31,1,0,0,0,1,0,0
18401,18401,2017-10-19,148,2017,10,19,292,3,31,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26281,26281,2018-12-31,822,2018,12,31,365,0,31,1,0,0,1,1,0,0
26280,26280,2018-12-31,469,2018,12,31,365,0,31,1,0,0,1,0,1,0
26296,26296,2018-12-31,1441,2018,12,31,365,0,31,0,0,1,0,1,0,0
26287,26287,2018-12-31,1124,2018,12,31,365,0,31,0,1,0,1,1,0,0


In [563]:
y_test = X_test.num_sold
y_train = X_train.num_sold

In [564]:
X_train

Unnamed: 0,row_id,date,num_sold,Year,Month,Day,day_of_year,day_of_week,day_of_month,country_Finland,country_Norway,country_Sweden,store_KaggleMart,product_Kaggle Hat,product_Kaggle Mug,product_Kaggle Sticker
0,0,2015-01-01,329,2015,1,1,1,3,31,1,0,0,1,0,1,0
17,17,2015-01-01,324,2015,1,1,1,3,31,0,0,1,0,0,0,1
16,16,2015-01-01,1195,2015,1,1,1,3,31,0,0,1,0,1,0,0
15,15,2015-01-01,706,2015,1,1,1,3,31,0,0,1,0,0,1,0
14,14,2015-01-01,175,2015,1,1,1,3,31,0,0,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18413,18413,2017-10-19,190,2017,10,19,292,3,31,0,0,1,0,0,0,1
18404,18404,2017-10-19,168,2017,10,19,292,3,31,0,1,0,1,0,0,1
18402,18402,2017-10-19,295,2017,10,19,292,3,31,0,1,0,1,0,1,0
18403,18403,2017-10-19,428,2017,10,19,292,3,31,0,1,0,1,1,0,0


In [565]:
X_train.drop({"row_id", "date", "num_sold"}, axis = 1, inplace=True)
X_test.drop({"row_id", "date", "num_sold"}, axis = 1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [566]:
valid.drop({"row_id", "date"}, axis = 1, inplace=True)

In [567]:
valid

Unnamed: 0,Year,Month,Day,day_of_year,day_of_week,day_of_month,country_Finland,country_Norway,country_Sweden,store_KaggleMart,product_Kaggle Hat,product_Kaggle Mug,product_Kaggle Sticker
0,2019,1,1,1,1,31,1,0,0,1,0,1,0
1,2019,1,1,1,1,31,1,0,0,1,1,0,0
2,2019,1,1,1,1,31,1,0,0,1,0,0,1
3,2019,1,1,1,1,31,1,0,0,0,0,1,0
4,2019,1,1,1,1,31,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6565,2019,12,31,365,1,31,0,0,1,1,1,0,0
6566,2019,12,31,365,1,31,0,0,1,1,0,0,1
6567,2019,12,31,365,1,31,0,0,1,0,0,1,0
6568,2019,12,31,365,1,31,0,0,1,0,1,0,0


In [568]:
# Используем SMAPE в качестве метрики
def SMAPE(y_true, y_pred):
    denominator = (y_true + np.abs(y_pred)) / 200.0
    diff = np.abs(y_true - y_pred) / denominator
    diff[denominator == 0] = 0.0
    return np.round(np.mean(diff),5)

In [569]:
X_train

Unnamed: 0,Year,Month,Day,day_of_year,day_of_week,day_of_month,country_Finland,country_Norway,country_Sweden,store_KaggleMart,product_Kaggle Hat,product_Kaggle Mug,product_Kaggle Sticker
0,2015,1,1,1,3,31,1,0,0,1,0,1,0
17,2015,1,1,1,3,31,0,0,1,0,0,0,1
16,2015,1,1,1,3,31,0,0,1,0,1,0,0
15,2015,1,1,1,3,31,0,0,1,0,0,1,0
14,2015,1,1,1,3,31,0,0,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18413,2017,10,19,292,3,31,0,0,1,0,0,0,1
18404,2017,10,19,292,3,31,0,1,0,1,0,0,1
18402,2017,10,19,292,3,31,0,1,0,1,0,1,0
18403,2017,10,19,292,3,31,0,1,0,1,1,0,0


In [581]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-3.3.2-py3-none-win_amd64.whl (1.0 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-3.3.2


In [583]:
# Попробуем обучить на нескольких моделях, чтобы в конце выбрать наилучшую
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import xgboost as xgb
import lightgbm as lgb

In [584]:
model = xgb.XGBRegressor()
model2 = lgb.LGBMRegressor()
gb = GradientBoostingRegressor(max_depth=5, learning_rate=0.9)
rf = RandomForestRegressor(max_depth=5)
lr = LinearRegression()

In [597]:
stack_regressor = StackingRegressor(estimators= [('rf', rf), ('gb', gb), ('xgb', model)], final_estimator= lgb.LGBMRegressor())

In [598]:
stack_regressor.fit(X_train, y_train)

StackingRegressor(estimators=[('rf', RandomForestRegressor(max_depth=5)),
                              ('gb',
                               GradientBoostingRegressor(learning_rate=0.9,
                                                         max_depth=5)),
                              ('xgb',
                               XGBRegressor(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None, gamma=None,
                                            gpu...
                                            interaction_constraints=None,
                         

In [599]:
SMAPE(y_test, stack_regressor.predict(X_test))

8.81312

In [589]:
SMAPE(y_test, stack_regressor.predict(X_test))

8.89791

In [593]:
stack_regressor.fit(train.drop({'date', 'num_sold', 'row_id'}, axis = 1), train.num_sold)

StackingRegressor(estimators=[('rf', RandomForestRegressor(max_depth=5)),
                              ('gb',
                               GradientBoostingRegressor(learning_rate=0.9,
                                                         max_depth=5)),
                              ('xgb',
                               XGBRegressor(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None, gamma=None,
                                            gpu...
                                            interaction_constraints=None,
                         

In [600]:
y_valid = stack_regressor.predict(valid)

In [601]:
ans = pd.read_csv("sample_submission (1).csv")
ans.num_sold = y_valid

In [602]:
ans.to_csv("sample_submission.csv", index = False)

#### Попробуем теперь через train_test_split, проверим как она себя покажет

In [329]:
X_train, X_test, y_train, y_test = train_test_split(train.drop({'date', 'num_sold', 'row_id'}, axis = 1), train.num_sold, test_size = 0.25, random_state = 17)

In [330]:
stack_regressor.fit(X_train, y_train)

StackingRegressor(estimators=[('rf', RandomForestRegressor(max_depth=5)),
                              ('gb',
                               GradientBoostingRegressor(learning_rate=0.9,
                                                         max_depth=5))])

In [331]:
SMAPE(y_test, stack_regressor.predict(X_test))

6.00059

#### Здесь можно увидеть, что train_test_split для таких обработалась лучше

In [333]:
# Такс, вот этим методом все плохо
y_valid = stack_regressor.predict(valid)
ans = pd.read_csv("sample_submission (1).csv")
ans.num_sold = y_valid
ans.to_csv("sample_submission.csv", index = False)