In [1]:
# ch6/predict_future_sales_baseline.ipynb

import pandas as pd

# Directory that holds the competition CSV files
data_path = '/kaggle/input/competitive-data-science-predict-future-sales/'

# Read every input table in one pass; the order of the file names below
# matches the order of the names on the left-hand side.
sales_train, shops, item_categories, items, test, submission = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ('sales_train.csv',
                     'shops.csv',
                     'item_categories.csv',
                     'items.csv',
                     'test.csv',
                     'sample_submission.csv')
)

In [2]:
def downcast_dtypes(df):
    """Shrink the numeric columns of ``df`` to smaller dtypes, in place.

    Column handling:

    * ``bool`` columns are cast to ``int8``.
    * Integer columns, and other numeric columns whose values are all whole
      numbers, go through ``pd.to_numeric(..., downcast="integer")``.
    * Remaining numeric columns go through
      ``pd.to_numeric(..., downcast="float")``.
    * Non-numeric columns (object, string, category, datetime, ...) are left
      untouched. Only ``object`` columns were skipped before, so any other
      non-numeric dtype fell into the numeric branches.

    The percentage of memory saved is printed once all columns are processed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compress. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        column = df[col]
        dtype_name = column.dtype.name
        if dtype_name == "bool":
            df[col] = column.astype("int8")
        elif not pd.api.types.is_numeric_dtype(column):
            # Nothing to downcast; rounding / to_numeric is only meaningful
            # for numeric data.
            continue
        elif dtype_name.startswith("int") or (column.round() == column).all():
            # Whole-number floats are stored as integers as well.
            df[col] = pd.to_numeric(column, downcast="integer")
        else:
            df[col] = pd.to_numeric(column, downcast="float")
    end_mem = df.memory_usage().sum() / 1024**2
    # Guard the ratio so a frame reporting zero bytes does not divide by zero.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print("{:.1f}% 압축됨".format(saved_pct))

    return df

In [3]:
all_df = [sales_train, shops, items, item_categories, test]
# downcast_dtypes modifies each frame in place, so its return value is not needed here.
for frame in all_df:
    downcast_dtypes(frame)

62.5% 압축됨
38.6% 압축됨
54.2% 압축됨
39.9% 압축됨
70.8% 압축됨


In [4]:
import numpy as np 
from itertools import product
import time

cols = ['date_block_num', 'shop_id', 'item_id']

# For every month, pair each shop that appears in that month with each item
# that appears in that month.
combinations = []
for month in range(34):
    in_month = sales_train['date_block_num'] == month
    shop_ids = sales_train.loc[in_month, 'shop_id'].unique()
    item_ids = sales_train.loc[in_month, 'item_id'].unique()
    combinations.append(list(product([month], shop_ids, item_ids)))

# Stack the per-month grids into one frame with columns (month, shop, item).
all_data = pd.DataFrame(np.vstack(combinations), columns=cols)
all_data

Unnamed: 0,date_block_num,shop_id,item_id
0,0,59,22154
1,0,59,2552
2,0,59,2554
3,0,59,2555
4,0,59,2564
...,...,...,...
10913845,33,21,7635
10913846,33,21,7638
10913847,33,21,7640
10913848,33,21,7632


In [5]:
# The 'date' column is not used past this point.
sales_train = sales_train.drop(columns='date')

# Sum item_cnt_day within each (month, shop, item) key.
group = (
    sales_train
    .groupby(['date_block_num', 'shop_id', 'item_id'])[['item_cnt_day']]
    .sum()
)
group

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,item_cnt_day
date_block_num,shop_id,item_id,Unnamed: 3_level_1
0,0,32,6
0,0,33,3
0,0,35,1
0,0,43,1
0,0,51,2
...,...,...,...
33,59,22087,6
33,59,22088,2
33,59,22091,1
33,59,22100,1


In [6]:
# Relabel the single value column, then turn the group keys back into columns.
group = group.set_axis(["item_cnt_month"], axis=1).reset_index()
group

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month
0,0,0,32,6
1,0,0,33,3
2,0,0,35,1
3,0,0,43,1
4,0,0,51,2
...,...,...,...,...
1609119,33,59,22087,6
1609120,33,59,22088,2
1609121,33,59,22091,1
1609122,33,59,22100,1


In [7]:
# Attach the monthly counts to the grid; combinations with no matching row in
# `group` come back as NaN from the left join and are set to 0.
all_data = (
    all_data
    .merge(group, how='left', on=cols)
    .fillna({'item_cnt_month': 0})
)
all_data

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month
0,0,59,22154,1.0
1,0,59,2552,0.0
2,0,59,2554,0.0
3,0,59,2555,0.0
4,0,59,2564,0.0
...,...,...,...,...
10913845,33,21,7635,0.0
10913846,33,21,7638,0.0
10913847,33,21,7640,0.0
10913848,33,21,7632,0.0


In [8]:
# The test rows are labelled as month 34, one past the last month (33)
# generated for the training grid.
test['date_block_num'] = 34

# Append the test rows (without their ID column) below the training grid.
# The former `keys=cols` argument is dropped: with ignore_index=True the result
# gets a fresh 0..n-1 index, so the keys had no effect, and its three entries
# did not match the two frames being concatenated.
all_data = pd.concat(
    [all_data, test.drop(columns=["ID"])],
    ignore_index=True,
    sort=False,
)
# test has no item_cnt_month column, so its rows are NaN there after the concat.
all_data = all_data.fillna(0)
all_data

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month
0,0,59,22154,1.0
1,0,59,2552,0.0
2,0,59,2554,0.0
3,0,59,2555,0.0
4,0,59,2564,0.0
...,...,...,...,...
11128045,34,45,18454,0.0
11128046,34,45,16188,0.0
11128047,34,45,15757,0.0
11128048,34,45,19648,0.0


In [9]:
# Join the shop, item and item-category lookup tables onto every row.
# items must be merged before item_categories because it supplies item_category_id.
all_data = (
    all_data
    .merge(shops, how='left', on='shop_id')
    .merge(items, how='left', on='item_id')
    .merge(item_categories, how='left', on='item_category_id')
)
all_data

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,shop_name,item_name,item_category_id,item_category_name
0,0,59,22154,1.0,"Ярославль ТЦ ""Альтаир""",ЯВЛЕНИЕ 2012 (BD),37,Кино - Blu-Ray
1,0,59,2552,0.0,"Ярославль ТЦ ""Альтаир""",DEEP PURPLE The House Of Blue Light LP,58,Музыка - Винил
2,0,59,2554,0.0,"Ярославль ТЦ ""Альтаир""",DEEP PURPLE Who Do You Think We Are LP,58,Музыка - Винил
3,0,59,2555,0.0,"Ярославль ТЦ ""Альтаир""",DEEP PURPLE 30 Very Best Of 2CD (Фирм.),56,Музыка - CD фирменного производства
4,0,59,2564,0.0,"Ярославль ТЦ ""Альтаир""",DEEP PURPLE Perihelion: Live In Concert DVD (К...,59,Музыка - Музыкальное видео
...,...,...,...,...,...,...,...,...
11128045,34,45,18454,0.0,"Самара ТЦ ""ПаркХаус""",СБ. Союз 55,55,Музыка - CD локального производства
11128046,34,45,16188,0.0,"Самара ТЦ ""ПаркХаус""",Настольная игра Нано Кёрлинг,64,Подарки - Настольные игры
11128047,34,45,15757,0.0,"Самара ТЦ ""ПаркХаус""",НОВИКОВ АЛЕКСАНДР Новая коллекция,55,Музыка - CD локального производства
11128048,34,45,19648,0.0,"Самара ТЦ ""ПаркХаус""",ТЕРЕМ - ТЕРЕМОК сб.м/ф (Регион),40,Кино - DVD


In [10]:
from sklearn.preprocessing import LabelEncoder

# Replace each text column with integer codes; a fresh encoder is fitted per column.
for col in ('shop_name', 'item_name', 'item_category_name'):
    encoder = LabelEncoder()
    all_data[col] = encoder.fit_transform(all_data[col])
all_data

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,shop_name,item_name,item_category_id,item_category_name
0,0,59,22154,1.0,59,22154,37,37
1,0,59,2552,0.0,59,2552,58,58
2,0,59,2554,0.0,59,2554,58,58
3,0,59,2555,0.0,59,2555,56,56
4,0,59,2564,0.0,59,2564,59,59
...,...,...,...,...,...,...,...,...
11128045,34,45,18454,0.0,45,18454,55,55
11128046,34,45,16188,0.0,45,16188,64,64
11128047,34,45,15757,0.0,45,15757,55,55
11128048,34,45,19648,0.0,45,19648,40,40


In [11]:
# Downcast again: the merges and label encoding above added new columns to all_data.
all_data = downcast_dtypes(all_data)

70.8% 압축됨


In [12]:
# Training data: months before 33
X_train = all_data.loc[all_data['date_block_num'] < 33].drop(columns=['item_cnt_month'])
y_train = all_data.loc[all_data['date_block_num'] < 33, 'item_cnt_month']
# Validation data: month 33
X_valid = all_data.loc[all_data['date_block_num'] == 33].drop(columns=['item_cnt_month'])
y_valid = all_data.loc[all_data['date_block_num'] == 33, 'item_cnt_month']
# Test data: month 34 (features only)
X_test = all_data.loc[all_data['date_block_num'] == 34].drop(columns=['item_cnt_month'])

In [13]:
import gc

# all_data has already been split into the X_*/y_* objects above, so drop the
# name and run a garbage-collection pass.
del all_data
gc.collect()

60

In [14]:
# Shapes of the train / validation / test splits, in that order.
tuple(part.shape for part in (X_train, y_train, X_valid, y_valid, X_test))

((10675678, 7), (10675678,), (238172, 7), (238172,), (214200, 7))

In [15]:
# Limit the targets to the range [0, 20]; predictions are clipped to the same
# range when the submission is built.
y_train = y_train.clip(lower=0, upper=20)
y_valid = y_valid.clip(lower=0, upper=20)

In [16]:
import lightgbm as lgb

# Columns LightGBM should treat as categorical rather than numeric.
cat_features = ['shop_id', 'item_id', 'shop_name', 'item_name', 'item_category_id','item_category_name']

# Training parameters: RMSE metric, fixed seed.
params = {
    'metric': 'rmse',
    'learning_rate': 0.05,
    'force_col_wise': True,
    'random_state': 42,
}

dtrain = lgb.Dataset(data=X_train, label=y_train)
dvalid = lgb.Dataset(data=X_valid, label=y_valid)

# Train for up to 500 rounds, evaluating on both the training and validation
# sets and logging every 50 rounds.
# NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments of
# lgb.train in the LightGBM release this notebook was run with; confirm they
# are still accepted by the installed version before re-running.
lgb_model = lgb.train(
    params=params,
    train_set=dtrain,
    num_boost_round=500,
    valid_sets=(dtrain, dvalid),
    categorical_feature=cat_features,
    early_stopping_rounds=50,
    verbose_eval=50,
)

New categorical_feature is ['item_category_id', 'item_category_name', 'item_id', 'item_name', 'shop_id', 'shop_name']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[LightGBM] [Info] Total Bins 32873
[LightGBM] [Info] Number of data points in the train set: 10675678, number of used features: 7
[LightGBM] [Info] Start training from score 0.299125




Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.99762	valid_1's rmse: 1.08042
Early stopping, best iteration is:
[19]	training's rmse: 1.07371	valid_1's rmse: 1.07601


In [17]:
# Predict the test month, clip to the same [0, 20] range used for the targets,
# and write the submission file.
submission['item_cnt_month'] = np.clip(lgb_model.predict(X_test), 0, 20)
submission.to_csv('baseline_submission.csv', index=False)

In [18]:
# Drop the split frames, the LightGBM datasets, the submission frame and the
# trained model, then run a garbage-collection pass.
del X_train, y_train, X_valid, y_valid, X_test, dtrain, dvalid, submission, lgb_model
gc.collect()

119