In [1]:
import os
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import gc
import lightgbm as lgb
import joblib
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

In [2]:
calendar = pd.read_csv("data/calendar.csv")
train_eva = pd.read_csv("data/sales_train_evaluation.csv")
sell_prices = pd.read_csv("data/sell_prices.csv")
sample_sub = pd.read_csv("data/sample_submission.csv")

In [5]:
years = range(1942, 1970)
train_eva_cols = ['d_' + str(d) for d in years]
train_eva[train_eva_cols] = np.zeros((len(train_eva), len(train_eva_cols)), dtype=np.int16)

In [6]:
def downcast(df):
    df = df.copy()
    dtypes = df.dtypes

    for col in df.columns:
        if 'int' in str(dtypes[col]):
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif 'float' in str(dtypes[col]):
            df[col] = pd.to_numeric(df[col], downcast='float')
        elif dtypes[col] == np.object:
            if col == 'date':
                df[col] = pd.to_datetime(df[col], format='%Y-%m-%d')
            else:
                df[col] = df[col].astype('category')

    return df

In [15]:
train_eva.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30490 entries, 0 to 30489
Columns: 1975 entries, id to d_1969
dtypes: int64(1969), object(6)
memory usage: 459.4+ MB


In [7]:
print("Downcasting data")
train_eva = downcast(train_eva)
sell_prices = downcast(sell_prices)
calendar = downcast(calendar)

Downcasting data
CPU times: total: 3min 33s
Wall time: 3min 37s


In [17]:
train_eva.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30490 entries, 0 to 30489
Columns: 1975 entries, id to d_1969
dtypes: category(6), int16(1299), int8(670)
memory usage: 96.6 MB


In [8]:
print("Melting data")
df = pd.melt(frame=train_eva, 
             id_vars=["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"],
             var_name="d", value_name="sold")

Melting data


In [9]:
print("Merging data")
df = pd.merge(left=df, right=calendar, how="left", on="d")
df = pd.merge(left=df, right=sell_prices, on=["store_id", "item_id", "wm_yr_wk"], how="left")

Merging data
CPU times: total: 51.7 s
Wall time: 54.6 s


In [10]:
df["snap"] = (df["snap_CA"] + df["snap_TX"] + df["snap_WI"]).apply(lambda x: 1 if x >= 1 else 0).astype(np.int8)

df["d"] = df["d"].str[2:].astype(np.int16)

df["sell_price"] = df.groupby('id')['sell_price'].transform(lambda x: x.fillna(x.median()))

df["weekend"] = np.where(df["wday"] < 3, 1, 0).astype(np.int8)
df = df.drop(["date", "weekday", "wm_yr_wk", "event_name_2", "event_type_2", "snap_CA", "snap_TX", "snap_WI"], axis=1)

In [11]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

d_id = dict(zip(df["id"].cat.codes, df["id"]))

df["id"] = label_encoder.fit_transform(df["id"])
df["item_id"] = label_encoder.fit_transform(df["item_id"])
df["dept_id"] = label_encoder.fit_transform(df["dept_id"])
df["cat_id"] = label_encoder.fit_transform(df["cat_id"])
df["store_id"] = label_encoder.fit_transform(df["store_id"])
df["state_id"] = label_encoder.fit_transform(df["state_id"])
df["event_name_1"] = label_encoder.fit_transform(df["event_name_1"])
df["event_type_1"] = label_encoder.fit_transform(df["event_type_1"])

In [13]:
d_id

{14370: 'HOBBIES_1_001_CA_1_evaluation',
 14380: 'HOBBIES_1_002_CA_1_evaluation',
 14390: 'HOBBIES_1_003_CA_1_evaluation',
 14400: 'HOBBIES_1_004_CA_1_evaluation',
 14410: 'HOBBIES_1_005_CA_1_evaluation',
 14420: 'HOBBIES_1_006_CA_1_evaluation',
 14430: 'HOBBIES_1_007_CA_1_evaluation',
 14440: 'HOBBIES_1_008_CA_1_evaluation',
 14450: 'HOBBIES_1_009_CA_1_evaluation',
 14460: 'HOBBIES_1_010_CA_1_evaluation',
 14470: 'HOBBIES_1_011_CA_1_evaluation',
 14480: 'HOBBIES_1_012_CA_1_evaluation',
 14490: 'HOBBIES_1_013_CA_1_evaluation',
 14500: 'HOBBIES_1_014_CA_1_evaluation',
 14510: 'HOBBIES_1_015_CA_1_evaluation',
 14520: 'HOBBIES_1_016_CA_1_evaluation',
 14530: 'HOBBIES_1_017_CA_1_evaluation',
 14540: 'HOBBIES_1_018_CA_1_evaluation',
 14550: 'HOBBIES_1_019_CA_1_evaluation',
 14560: 'HOBBIES_1_020_CA_1_evaluation',
 14570: 'HOBBIES_1_021_CA_1_evaluation',
 14580: 'HOBBIES_1_022_CA_1_evaluation',
 14590: 'HOBBIES_1_023_CA_1_evaluation',
 14600: 'HOBBIES_1_024_CA_1_evaluation',
 14610: 'HOBBIES

In [12]:
df

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sold,wday,month,year,event_name_1,event_type_1,sell_price,snap,weekend
0,14370,1437,3,1,0,0,1,0,1,1,2011,30,4,8.26,0,1
1,14380,1438,3,1,0,0,1,0,1,1,2011,30,4,3.97,0,1
2,14390,1439,3,1,0,0,1,0,1,1,2011,30,4,2.97,0,1
3,14400,1440,3,1,0,0,1,0,1,1,2011,30,4,4.64,0,1
4,14410,1441,3,1,0,0,1,0,1,1,2011,30,4,2.98,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60034805,14329,1432,2,0,9,2,1969,0,2,6,2016,16,3,2.98,0,1
60034806,14339,1433,2,0,9,2,1969,0,2,6,2016,16,3,2.48,0,1
60034807,14349,1434,2,0,9,2,1969,0,2,6,2016,16,3,3.98,0,1
60034808,14359,1435,2,0,9,2,1969,0,2,6,2016,16,3,1.28,0,1


In [14]:
print("Mean Encoding")
df["state_mean"] = df.groupby("state_id")["sold"].transform("mean").astype(np.float16)
df["store_mean"] = df.groupby("store_id")["sold"].transform("mean").astype(np.float16)
df["cat_mean"] = df.groupby("cat_id")["sold"].transform("mean").astype(np.float16)
df["dept_mean"] = df.groupby("dept_id")["sold"].transform("mean").astype(np.float16)
df["state_cat_mean"] = df.groupby(["state_id", "cat_id"])["sold"].transform("mean").astype(np.float16)
df["state_dept_mean"] = df.groupby(["state_id", "dept_id"])["sold"].transform("mean").astype(np.float16)
df["store_cat_mean"] = df.groupby(["store_id", "cat_id"])["sold"].transform("mean").astype(np.float16)
df["store_cat_mean"] = df.groupby(["dept_id", "cat_id"])["sold"].transform("mean").astype(np.float16)
df["item_id_mean"] = df.groupby("item_id")["sold"].transform("mean").astype(np.float16)
df["item_state_mean"] = df.groupby(["item_id", "state_id"])["sold"].transform("mean").astype(np.float16)
df["item_store_mean"] = df.groupby(["item_id", "store_id"])["sold"].transform("mean").astype(np.float16)

Mean Encoding
CPU times: total: 46.4 s
Wall time: 46.9 s


In [15]:
lags = [29, 30, 31, 32, 33, 34, 35, 40, 55, 60, 65, 180]
for lag in lags:
    df['sold_lag_' + str(lag)] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])['sold'].shift(lag).astype(np.float16)
    
df = df[df['d']>max(lags)]

In [17]:
df.to_pickle('data.pkl')
del df, calendar, sell_prices, train_eva
gc.collect()

48

In [18]:
# Build model GBM
def lightgbm_model(X_train, y_train, X_valid, y_valid):
    model = LGBMRegressor(
        n_estimators=1000,
        learning_rate=0.3,
        subsample=0.8,
        colsample_bytree=0.8,
        max_depth=8,
        num_leaves=224,
        min_child_weight=300
    )
        
    model.fit(X_train, y_train, 
          eval_set=[(X_train,y_train),(X_valid,y_valid)], 
          verbose=20, 
          early_stopping_rounds=20,
          eval_metric='rmse')
    
    return model

def xgboost_model(X_train, y_train, X_valid, y_valid):
    model = XGBRegressor(
        n_estimators=1000,
        learning_rate=0.3,
        subsample=0.8,
        colsample_bytree=0.8,
        max_depth=8,
        min_child_weight=300
    )

    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              verbose=20,
              early_stopping_rounds=20,
              eval_metric='rmse')

    return model

def catboost_model(X_train, y_train, X_valid, y_valid):
    
    model = CatBoostRegressor(
        iterations=1000,
        learning_rate=0.3,
        subsample=0.8,
        colsample_bylevel=0.8,
        depth=8,
        l2_leaf_reg=224,
        min_child_samples=300,
        eval_metric='RMSE'
    )
    
    model.fit(X_train, y_train, eval_set=[(X_train,y_train),(X_valid,y_valid)],
              verbose=20, early_stopping_rounds=20)
    
    return model

In [19]:
data = pd.read_pickle('data.pkl')
valid = data[(data['d']>=1914) & (data['d']<1942)][['id','d','sold']]
test = data[data['d']>=1942][['id','d','sold']]

CPU times: total: 6.39 s
Wall time: 8.84 s


In [20]:
for i in range(10):
    df = data[data["store_id"] == i]    
    #Create train set
    X_train, y_train = df[df['d']<1914].drop('sold',axis=1), df[df['d']<1914]['sold']
    X_valid, y_valid = df[(df['d']>=1914) & (df['d']<1942)].drop('sold',axis=1), df[(df['d']>=1914) & (df['d']<1942)]['sold']
    X_test = df[df["d"] >= 1942].drop("sold", axis=1)
       
    # Create model
    print(f"Train model for store {i}")
    print("--------")
    
    model = lightgbm_model(X_train, y_train, X_valid, y_valid)
    #model = xgboost_model(X_train, y_train, X_valid, y_valid)
    #model = catboost_model(X_train, y_train, X_valid, y_valid)
    
    print("--------")
    print(f"Predicting for store {i}")    
    # Validation predict
    pred_val = model.predict(X_valid)
    valid.loc[X_valid.index, "sold"] = pred_val
    pred_eva = model.predict(X_test)
    test.loc[X_test.index, "sold"] = pred_eva   
    print("--------")
    
    print("Saving model and clear memories")  
    print("--------")
    filename = f'model_store_{i}.pkl'
    # Save model and Clear memmory
    joblib.dump(model, filename)
    del model, X_train, y_train, X_valid, y_valid, X_test
    gc.collect()

Train model for store 0
--------
[20]	training's rmse: 2.5037	training's l2: 6.2685	valid_1's rmse: 2.28464	valid_1's l2: 5.21958
[40]	training's rmse: 2.47832	training's l2: 6.14207	valid_1's rmse: 2.28023	valid_1's l2: 5.19947
[60]	training's rmse: 2.45256	training's l2: 6.01503	valid_1's rmse: 2.27801	valid_1's l2: 5.18935
[80]	training's rmse: 2.42868	training's l2: 5.89847	valid_1's rmse: 2.27739	valid_1's l2: 5.18649
--------
Predicting for store 0
--------
Saving model and clear memories
--------
Train model for store 1
--------
[20]	training's rmse: 1.88205	training's l2: 3.54211	valid_1's rmse: 2.09767	valid_1's l2: 4.40021
[40]	training's rmse: 1.86348	training's l2: 3.47257	valid_1's rmse: 2.09004	valid_1's l2: 4.36828
[60]	training's rmse: 1.8472	training's l2: 3.41215	valid_1's rmse: 2.08884	valid_1's l2: 4.36327
--------
Predicting for store 1
--------
Saving model and clear memories
--------
Train model for store 2
--------
[20]	training's rmse: 3.5749	training's l2: 12.

In [21]:
sample_sub = sample_sub[["id"]]
f_col = [f"F{i}" for i in range(1,29)]
f_col.insert(0, "id")

valid["id"] = valid["id"].map(d_id)
valid = valid.pivot(index="id", columns="d", values="sold").reset_index()
valid["id"] = valid["id"].str.replace("evaluation", "validation")
out_val = pd.merge(left=sample_sub[:30490], right=valid, on="id")
out_val.columns=f_col

test["id"] = test["id"].map(d_id)
test = test.pivot(index="id", columns="d", values="sold").reset_index()
out_eva = pd.merge(left=sample_sub[30490:], right=test, on="id")
out_eva.columns=f_col

submit = pd.concat([out_val,out_eva], ignore_index=True)

In [22]:
submit

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0.737821,0.739214,0.618026,0.409374,0.434809,0.532401,0.574502,0.497058,0.357214,...,0.417413,0.608392,1.077895,0.638572,0.524287,0.437897,0.480209,0.800919,0.776963,0.694525
1,HOBBIES_1_002_CA_1_validation,0.517442,0.424342,0.376557,0.393484,0.386226,0.553480,0.429200,0.281708,0.274687,...,0.231656,0.331134,0.267663,0.208777,0.185031,0.242531,0.316669,0.340452,0.404177,0.375451
2,HOBBIES_1_003_CA_1_validation,0.320508,0.241412,0.243882,0.195059,0.349210,0.274401,0.285683,0.306655,0.217021,...,0.401972,0.624455,0.556116,0.538986,0.462953,0.493915,0.536232,0.533927,0.499093,0.399817
3,HOBBIES_1_004_CA_1_validation,2.847385,2.326052,1.831858,1.974244,1.641929,3.278841,2.754822,1.665133,1.724348,...,2.118709,2.957944,2.898130,2.259407,2.231498,1.805736,1.556374,2.072538,3.243581,3.874095
4,HOBBIES_1_005_CA_1_validation,0.989660,0.992663,0.767647,0.839653,1.272917,1.846241,1.705305,1.516027,0.939189,...,1.175731,1.395733,1.425418,1.014876,0.846364,0.924748,0.948913,0.966665,1.492678,1.440547
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60975,FOODS_3_823_WI_3_evaluation,0.454512,0.398428,0.484861,0.401221,0.588730,0.772080,0.727178,0.367147,0.492920,...,0.560108,0.634305,0.693847,0.642489,0.529810,0.601463,0.594540,0.617179,0.618593,1.004693
60976,FOODS_3_824_WI_3_evaluation,0.262162,0.335127,0.346927,0.366667,0.486194,0.604629,0.612523,0.326290,0.407069,...,0.254110,0.266593,0.421255,0.258323,0.342986,0.242536,0.205839,0.368010,0.350904,0.430791
60977,FOODS_3_825_WI_3_evaluation,0.699650,0.612286,0.674204,0.680861,0.681592,0.839051,1.035743,0.730338,0.814191,...,1.263550,1.120115,1.170344,1.091583,1.115687,0.860299,0.586508,0.698515,0.913552,1.174625
60978,FOODS_3_826_WI_3_evaluation,0.806676,0.836485,0.884026,0.757005,1.138567,0.950184,1.124641,0.679002,0.716324,...,0.762616,1.116610,0.726812,0.624449,0.724952,0.898118,0.734016,0.992694,1.375784,0.947414
