In [None]:
cd 'drive/MyDrive/Colab Notebooks/1091DS'

/content/drive/MyDrive/Colab Notebooks/1091DS


In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import time
import os
import gc
import pickle
from xgboost import XGBRegressor
from matplotlib.pylab import rcParams

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [None]:
# Plot defaults: 12x4 figures and seaborn's "whitegrid" theme.
rcParams['figure.figsize'] = (12, 4)
sns.set_theme(style='whitegrid')

Read Data

In [None]:
# Load the pickled table from ./preprocessing_data.pkl into `data` and the
# CSV ./test.csv into `test` (both paths are relative to the working directory).
# NOTE: read_pickle unpickles arbitrary objects — only use it on files you trust.
data = pd.read_pickle("./preprocessing_data.pkl")
test = pd.read_csv("./test.csv")

In [None]:
# (rows, columns) of the slice whose date_block_num equals 34.
data.loc[data["date_block_num"] == 34].shape

(214200, 33)

Preprocessing

In [None]:
# Split on date_block_num: blocks below 33 are used for training, block 33 for
# validation, and block 34 is the set to predict (only its features are kept).
is_train = data.date_block_num < 33
is_valid = data.date_block_num == 33
is_test = data.date_block_num == 34

X_train = data.loc[is_train].drop(columns=['item_cnt_month'])
Y_train = data.loc[is_train, 'item_cnt_month']
X_valid = data.loc[is_valid].drop(columns=['item_cnt_month'])
Y_valid = data.loc[is_valid, 'item_cnt_month']
X_test = data.loc[is_test].drop(columns=['item_cnt_month'])

# The masks are only needed for the split; drop them to keep the namespace clean.
del is_train, is_valid, is_test

In [None]:
# Cap both target vectors to the closed range [0, 20].
Y_train = Y_train.clip(lower=0, upper=20)
Y_valid = Y_valid.clip(lower=0, upper=20)

In [None]:
# The train/valid/test splits were already taken from `data`, so release the
# full table and ask the garbage collector to reclaim what it can.
del data
gc.collect()

0

In [None]:
# Replace remaining missing values with 0 in every feature matrix and target
# vector. The fill is done in place so the large frames are not copied.
for frame in (X_train, X_valid, X_test, Y_train, Y_valid):
    frame.fillna(0, inplace=True)

Model -- SGD Regressor

In [None]:
from sklearn.linear_model import SGDRegressor

In [None]:
# svd_regressor = SGDRegressor(eta0=0.1, epsilon=0.0001, loss='squared_epsilon_insensitive', learning_rate='adaptive', max_iter=2000)

In [None]:
# svd_regressor.fit(X_train, Y_train.ravel())

In [None]:
# pickle.dump(svd_regressor, open('SVD.model', 'wb'))

In [None]:
# Load the pickled model stored in ./SVD.model.
# NOTE: unpickling can execute arbitrary code — only load model files you trust.
with open('./SVD.model', 'rb') as model_file:  # the context manager closes the handle
    svd = pickle.load(model_file)

In [None]:
# Predict with the loaded `svd` model: first on the training features, then on
# the test features (evaluation order matches the tuple order).
svd_pred, svd_pred_test = (svd.predict(features) for features in (X_train, X_test))

In [None]:
# Write the `svd` test predictions as a two-column submission file.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first.
os.makedirs('./submission', exist_ok=True)
submission = pd.DataFrame({
    # assumes the row position in test.csv equals the submission ID — TODO confirm
    "ID": test.index,
    "item_cnt_month": svd_pred_test
})
submission.to_csv('./submission/svd_submission.csv', index=False)

Model -- Ridge

In [None]:
from sklearn.linear_model import Ridge
# r_regressor = Ridge(alpha=100)
# r_regressor.fit(X_train, Y_train)

In [None]:
# pickle.dump(r_regressor, open('Ridge.model', 'wb'))

In [None]:
# Load the pickled model stored in ./Ridge.model.
# NOTE: unpickling can execute arbitrary code — only load model files you trust.
with open('./Ridge.model', 'rb') as model_file:  # the context manager closes the handle
    ridge = pickle.load(model_file)

In [None]:
# Predict with the loaded `ridge` model on the training features and then on
# the test features.
r_pred, r_pred_test = map(ridge.predict, (X_train, X_test))

In [None]:
# Write the `ridge` test predictions as a two-column submission file.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first.
os.makedirs('./submission', exist_ok=True)
submission = pd.DataFrame({
    # assumes the row position in test.csv equals the submission ID — TODO confirm
    "ID": test.index,
    "item_cnt_month": r_pred_test
})
submission.to_csv('./submission/ridge_submission.csv', index=False)

Feature Selection

In [None]:
from sklearn.feature_selection import SelectFromModel
# regressor = Ridge(alpha=100)
# Select features using the `ridge` estimator. fit() is called here, so the
# selector is fitted on the training split before any transform.
selection = SelectFromModel(ridge)
# Series.ravel() is deprecated in recent pandas; to_numpy() yields the same
# 1-D array of targets.
selection.fit(X_train, Y_train.to_numpy())

# Apply the same fitted column mask to every split so the downstream model sees
# identical features at train, validation and test time.
X_train_top = selection.transform(X_train)
X_valid_top = selection.transform(X_valid)
X_test_top = selection.transform(X_test)

  overwrite_a=True).T


Model -- Random Forest 

In [None]:
from sklearn.ensemble import RandomForestRegressor
# rf = RandomForestRegressor(n_estimators=30, max_depth=10, random_state=42, oob_score=True, verbose=10, n_jobs=-1)

After Feature Selection:
*   Duration: 38min -> 10min
*   RMSE: 0.92 -> 1.01

In [None]:
# rf.fit(X_train_top, Y_train)

In [None]:
# pickle.dump(rf, open('RF.model', 'wb'))

In [None]:
# Load the pickled model stored in ./RF.model.
# NOTE: unpickling can execute arbitrary code — only load model files you trust.
with open('./RF.model', 'rb') as model_file:  # the context manager closes the handle
    rf = pickle.load(model_file)

In [None]:
# Predict on the selected-feature matrices and cap the outputs to [0, 20],
# the same range the targets were clipped to.
rf_pred = np.clip(rf.predict(X_train_top), 0, 20)
rf_pred_test = np.clip(rf.predict(X_test_top), 0, 20)

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    0.8s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:    1.7s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:    3.9s
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:    5.6s
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:    8.6s
[Parallel(n_jobs=2)]: Done  30 out of  30 | elapsed:   11.8s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done  30 out of  30 | elapsed:    0.3s finished


In [None]:
# Write the `rf` test predictions as a two-column submission file.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first.
os.makedirs('./submission', exist_ok=True)
submission = pd.DataFrame({
    # assumes the row position in test.csv equals the submission ID — TODO confirm
    "ID": test.index,
    "item_cnt_month": rf_pred_test
})
submission.to_csv('./submission/rf_submission.csv', index=False)

Model -- XGBoost

In [None]:
# model = XGBRegressor(
#     learning_rate = 0.1,
#     n_estimators = 100,
#     max_depth = 12,
#     min_child_weight = 5,
#     gamma = 0,
#     subsample = 0.8,  
#     colsample_bytree=0.8, 
#     objective = 'reg:squarederror',
#     reg_lambda = 1,
#     reg_alpha = 0,
#     scale_pos_weight = 1,
#     seed=42,
# )

In [None]:
# model_xgboost = model.fit(
#     X_train, 
#     Y_train, 
#     eval_metric="rmse", 
#     eval_set=[(X_train, Y_train), (X_valid, Y_valid)], 
#     verbose=True, 
#     early_stopping_rounds = 20)

In [None]:
# Create RandomizedSearchCV Object
# param_test = {
    # 'max_depth' : list([8,9,10,11,12]), # Max Depth:  9
    # 'min_child_weight' : list([4,5,6,7,8]), # Minimum Sum of the Instance Weight Hessian to Make a Child:  5
    # 'gamma':[i/10.0 for i in range(0,5)],
    # 'subsample':[i/10.0 for i in range(6,10)],
    # 'colsample_bytree':[i/10.0 for i in range(6,10)],
    # 'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05],
    # 'reg_lambda':[]
# }
# model_rscv = RandomizedSearchCV(model, param_test, scoring = "neg_root_mean_squared_error", cv = 2, verbose = True, random_state = 42)

In [None]:
# model_xgboost = model_rscv.fit(
#     X_train, 
#     Y_train, 
#     eval_metric="rmse", 
#     eval_set=[(X_train, Y_train), (X_valid, Y_valid)], 
#     verbose=True, 
#     early_stopping_rounds = 2)

In [None]:
# NOTE(review): this dump targets 'XGB.model', but the load cell that follows
# reads './XGBoost.model' — confirm the intended filename before re-enabling.
# pickle.dump(model_xgboost, open('XGB.model', 'wb'))

In [None]:
# Load the pickled model stored in ./XGBoost.model.
# NOTE(review): the commented-out dump cell writes 'XGB.model', a different
# filename — confirm this is the intended artifact.
# NOTE: unpickling can execute arbitrary code — only load model files you trust.
with open('./XGBoost.model', 'rb') as model_file:  # the context manager closes the handle
    xgb = pickle.load(model_file)

In [None]:
# Predict on the full feature matrices and cap the outputs to [0, 20],
# the same range the targets were clipped to.
xgb_pred = np.clip(xgb.predict(X_train), 0, 20)
xgb_pred_test = np.clip(xgb.predict(X_test), 0, 20)

In [None]:
# Write the `xgb` test predictions as a two-column submission file.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first.
os.makedirs('./submission', exist_ok=True)
submission = pd.DataFrame({
    # assumes the row position in test.csv equals the submission ID — TODO confirm
    "ID": test.index,
    "item_cnt_month": xgb_pred_test
})
submission.to_csv('./submission/xgb_submission.csv', index=False)

Final

In [None]:
from sklearn.linear_model import LinearRegression

# Stack the four base models: each column holds one model's predictions on the
# training rows, and a linear regression learns how to blend them.
# NOTE(review): the base-model predictions are made on X_train, the same frame
# used for the loaded models' (commented-out) fits, so the blend may be biased
# towards models that overfit — confirm this is intended.
meta_train = pd.DataFrame({
    'ID': X_train.index,
    'pred_1': xgb_pred,
    'pred_2': rf_pred,
    'pred_3': svd_pred,
    'pred_4': r_pred,
})
y_meta_train = Y_train
meta_model = LinearRegression()
meta_model.fit(meta_train.loc[:, ['pred_1', 'pred_2', 'pred_3', 'pred_4']], y_meta_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [None]:
# filename = 'metamodel.model'
# pickle.dump(meta_model, open(filename, 'wb'))

In [None]:
# Load the pickled model stored in ./metamodel.model.
# NOTE(review): this rebinds `meta_model`, replacing the LinearRegression that
# was fitted earlier in this section — confirm the saved file is the one wanted.
# NOTE: unpickling can execute arbitrary code — only load model files you trust.
with open('./metamodel.model', 'rb') as model_file:  # the context manager closes the handle
    meta_model = pickle.load(model_file)

In [None]:
# Read the four per-model submission files back in; the order here defines
# which model feeds pred_1 .. pred_4 (xgb, rf, svd, ridge).
pred_1, pred_2, pred_3, pred_4 = map(pd.read_csv, (
    './submission/xgb_submission.csv',
    './submission/rf_submission.csv',
    './submission/svd_submission.csv',
    './submission/ridge_submission.csv',
))

In [None]:
# Assemble the meta-model's test inputs: one column of predictions per base
# model, with the IDs taken from the first file.
meta_test = pd.DataFrame({
    'ID': pred_1['ID'],
    'pred_1': pred_1['item_cnt_month'],
    'pred_2': pred_2['item_cnt_month'],
    'pred_3': pred_3['item_cnt_month'],
    'pred_4': pred_4['item_cnt_month'],
})
# Only the four prediction columns are passed to the meta-model.
X_meta_test = meta_test.loc[:, ['pred_1', 'pred_2', 'pred_3', 'pred_4']]

In [None]:
# Blend the base predictions with the meta-model and cap the result to [0, 20].
y_test = np.clip(meta_model.predict(X_meta_test), 0, 20)

In [None]:
# Write the blended (meta-model) predictions as the final submission file.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first.
os.makedirs('./submission', exist_ok=True)
submission = pd.DataFrame({
    # assumes the row position in test.csv equals the submission ID — TODO confirm
    "ID": test.index,
    "item_cnt_month": y_test
})
submission.to_csv('./submission/final_submission.csv', index=False)

Answer View

In [14]:
# Read the final submission back from disk for inspection.
# NOTE(review): the path is relative and has no leading './' — presumably the
# same file written by the final-submission cell; confirm the working directory.
ans = pd.read_csv('submission/final_submission.csv')

In [15]:
# Display the loaded submission frame (ID and item_cnt_month columns).
ans

Unnamed: 0,ID,item_cnt_month
0,0,0.509988
1,1,0.381447
2,2,0.862875
3,3,0.473632
4,4,4.759768
...,...,...
214195,214195,0.058575
214196,214196,0.003565
214197,214197,0.056603
214198,214198,0.006932


NULL Model

In [None]:
# Constant used by the null model: the average of the final submission's
# predicted item_cnt_month values.
# NOTE(review): this is the mean of model predictions, not of the training
# target — confirm that is the intended baseline.
mean = ans['item_cnt_month'].mean()

In [None]:
# Null-model submission: every row gets the same constant prediction (`mean`),
# which pandas broadcasts across all IDs.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first.
os.makedirs('./submission', exist_ok=True)
submission = pd.DataFrame({
    # assumes the row position in test.csv equals the submission ID — TODO confirm
    "ID": test.index,
    "item_cnt_month": mean
})
submission.to_csv('./submission/null_submission.csv', index=False)