In [2]:
import numpy as np
import pandas as pd
import datetime
import gc
import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgb

from sklearn.preprocessing import PolynomialFeatures

from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.kernel_ridge import KernelRidge

from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

from sklearn.pipeline import Pipeline

import warnings
# Silence every Python warning for the rest of the session. This also hides
# deprecation warnings raised by pandas / scikit-learn calls further down.
warnings.filterwarnings('ignore')
# Seed NumPy's global random number generator so NumPy-based randomness is repeatable.
np.random.seed(4590)


This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [23]:
# Load the pre-computed feature tables, the training target and the submission ids
# from the ./intermediate directory.
train_X = pd.read_csv('./intermediate/train.csv')
# Read the target file without treating its first row as a header.
# `header=None` is the supported way to say that; the previous `header=-1`
# is rejected with a ValueError by recent pandas releases.
train_y = pd.read_csv('./intermediate/target.csv', header=None)
test_X = pd.read_csv('./intermediate/test.csv')
ids = pd.read_csv('./intermediate/sample_submission.csv')['card_id']
# NOTE(review): the train_X preview shows an "Unnamed: 0" column holding 0, 1, 2, ...
# which looks like a row index written out by an earlier to_csv call. It is currently
# fed to the models as a feature -- confirm whether it should be dropped from both
# train_X and test_X (e.g. by reading with index_col=0).

In [25]:
# Preview the first ten rows of the training features.
train_X.head(n=10)

Unnamed: 0,feature_1,feature_2,feature_3,hist_month_nunique,hist_hour_nunique,hist_weekofyear_nunique,hist_dayofweek_nunique,hist_year_nunique,hist_subsector_id_nunique,hist_merchant_id_nunique,...,new_hist_purchase_date_average,new_hist_purchase_date_uptonow,dayofweek,weekofyear,month,elapsed_time,hist_first_buy,new_hist_first_buy,card_id_total,purchase_amount_total
0,0.013145,0.008752,0.011428,9,23,35,7,2,21,94,...,2.347826,277.0,3,22,6,609,26,277.0,283.0,-179.212942
1,0.010712,0.011385,0.010283,12,24,50,7,2,24,142,...,9.333333,307.0,6,52,1,760,5,396.0,356.0,-214.362071
2,0.01061,0.008752,0.010283,10,14,22,7,2,7,13,...,0.0,277.0,0,31,8,913,163,635.0,44.0,-29.867717
3,0.010712,0.014166,0.010283,6,16,20,7,2,13,50,...,5.857143,288.0,4,35,9,517,25,187.0,84.0,-54.145736
4,0.008058,0.014166,0.010283,4,22,17,7,2,17,66,...,1.583333,277.0,2,44,11,456,11,121.0,169.0,-68.613893
5,0.010712,0.008752,0.010283,7,13,16,6,1,8,22,...,7.75,502.0,3,35,9,882,131,348.0,37.0,-15.176296
6,0.010479,0.008752,0.011428,12,16,48,7,2,12,63,...,6.2,299.0,3,48,12,791,35,460.0,265.0,-189.024187
7,0.010479,0.008752,0.011428,4,9,8,7,1,6,9,...,4.0,346.0,4,35,9,517,27,158.0,25.0,-15.664147
8,0.01061,0.011385,0.010283,5,8,6,5,1,4,8,...,1.5,316.0,1,31,8,548,17,228.0,17.0,-9.971533
9,0.01061,0.008752,0.010283,12,14,33,7,2,13,59,...,2.333333,298.0,0,31,8,913,161,608.0,116.0,-79.13747


Polynomial features + Lasso regression

In [20]:
%%time

params = [{"poly__degree":[2], "lasso__alpha": [0.01, 0.1, 1, 10, 100]}]

poly_model = Pipeline([('poly', PolynomialFeatures(degree=3))
             ,('lasso', Lasso(normalize=True))])


grid = GridSearchCV(poly_model, cv=5, param_grid=params, scoring='neg_mean_squared_error')
grid.fit(train_X, train_y)

print('best parameter', grid.best_params_)
print(np.mean(np.sqrt(-cross_val_score(grid.best_estimator_, train_X, train_y, cv=5, scoring='neg_mean_squared_error'))))

predictions = grid.best_estimator_.predict(test_X)

# predicted_X.to_csv('./output/svr.csv', index=False)

sub_df = pd.DataFrame({"card_id":ids.values})
sub_df["target"] = predictions
sub_df.to_csv("./output/poly_lasso.csv", index=False)

best parameter {'lasso__alpha': 0.01, 'poly__degree': 2}
3.8503312239493623
CPU times: user 24min 50s, sys: 9min 27s, total: 34min 17s
Wall time: 38min 2s


Ridge regression, optionally after PCA (experiment currently disabled)

In [5]:
# %%time

# Disabled experiment: every line in this cell is commented out, so running it does nothing.
# When enabled it grid-searches Ridge's `alpha` with 5-fold CV, prints the mean per-fold
# RMSE of the best estimator and writes test-set predictions to ./output/ridge.csv.
# The doubly-commented PCA lines below would overwrite train_X / test_X in place if
# restored, which would also affect any cell run afterwards.

# # pca = PCA(0.9)
# # train_X = pca.fit_transform(train_X)
# # test_X = pca.transform(test_X)
# params = [{"alpha":[0.01, 0.1, 10, 100]}]

# ridge = Ridge(normalize=True)

# grid = GridSearchCV(ridge, cv=5, param_grid=params, scoring='neg_mean_squared_error')
# grid.fit(train_X, train_y)

# print('best parameter', grid.best_params_)
# print(np.mean(np.sqrt(-cross_val_score(grid.best_estimator_, train_X, train_y, cv=5, scoring='neg_mean_squared_error'))))

# predictions = grid.best_estimator_.predict(test_X)

# # predicted_X.to_csv('./output/svr.csv', index=False)

# sub_df = pd.DataFrame({"card_id":ids.values})
# sub_df["target"] = predictions
# sub_df.to_csv("./output/ridge.csv", index=False)


Lasso regression, optionally after PCA (experiment currently disabled)

In [6]:
# %%time

# Disabled experiment: every line in this cell is commented out, so running it does nothing.
# When enabled it grid-searches Lasso's `alpha` with 5-fold CV (no polynomial expansion),
# prints the mean per-fold RMSE of the best estimator and writes test-set predictions
# to ./output/lasso.csv.
# The doubly-commented PCA lines below would overwrite train_X / test_X in place if
# restored, which would also affect any cell run afterwards.

# # pca = PCA(0.9)
# # train_X = pca.fit_transform(train_X)
# # test_X = pca.transform(test_X)
# params = [{"alpha":[0.01, 0.1, 10, 100]}]

# lasso = Lasso(normalize=True)

# grid = GridSearchCV(lasso, cv=5, param_grid=params, scoring='neg_mean_squared_error')
# grid.fit(train_X, train_y)

# print('best parameter', grid.best_params_)
# print(np.mean(np.sqrt(-cross_val_score(grid.best_estimator_, train_X, train_y, cv=5, scoring='neg_mean_squared_error'))))

# predictions = grid.best_estimator_.predict(test_X)

# # predicted_X.to_csv('./output/svr.csv', index=False)

# sub_df = pd.DataFrame({"card_id":ids.values})
# sub_df["target"] = predictions
# sub_df.to_csv("./output/lasso.csv", index=False)


In [7]:
# %%time

# Disabled experiment: every line in this cell is commented out, so running it does nothing.
# When enabled it grid-searches an RBF KernelRidge with 5-fold CV, prints the mean
# per-fold RMSE of the best estimator and writes test-set predictions to
# ./output/kernel_ridge.csv.
# NOTE(review): the `params` definition below is commented out one level deeper than the
# rest. Removing a single level of `#` would leave `params` undefined in this cell, so
# GridSearchCV would use whatever `params` an earlier cell left in the kernel. Restore
# the `params` line together with the rest.

# # params = [{"alpha": [1e0, 0.1, 1e-2], "gamma": np.logspace(-2, 2, 3)}]

# kernel_ridge = KernelRidge(kernel='rbf')

# grid = GridSearchCV(kernel_ridge, cv=5, param_grid=params, scoring='neg_mean_squared_error')
# grid.fit(train_X, train_y)

# print('best parameter', grid.best_params_)
# print(np.mean(np.sqrt(-cross_val_score(grid.best_estimator_, train_X, train_y, cv=5, scoring='neg_mean_squared_error'))))

# predictions = grid.best_estimator_.predict(test_X)

# # predicted_X.to_csv('./output/svr.csv', index=False)

# sub_df = pd.DataFrame({"card_id":ids.values})
# sub_df["target"] = predictions
# sub_df.to_csv("./output/kernel_ridge.csv", index=False)
