In [1]:
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt

import lightgbm as lgb

from sklearn import metrics
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import validation_curve

# sklearn.grid_search was deprecated in scikit-learn 0.18 and removed in 0.20;
# prefer the model_selection implementation and keep the legacy path only as
# a fallback for very old installations.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV



In [2]:
import sys
# Extend the module search path so the project-local helpers in ../lib
# (relative to the notebook's working directory) can be imported below.
sys.path.append('../lib')

# Project-local helper modules. Neither alias is referenced in the cells
# visible in this notebook — NOTE(review): confirm whether they are still needed.
import XgbAdjustmentHelper as xgbhelper
import SklearnModelCurve as curve_helper

In [21]:
# Read the train/test feature tables and the training target; the first CSV
# column of each file is used as the row index.
munged_train_df = pd.read_csv('../data/offline/train.csv', index_col=0)
munged_test_df = pd.read_csv('../data/offline/test.csv', index_col=0)
y_train_df = pd.read_csv('../data/offline/y_train.csv', index_col=0)

# Full training arrays: used for K-fold evaluation and for the final fit.
X_all_train = munged_train_df.values
y_all_train = y_train_df['y'].values

# 50/50 hold-out split of the training data, seeded so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_all_train,
    y_all_train,
    test_size=0.5,
    random_state=1729,
)

In [22]:
# Display the target values of the training half of the hold-out split
# (bare expression, so the notebook renders the array's repr).
y_train

array([ 0.08000104, -0.0851757 , -0.0139028 , ..., -0.0095644 ,
       -0.07907276,  0.10596074])

In [23]:
# ---------------------------------------------------------------------------
# Staged tuning grids. They are defined here but not consumed by any cell
# visible in this notebook; the 'reg:linear' objective and 'silent' flag look
# like xgboost-style settings — NOTE(review): confirm before removing.
# ---------------------------------------------------------------------------
init_param = dict(
    silent=True,
    objective='reg:linear',
    seed=27,
    max_depth=4,
)

# Stage 1: learning rate against number of boosting rounds.
adjust_param1 = dict(
    learning_rate=[0.05, 0.01, 0.005],
    n_estimators=[500, 700, 1000, 1300],
)

# Stage 2: minimum child weight.
adjust_param2 = dict(min_child_weight=[1, 3, 8])

# Stage 3: row / column sampling fractions, 0.85 to 1.00 in steps of 0.05.
adjust_param3 = dict(
    subsample=[0.85, 0.9, 0.95, 1.0],
    colsample_bytree=[0.85, 0.9, 0.95, 1.0],
)

# Stage 4: L1 regularisation strength.
adjust_param4 = dict(reg_alpha=[1e-5, 1e-3, 1e-1, 10])

adjust_params = [adjust_param1, adjust_param2, adjust_param3, adjust_param4]

# ---------------------------------------------------------------------------
# LightGBM grid search on the training half of the hold-out split.
# ---------------------------------------------------------------------------
estimator = lgb.LGBMRegressor(max_depth=4)

# NOTE(review): in recent LightGBM releases `subsample` only takes effect when
# `subsample_freq` > 0 — confirm against the installed version.
param_grid = dict(
    num_leaves=[9, 10, 11],
    learning_rate=[0.005, 0.01],
    n_estimators=[500, 700],
    subsample=[0.90, 0.95],
)

# Default cross-validation and scoring of GridSearchCV are used.
gbm = GridSearchCV(estimator, param_grid)
gbm.fit(X_train, y_train)

# Score the selected configuration on the held-out half (R^2 on y_test).
y_pred = gbm.predict(X_test)

print('Best parameters found by grid search are:', gbm.best_params_)
print('Best score:%f' % (r2_score(y_test, y_pred)))

Best parameters found by grid search are: {'num_leaves': 9, 'n_estimators': 700, 'subsample': 0.95, 'learning_rate': 0.005}
Best score:0.588318


In [None]:
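# Hold-out score noted for comparison (presumably from a different run or
# configuration; provenance not recorded): Best score:0.510306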

In [24]:
# LightGBM regressor with fixed hyper-parameters; num_leaves, learning_rate,
# n_estimators and subsample equal the best combination printed by the grid
# search cell, and the seed is pinned for repeatability.
gbm_model = lgb.LGBMRegressor(
    max_depth=4,
    num_leaves=9,
    learning_rate=0.005,
    n_estimators=700,
    subsample=0.95,
    seed=1729,
)

In [32]:
# Constants used to map model outputs back to the original target scale:
# later cells compute exp(prediction + y_mean) - smooth.
# NOTE(review): presumably the target was built upstream as
# log(y + smooth) - y_mean; confirm against the preprocessing code.
y_mean = 4.653474409882869
smooth = 5

In [None]:
# (An unfinished `import` statement used to live in this cell; it was a
# SyntaxError under Restart & Run All. Add any new imports to the import
# cell at the top of the notebook instead.)

In [33]:
def KFoldCV(model, num_fold, X_train, y_train):
    """Print the mean and std of the per-fold R^2 of ``model`` under K-fold CV.

    For every fold the model is refit on the training part and scored on the
    held-out part. Before scoring, both the true and the predicted targets are
    mapped through ``exp(value + y_mean) - smooth``, so the R^2 is measured on
    the back-transformed scale rather than on the scale the model is fit on.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Refit on every fold; on return it holds the fit from the last fold.
    num_fold : int
        Number of folds passed to ``KFold(n_splits=...)``.
    X_train, y_train : array-like indexable by integer index arrays
        Features and targets; they are indexed with the arrays produced by
        ``KFold.split``.

    Returns
    -------
    None
        The summary line ``mean:<m> std:<s>`` is printed.

    Notes
    -----
    Reads the module-level names ``y_mean`` and ``smooth``; the cell defining
    them must be executed before this function is called.
    """
    # The folds are contiguous, unshuffled slices, so a seed would have no
    # effect; recent scikit-learn versions raise ValueError when random_state
    # is passed together with shuffle=False. Omitting it keeps the same folds.
    kf = KFold(n_splits=num_fold)

    cv_results = []
    for train_index, test_index in kf.split(X_train):
        model.fit(X_train[train_index], y_train[train_index])
        y_pred = model.predict(X_train[test_index])

        # Score on the back-transformed scale.
        y_true_orig = np.exp(y_train[test_index] + y_mean) - smooth
        y_pred_orig = np.exp(y_pred + y_mean) - smooth
        cv_results.append(r2_score(y_true_orig, y_pred_orig))

    print('mean:%f std:%f' % (np.mean(cv_results), np.std(cv_results)))

In [34]:
# 5-fold evaluation of the fixed-parameter model on the full training data;
# prints the mean and std of the per-fold R^2.
KFoldCV(gbm_model, 5, X_all_train, y_all_train)

mean:0.562733 std:0.075500


In [None]:
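# Recorded KFoldCV results for comparison (the configuration that produced
# the second line is not recorded):
# mean:0.562733 std:0.075500
# mean:0.558508 std:0.070926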

In [35]:
# Fit the cross-validated estimator on all training data and write the
# submission file.
#
# This uses `gbm_model` (the fixed-parameter estimator scored by KFoldCV,
# whose mean/std are embedded in the output file name) rather than `gbm`:
# `gbm` is the GridSearchCV wrapper, so fitting it would re-run the whole
# grid search on the full data and could select different hyper-parameters
# from the ones that were actually evaluated.
gbm_model.fit(X_all_train, y_all_train)

# Map predictions back to the original target scale.
y_pred = np.exp(gbm_model.predict(munged_test_df.values) + y_mean) - smooth

output = pd.DataFrame({'id': munged_test_df.index, 'y': y_pred})
output.to_csv('../data/online/lightbgm-10-10-6-5_6-11-4_0.562733_0.075500.csv', index=False)