In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR, SVR
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import gc
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Directory holding the CSV exports read below. Kept in one place so the
# notebook can be pointed at another machine's data without editing every path.
DATA_DIR = '/Users/cheriehe'


def _read_exported_csv(file_name):
    """Read ``file_name`` from ``DATA_DIR`` and drop its 'Unnamed: 0' column.

    NOTE(review): 'Unnamed: 0' is presumably the index written by an earlier
    ``to_csv`` call -- confirm against the notebook that produced these files.
    A missing column raises ``KeyError``, exactly as the inline drops did.
    """
    frame = pd.read_csv(os.path.join(DATA_DIR, file_name))
    return frame.drop(columns=['Unnamed: 0'])


scaled_test_X = _read_exported_csv('scaled_test_X.csv')
scaled_train_X = _read_exported_csv('scaled_train_X.csv')
train_y = _read_exported_csv('train_y.csv')

In [3]:
from bayes_opt import BayesianOptimization
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

In [4]:
# Show the first five rows of the target frame as a quick sanity check.
train_y.head(n=5)

Unnamed: 0,time_to_failure
0,1.430797
1,1.391499
2,1.353196
3,1.313798
4,1.2744


In [5]:
# Reduce the target frame to its 'time_to_failure' column (a Series).
# The isinstance guard keeps this cell re-runnable: once train_y is already a
# Series, looking up 'time_to_failure' on it again would raise a KeyError.
if isinstance(train_y, pd.DataFrame):
    train_y = train_y['time_to_failure']

In [None]:
def bayes_parameter_opt_lgb(X, y, init_round=2, opt_round=8, n_folds=5, random_seed=6, n_estimators=20000, learning_rate=0.001, output_process=True):
    """Search LightGBM hyper-parameters with Bayesian optimisation on cross-validated RMSE.

    Args:
        X: feature matrix, passed to ``lgb.Dataset`` as ``data``.
        y: target values, passed to ``lgb.Dataset`` as ``label``.
        init_round: number of initial points (``init_points`` of ``maximize``).
        opt_round: number of optimisation steps (``n_iter`` of ``maximize``).
        n_folds: number of folds used by ``lgb.cv``.
        random_seed: seed for ``lgb.cv`` and for the optimiser itself.
        n_estimators: upper bound on boosting rounds for each evaluation.
        learning_rate: learning rate used in every evaluation.
        output_process: accepted for backward compatibility; currently unused.

    Returns:
        dict: the parameter values of the best point found, as reported by the
        optimiser (``lgbBO.max['params']``). Values are the raw sampled floats;
        integer-valued parameters (``num_leaves``, ``max_depth``, ``max_bin``,
        ``min_child_samples``, ``subsample_freq``) still need ``int(round(...))``
        before being passed to LightGBM, as done inside ``lgb_eval``.
    """
    # prepare data
    # NOTE(review): free_raw_data=False is presumably needed so the dataset can
    # be rebuilt when max_bin changes between evaluations -- confirm.
    train_data = lgb.Dataset(data=X, label=y, free_raw_data=False)

    # parameters
    def lgb_eval(num_leaves, colsample_bytree, subsample, max_depth, reg_lambda, reg_alpha, min_split_gain, min_child_weight,
                 min_child_samples, max_bin, subsample_freq):
        """Return the negated best cross-validated RMSE for one sampled point."""
        params = {'objective': 'regression', 'boosting_type': 'gbdt', 'nthread': -1, 'verbose': -1,
                  'num_boost_round': n_estimators, 'learning_rate': learning_rate,
                  'early_stopping_round': 500}
        # The optimiser samples floats; integer-valued parameters are rounded.
        params['subsample_freq'] = int(round(subsample_freq))
        # LightGBM's parameter is 'min_child_samples'. The former key
        # 'min_child_sample' is not a LightGBM parameter, so the sampled value
        # never reached the model.
        params['min_child_samples'] = int(round(min_child_samples))
        params['max_bin'] = int(round(max_bin))
        params["num_leaves"] = int(round(num_leaves))
        # Clamp fractions and penalties to their valid ranges.
        params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
        params['subsample'] = max(min(subsample, 1), 0)
        params['max_depth'] = int(round(max_depth))
        params['reg_lambda'] = max(reg_lambda, 0)
        params['reg_alpha'] = max(reg_alpha, 0)
        params['min_split_gain'] = min_split_gain
        params['min_child_weight'] = min_child_weight
        cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed, stratified=False, verbose_eval=1000, metrics=['rmse'])
        # Score the point by its best round. Averaging over every round mixed
        # the early, high-error rounds into the score and favoured fast
        # learners over better final models. Negated because the optimiser
        # maximises.
        return -1.0 * np.min(cv_result['rmse-mean'])

    # range
    lgbBO = BayesianOptimization(lgb_eval, {'num_leaves': (20, 80),
                                            'colsample_bytree': (0.6, 1.0),
                                            'subsample': (0.6, 1.0),
                                            'max_depth': (-1, 8),
                                            'reg_lambda': (0, 1),
                                            'reg_alpha': (0, 1),
                                            'min_child_samples': (10, 50),
                                            'max_bin': (180, 500),
                                            'subsample_freq': (1, 10),
                                            'min_split_gain': (0.1, 0.8),
                                            'min_child_weight': (3, 20)},
                                 random_state=random_seed)  # seeded so the search is reproducible
    # optimize
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)
    # Hand the best point back; previously the function returned None, so the
    # caller's result variable was always empty.
    return lgbBO.max['params']

# Run the hyper-parameter search on the full feature set and keep whatever
# the search function returns.
opt_params = bayes_parameter_opt_lgb(
    scaled_train_X,
    train_y,
    init_round=2,
    opt_round=8,
    n_folds=5,
    random_seed=6,
    n_estimators=20000,
    learning_rate=0.001,
    output_process=True,
)

In [None]:
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_absolute_error

In [None]:
# Shuffled 5-fold splitter with a fixed seed, plus the feature names used to
# label the importance tables built during training.
n_fold = 5
folds = KFold(shuffle=True, random_state=42, n_splits=n_fold)
train_columns = np.asarray(scaled_train_X.columns)

In [None]:
    # LightGBM settings used to train on the full feature set.
    params = dict(
        boosting_type='gbdt',
        objective='regression',
        metric='mae',
        learning_rate=0.001,
        num_leaves=53,
        max_depth=2,
        min_child_samples=37,
        max_bin=443,
        subsample=0.6815,
        subsample_freq=1,
        colsample_bytree=0.8397,
        min_split_gain=0.2689,
        min_child_weight=11.3791,
        reg_lambda=0.4544,
        reg_alpha=0.0361,
        nthread=8,
        verbose=-1,
    )

In [None]:
# oof holds each training row's prediction from the fold where it was held
# out; predictions accumulates the fold-averaged test prediction.
oof = np.zeros(len(scaled_train_X))
predictions = np.zeros(len(scaled_test_X))
# Per-fold importance tables are collected here and concatenated once after
# the loop, instead of growing a DataFrame with pd.concat on every iteration.
fold_importances = []
#run model
for fold_, (trn_idx, val_idx) in enumerate(folds.split(scaled_train_X, train_y.values)):
    print("fold {}".format(fold_))

    X_tr, X_val = scaled_train_X.iloc[trn_idx], scaled_train_X.iloc[val_idx]
    y_tr, y_val = train_y.iloc[trn_idx], train_y.iloc[val_idx]
    dtrain = lgb.Dataset(data=X_tr,
                         label=y_tr,
                         free_raw_data=False)
    dvalid = lgb.Dataset(data=X_val,
                         label=y_val,
                         free_raw_data=False)
    model = lgb.train(params,
                      dtrain,
                      valid_sets=[dtrain, dvalid],
                      valid_names=['train', 'valid'],
                      num_boost_round=20000,
                      early_stopping_rounds=500,
                      verbose_eval=1000)
    oof[val_idx] = model.predict(X_val)
    # Each fold contributes an equal share to the averaged test prediction.
    predictions += model.predict(scaled_test_X) / folds.n_splits
    #feature importance
    fold_importance_df = pd.DataFrame({
        "feature": train_columns,
        "importance": model.feature_importance(importance_type='gain'),
        "fold": fold_ + 1,
    })
    fold_importances.append(fold_importance_df)
    # Release the booster and datasets before the next fold.
    del model, dtrain, dvalid
    gc.collect()
feature_importance_df = pd.concat(fold_importances, axis=0)
print('Full mae %.6f' % mean_absolute_error(train_y, oof))

In [None]:
# Feature names ordered by mean importance across folds, highest first.
# cols is not truncated, so the isin filter below keeps every row.
cols = feature_importance_df[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False).index
best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]
plt.figure(figsize=(14,26))
sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
plt.title('LightGBM Features (avg over folds)')
# tight_layout must be called; the bare attribute reference was a no-op.
plt.tight_layout()
plt.show()

In [None]:
# Use the sample submission (indexed by seg_id) as the output template.
# NOTE(review): assumes the rows of scaled_test_X are in the same order as
# sample_submission.csv -- confirm against the feature-building notebook.
submission = pd.read_csv('sample_submission.csv', index_col='seg_id')
# Bracket assignment always writes a column. Attribute assignment would only
# set a Python attribute if the column were absent, and the resulting warning
# is silenced by the global warnings filter.
submission['time_to_failure'] = predictions
submission.to_csv('submission_lgb2.csv', index=True)

In [None]:
# Rank features by mean importance across folds and keep the first 50 names.
importance_by_feature = feature_importance_df[["feature", "importance"]].groupby("feature").mean()
ranked_features = importance_by_feature.sort_values(by="importance", ascending=False)
top_cols = list(ranked_features.iloc[:50].index)

In [None]:
#  feature selection: 120, 100, 80, 60

In [None]:
# Keep only the selected top columns in both feature matrices.
scaled_train_X1 = scaled_train_X.loc[:, top_cols]
scaled_test_X1 = scaled_test_X.loc[:, top_cols]

In [None]:
def bayes_parameter_opt_lgb(X, y, init_round=2, opt_round=8, n_folds=10, random_seed=6, n_estimators=20000, learning_rate=0.001, output_process=True):
    """Search LightGBM hyper-parameters with Bayesian optimisation on cross-validated RMSE.

    This definition replaces any earlier one of the same name in the kernel;
    it differs from the usual 5-fold variant only in its ``n_folds`` default.

    Args:
        X: feature matrix, passed to ``lgb.Dataset`` as ``data``.
        y: target values, passed to ``lgb.Dataset`` as ``label``.
        init_round: number of initial points (``init_points`` of ``maximize``).
        opt_round: number of optimisation steps (``n_iter`` of ``maximize``).
        n_folds: number of folds used by ``lgb.cv``.
        random_seed: seed for ``lgb.cv`` and for the optimiser itself.
        n_estimators: upper bound on boosting rounds for each evaluation.
        learning_rate: learning rate used in every evaluation.
        output_process: accepted for backward compatibility; currently unused.

    Returns:
        dict: the parameter values of the best point found, as reported by the
        optimiser (``lgbBO.max['params']``). Values are the raw sampled floats;
        integer-valued parameters (``num_leaves``, ``max_depth``, ``max_bin``,
        ``min_child_samples``, ``subsample_freq``) still need ``int(round(...))``
        before being passed to LightGBM, as done inside ``lgb_eval``.
    """
    # prepare data
    # NOTE(review): free_raw_data=False is presumably needed so the dataset can
    # be rebuilt when max_bin changes between evaluations -- confirm.
    train_data = lgb.Dataset(data=X, label=y, free_raw_data=False)

    # parameters
    def lgb_eval(num_leaves, colsample_bytree, subsample, max_depth, reg_lambda, reg_alpha, min_split_gain, min_child_weight,
                 min_child_samples, max_bin, subsample_freq):
        """Return the negated best cross-validated RMSE for one sampled point."""
        params = {'objective': 'regression', 'boosting_type': 'gbdt', 'nthread': -1, 'verbose': -1,
                  'num_boost_round': n_estimators, 'learning_rate': learning_rate,
                  'early_stopping_round': 500}
        # The optimiser samples floats; integer-valued parameters are rounded.
        params['subsample_freq'] = int(round(subsample_freq))
        # LightGBM's parameter is 'min_child_samples'. The former key
        # 'min_child_sample' is not a LightGBM parameter, so the sampled value
        # never reached the model.
        params['min_child_samples'] = int(round(min_child_samples))
        params['max_bin'] = int(round(max_bin))
        params["num_leaves"] = int(round(num_leaves))
        # Clamp fractions and penalties to their valid ranges.
        params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
        params['subsample'] = max(min(subsample, 1), 0)
        params['max_depth'] = int(round(max_depth))
        params['reg_lambda'] = max(reg_lambda, 0)
        params['reg_alpha'] = max(reg_alpha, 0)
        params['min_split_gain'] = min_split_gain
        params['min_child_weight'] = min_child_weight
        cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed, stratified=False, verbose_eval=1000, metrics=['rmse'])
        # Score the point by its best round. Averaging over every round mixed
        # the early, high-error rounds into the score and favoured fast
        # learners over better final models. Negated because the optimiser
        # maximises.
        return -1.0 * np.min(cv_result['rmse-mean'])

    # range
    lgbBO = BayesianOptimization(lgb_eval, {'num_leaves': (20, 80),
                                            'colsample_bytree': (0.6, 1.0),
                                            'subsample': (0.6, 1.0),
                                            'max_depth': (-1, 8),
                                            'reg_lambda': (0, 1),
                                            'reg_alpha': (0, 1),
                                            'min_child_samples': (10, 50),
                                            'max_bin': (180, 500),
                                            'subsample_freq': (1, 10),
                                            'min_split_gain': (0.1, 0.8),
                                            'min_child_weight': (3, 20)},
                                 random_state=random_seed)  # seeded so the search is reproducible
    # optimize
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)
    # Hand the best point back; previously the function returned None, so the
    # caller's result variable was always empty.
    return lgbBO.max['params']

# Run the hyper-parameter search on the reduced feature set with 10 folds and
# keep whatever the search function returns.
opt_params = bayes_parameter_opt_lgb(
    scaled_train_X1,
    train_y,
    init_round=2,
    opt_round=8,
    n_folds=10,
    random_seed=6,
    n_estimators=20000,
    learning_rate=0.001,
    output_process=True,
)

In [None]:
# Shuffled 10-fold splitter with a fixed seed, plus the reduced feature names
# used to label the importance tables built during training.
n_fold = 10
folds = KFold(shuffle=True, random_state=42, n_splits=n_fold)
train_columns = np.asarray(scaled_train_X1.columns)

In [None]:
    # LightGBM settings used to train on the reduced feature set.
    params = dict(
        boosting_type='gbdt',
        objective='regression',
        metric='mae',
        learning_rate=0.001,
        num_leaves=47,
        max_depth=2,
        min_child_samples=32,
        max_bin=383,
        subsample=0.6911,
        subsample_freq=8,
        colsample_bytree=.6596,
        min_split_gain=0.690,
        min_child_weight=15.6388,
        reg_lambda=0.4518,
        reg_alpha=0.9155,
        nthread=8,
        verbose=-1,
    )

In [None]:
# oof holds each training row's prediction from the fold where it was held
# out; predictions accumulates the fold-averaged test prediction.
oof = np.zeros(len(scaled_train_X1))
predictions = np.zeros(len(scaled_test_X1))
# Per-fold importance tables are collected here and concatenated once after
# the loop, instead of growing a DataFrame with pd.concat on every iteration.
fold_importances = []
#run model
for fold_, (trn_idx, val_idx) in enumerate(folds.split(scaled_train_X1, train_y.values)):
    print("fold {}".format(fold_))

    X_tr, X_val = scaled_train_X1.iloc[trn_idx], scaled_train_X1.iloc[val_idx]
    y_tr, y_val = train_y.iloc[trn_idx], train_y.iloc[val_idx]
    dtrain = lgb.Dataset(data=X_tr,
                         label=y_tr,
                         free_raw_data=False)
    dvalid = lgb.Dataset(data=X_val,
                         label=y_val,
                         free_raw_data=False)
    model = lgb.train(params,
                      dtrain,
                      valid_sets=[dtrain, dvalid],
                      valid_names=['train', 'valid'],
                      num_boost_round=20000,
                      early_stopping_rounds=500,
                      verbose_eval=1000)
    oof[val_idx] = model.predict(X_val)
    # Each fold contributes an equal share to the averaged test prediction.
    predictions += model.predict(scaled_test_X1) / folds.n_splits
    #feature importance
    fold_importance_df = pd.DataFrame({
        "feature": train_columns,
        "importance": model.feature_importance(importance_type='gain'),
        "fold": fold_ + 1,
    })
    fold_importances.append(fold_importance_df)
    # Release the booster and datasets before the next fold.
    del model, dtrain, dvalid
    gc.collect()
feature_importance_df = pd.concat(fold_importances, axis=0)
print('Full mae %.6f' % mean_absolute_error(train_y, oof))

In [None]:
# Use the sample submission (indexed by seg_id) as the output template.
# NOTE(review): assumes the rows of scaled_test_X1 are in the same order as
# sample_submission.csv -- confirm against the feature-building notebook.
submission = pd.read_csv('sample_submission.csv', index_col='seg_id')
# Bracket assignment always writes a column. Attribute assignment would only
# set a Python attribute if the column were absent, and the resulting warning
# is silenced by the global warnings filter.
submission['time_to_failure'] = predictions
submission.to_csv('submission_lgb5.csv', index=True)

In [None]:
# Feature names ordered by mean importance across folds, highest first.
# cols is not truncated, so the isin filter below keeps every row.
cols = feature_importance_df[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False).index
best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]
plt.figure(figsize=(14,26))
sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
plt.title('LightGBM Features (avg over folds)')
# tight_layout must be called; the bare attribute reference was a no-op.
plt.tight_layout()
plt.show()