In [141]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold, cross_val_score

In [176]:
train_df = pd.read_csv('./input/train.csv')
test_df = pd.read_csv('./input/test.csv')

In [76]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Memory usage decreased from {:5.2f}MB to {:5.2f}MB ({:.1f}% reduction)'.format(start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [77]:
reduce_mem_usage(train_df)

Memory usage decreased from  0.32MB to  0.05MB (83.3% reduction)


Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.199219,0,1,0,7,0.600098,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.500000,1,0,1,53,0.700195,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.500000,1,2,1,41,0.899902,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.500000,0,0,0,10,0.799805,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.200195,0,13,1,44,0.600098,141,2,...,1208,1212,1411,8,2,15,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,1,0.500000,1,0,1,2,0.799805,106,6,...,1222,1890,668,13,4,19,1,1,0,0
1996,1965,1,2.599609,1,0,0,39,0.199951,187,4,...,915,1965,2032,11,10,16,1,1,1,2
1997,1911,0,0.899902,1,1,1,36,0.700195,108,8,...,868,1632,3057,9,1,5,1,1,0,3
1998,1512,0,0.899902,0,4,1,46,0.099976,145,5,...,336,670,869,18,10,19,1,1,1,0


In [78]:
reduce_mem_usage(testet_df)

Memory usage decreased from  0.16MB to  0.03MB (82.7% reduction)


Unnamed: 0,id,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,...,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,1,1043,1,1.799805,1,14,0,5,0.099976,193,...,16,226,1412,3476,12,7,2,0,1,0
1,2,841,1,0.500000,1,4,1,61,0.799805,191,...,12,746,857,3895,6,0,7,1,0,0
2,3,1807,1,2.800781,0,1,0,27,0.899902,186,...,4,1270,1366,2396,17,10,10,0,1,1
3,4,1546,0,0.500000,1,18,1,25,0.500000,96,...,20,295,1752,3893,10,0,7,1,1,0
4,5,1434,0,1.400391,0,11,1,49,0.500000,108,...,18,749,810,1773,15,8,7,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,1700,1,1.900391,0,0,1,54,0.500000,170,...,17,644,913,2121,14,8,15,1,1,0
996,997,609,0,1.799805,1,0,0,13,0.899902,186,...,2,1152,1632,1933,8,1,19,0,1,1
997,998,1185,0,1.400391,0,1,1,8,0.500000,80,...,12,477,825,1223,5,0,14,1,0,0
998,999,1533,1,0.500000,1,0,0,50,0.399902,171,...,12,38,832,2509,15,11,6,0,1,0


In [79]:
X_train = train_df.drop('price_range', axis=1).values
y_train = train_df.price_range.values

In [81]:
print(X_train.shape, y_train.shape)

(2000, 20) (2000,)


In [87]:
X_test_id = test_df.id
X_test = test_df.drop('id', axis=1).values
y_test = np.zeros(1000)

In [91]:
print(X_test.shape, y_test.shape)

(1000, 20) (1000,)


In [96]:
import lightgbm as lgb
from bayes_opt import BayesianOptimization

In [115]:
np.unique(y_train)

array([0, 1, 2, 3], dtype=int8)

In [119]:
def bayes_parameter_opt_lgb(X, y, init_round=15, opt_round=25, n_folds=3, 
                            random_seed=6, n_estimators=10000, output_process=False):
    d_train = lgb.Dataset(data=X_train, label=y_train, free_raw_data=False)
    def lgb_eval(learning_rate, num_leaves, feature_fraction, bagging_fraction, 
                 max_depth, max_bin, min_data_in_leaf, min_sum_hessian_in_leaf, subsample):
        params = {'application':'multiclass', 'num_class': 4, 'metric':'multi_logloss'}
        params['learning_rate'] = max(min(learning_rate, 1), 0)
        params['num_leaves'] = int(round(num_leaves))
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
        params['max_depth'] = int(round(max_depth))
        params['max_bin'] = int(round(max_bin))
        params['min_data_in_leaf'] = int(round(min_data_in_leaf))
        params['min_sum_hessian_in_leaf'] = min_sum_hessian_in_leaf
        params['subsample'] = max(min(subsample, 1), 0)
        
        cv_result = lgb.cv(params, d_train, nfold=n_folds, seed=random_seed, 
                           stratified=True, verbose_eval=200, metrics=['multi_logloss'])
        return max(cv_result['multi_logloss-mean'])
    lgbBO = BayesianOptimization(lgb_eval, {'learning_rate': (0.0001, 0.01),
                                            'num_leaves': (10, 50),
                                            'feature_fraction': (0.1, 0.9),
                                            'bagging_fraction': (0.2, 1.0),
                                            'max_depth': (2, 30),
                                            'max_bin': (10, 50),
                                            'min_data_in_leaf': (20, 80),
                                            'min_sum_hessian_in_leaf': (0, 100),
                                            'subsample': (0.01, 1.0)})
    
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)
    
    model_auc=[]
    for model in range(len(lgbBO.res)):
        model_auc.append(lgbBO.res[model]['target'])
    
    # return best parameters
    return lgbBO.res[pd.Series(model_auc).idxmax()]['target'],lgbBO.res[pd.Series(model_auc).idxmax()]['params']

opt_params = bayes_parameter_opt_lgb(X, y, init_round=5, opt_round=10, n_folds=3, random_seed=6, n_estimators=10000)

|   iter    |  target   | baggin... | featur... | learni... |  max_bin  | max_depth | min_da... | min_su... | num_le... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 393
[LightGBM] [Info] Number of data points in the train set: 1333, number of used features: 20
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 393
[LightGBM] [Info] Number of data points in the train set: 1333, number of used features: 20
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 393
[LightGBM] [Info] Number of data points in the train set: 1334, number of used features: 20
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] 















| [0m 1       [0m | [0m 1.385   [0m | [0m 0.9303  [0m | [0m 0.2996  [0m | [0m 0.002391[0m | [0m 40.25   [0m | [0m 26.59   [0m | [0m 65.46   [0m | [0m 85.97   [0m | [0m 25.12   [0m | [0m 0.701   [0m |


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 352
[LightGBM] [Info] Number of data points in the train set: 1333, number of used features: 20
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 352
[LightGBM] [Info] Number of data points in the train set: 1333, number of used features: 20
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 352
[LightGBM] [Info] Number of data points in the train set: 1334, number of used features: 20
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.384046
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.384046
[LightGBM



















| [95m 2       [0m | [95m 1.386   [0m | [95m 0.2977  [0m | [95m 0.1301  [0m | [95m 0.002741[0m | [95m 32.69   [0m | [95m 14.59   [0m | [95m 56.76   [0m | [95m 26.5    [0m | [95m 12.99   [0m | [95m 0.5197  [0m |
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 150
[LightGBM] [Info] Number of data points in the train set: 1333, number of used features: 20
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 150
[LightGBM] [Info] Number of data points in the train set: 1333, number of used features: 20
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 150
[LightGBM] [Info] Number of data points in the train set: 1334, number of used features: 20
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1















| [0m 3       [0m | [0m 1.379   [0m | [0m 0.9403  [0m | [0m 0.7832  [0m | [0m 0.007632[0m | [0m 10.32   [0m | [0m 7.267   [0m | [0m 34.24   [0m | [0m 85.7    [0m | [0m 20.73   [0m | [0m 0.899   [0m |
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 352
[LightGBM] [Info] Number of data points in the train set: 1333, number of used features: 20
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 352
[LightGBM] [Info] Number of data points in the train set: 1333, number of used features: 20
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 352
[LightGBM] [Info] Number of data points in the train set: 1334, number of used features: 20
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.384046
[LightGBM] [I

















| [0m 4       [0m | [0m 1.379   [0m | [0m 0.7437  [0m | [0m 0.8796  [0m | [0m 0.005856[0m | [0m 33.11   [0m | [0m 8.91    [0m | [0m 65.46   [0m | [0m 33.94   [0m | [0m 31.09   [0m | [0m 0.3975  [0m |
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 428
[LightGBM] [Info] Number of data points in the train set: 1333, number of used features: 20
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 428
[LightGBM] [Info] Number of data points in the train set: 1333, number of used features: 20
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 428
[LightGBM] [Info] Number of data points in the train set: 1334, number of used features: 20
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.384046
[LightGBM] [I

[LightGBM] [Info] Start training from score -1.384796
[LightGBM] [Info] Start training from score -1.384796
[LightGBM] [Info] Start training from score -1.387795
[LightGBM] [Info] Start training from score -1.387795


















| [0m 5       [0m | [0m 1.375   [0m | [0m 0.8565  [0m | [0m 0.8364  [0m | [0m 0.008605[0m | [0m 45.9    [0m | [0m 22.78   [0m | [0m 29.02   [0m | [0m 51.86   [0m | [0m 30.0    [0m | [0m 0.7602  [0m |
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 352
[LightGBM] [Info] Number of data points in the train set: 1333, number of used features: 20
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 352
[LightGBM] [Info] Number of data points in the train set: 1333, number of used features: 20
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 352
[LightGBM] [Info] Number of data points in the train set: 1334, number of used features: 20
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.384046
[LightGBM] [I















| [0m 6       [0m | [0m 1.385   [0m | [0m 0.2081  [0m | [0m 0.7181  [0m | [0m 0.001198[0m | [0m 32.69   [0m | [0m 13.55   [0m | [0m 59.79   [0m | [0m 28.99   [0m | [0m 13.42   [0m | [0m 0.1679  [0m |


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 364
[LightGBM] [Info] Number of data points in the train set: 1333, number of used features: 20
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 364
[LightGBM] [Info] Number of data points in the train set: 1333, number of used features: 20
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 364
[LightGBM] [Info] Number of data points in the train set: 1334, number of used features: 20
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.384046
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.384046
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] S















| [0m 8       [0m | [0m 1.381   [0m | [0m 0.2     [0m | [0m 0.1188  [0m | [0m 0.01    [0m | [0m 21.37   [0m | [0m 15.92   [0m | [0m 46.6    [0m | [0m 27.36   [0m | [0m 10.0    [0m | [0m 0.01    [0m |
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 1333, number of used features: 20
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 1333, number of used features: 20
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 1334, number of used features: 20
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.384046


[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.384046
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.384796
[LightGBM] [Info] Start training from score -1.384796
[LightGBM] [Info] Start training from score -1.387795
[LightGBM] [Info] Start training from score -1.387795
















| [0m 9       [0m | [0m 1.386   [0m | [0m 0.728   [0m | [0m 0.1176  [0m | [0m 0.001168[0m | [0m 39.36   [0m | [0m 10.98   [0m | [0m 55.31   [0m | [0m 22.09   [0m | [0m 17.06   [0m | [0m 0.1741  [0m |


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 393
[LightGBM] [Info] Number of data points in the train set: 1333, number of used features: 20
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 393
[LightGBM] [Info] Number of data points in the train set: 1333, number of used features: 20
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 393
[LightGBM] [Info] Number of data points in the train set: 1334, number of used features: 20
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.384046
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.384046
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] S

















| [0m 10      [0m | [0m 1.378   [0m | [0m 0.8026  [0m | [0m 0.5798  [0m | [0m 0.008237[0m | [0m 39.57   [0m | [0m 23.46   [0m | [0m 50.59   [0m | [0m 30.06   [0m | [0m 12.25   [0m | [0m 0.8893  [0m |
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 346
[LightGBM] [Info] Number of data points in the train set: 1333, number of used features: 20
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 346
[LightGBM] [Info] Number of data points in the train set: 1333, number of used features: 20
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 346
[LightGBM] [Info] Number of data points in the train set: 1334, number of used features: 20
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.384046
[LightGBM] [I





| [0m 11      [0m | [0m 1.386   [0m | [0m 0.5151  [0m | [0m 0.1     [0m | [0m 0.0001  [0m | [0m 32.27   [0m | [0m 8.525   [0m | [0m 58.41   [0m | [0m 19.96   [0m | [0m 11.23   [0m | [0m 0.9195  [0m |
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 323
[LightGBM] [Info] Number of data points in the train set: 1333, number of used features: 20
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 323
[LightGBM] [Info] Number of data points in the train set: 1333, number of used features: 20
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 323
[LightGBM] [Info] Number of data points in the train set: 1334, number of used features: 20
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.384046
[LightGBM] [I



















| [0m 12      [0m | [0m 1.385   [0m | [0m 0.262   [0m | [0m 0.2839  [0m | [0m 0.001856[0m | [0m 28.48   [0m | [0m 13.3    [0m | [0m 52.64   [0m | [0m 16.63   [0m | [0m 23.13   [0m | [0m 0.8162  [0m |
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 1333, number of used features: 20
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 1333, number of used features: 20
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 1334, number of used features: 20
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.387045
[Li



















| [0m 13      [0m | [0m 1.386   [0m | [0m 0.343   [0m | [0m 0.5862  [0m | [0m 0.000696[0m | [0m 38.85   [0m | [0m 3.628   [0m | [0m 50.75   [0m | [0m 7.392   [0m | [0m 13.91   [0m | [0m 0.841   [0m |
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 334
[LightGBM] [Info] Number of data points in the train set: 1333, number of used features: 20
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 334
[LightGBM] [Info] Number of data points in the train set: 1333, number of used features: 20
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 334
[LightGBM] [Info] Number of data points in the train set: 1334, number of used features: 20
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.384046
[LightGBM] [I









| [0m 14      [0m | [0m 1.379   [0m | [0m 0.8783  [0m | [0m 0.6211  [0m | [0m 0.00718 [0m | [0m 30.1    [0m | [0m 3.004   [0m | [0m 61.29   [0m | [0m 3.311   [0m | [0m 23.45   [0m | [0m 0.964   [0m |


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 364
[LightGBM] [Info] Number of data points in the train set: 1333, number of used features: 20
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 364
[LightGBM] [Info] Number of data points in the train set: 1333, number of used features: 20
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 364
[LightGBM] [Info] Number of data points in the train set: 1334, number of used features: 20
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.384046
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] Start training from score -1.384046
[LightGBM] [Info] Start training from score -1.387045
[LightGBM] [Info] S















| [0m 15      [0m | [0m 1.386   [0m | [0m 0.2     [0m | [0m 0.1     [0m | [0m 0.0001  [0m | [0m 34.57   [0m | [0m 3.357   [0m | [0m 46.49   [0m | [0m 19.61   [0m | [0m 14.09   [0m | [0m 1.0     [0m |


In [159]:
opt_params[1]["num_leaves"] = int(round(opt_params[1]["num_leaves"]))
opt_params[1]['max_depth'] = int(round(opt_params[1]['max_depth']))
opt_params[1]['min_data_in_leaf'] = int(round(opt_params[1]['min_data_in_leaf']))
opt_params[1]['max_bin'] = int(round(opt_params[1]['max_bin']))
opt_params[1]['objective']='binary'
opt_params[1]['metric']='auc'
opt_params[1]['is_unbalance']=True
opt_params[1]['boost_from_average']=False
opt_params=opt_params[1]
opt_params

KeyError: 1

In [120]:
opt_params[1]["num_leaves"] = int(round(opt_params[1]["num_leaves"]))
opt_params[1]['max_depth'] = int(round(opt_params[1]['max_depth']))
opt_params[1]['min_data_in_leaf'] = int(round(opt_params[1]['min_data_in_leaf']))
opt_params[1]['max_bin'] = int(round(opt_params[1]['max_bin']))
opt_params[1]['objective']='multiclass'
opt_params[1]['metric']='multi_logloss'
opt_params[1]['is_unbalance']=True
opt_params[1]['boost_from_average']=False
opt_params=opt_params[1]
opt_params

{'bagging_fraction': 0.2977498730481892,
 'feature_fraction': 0.13010205447180587,
 'learning_rate': 0.002741077306790737,
 'max_bin': 33,
 'max_depth': 15,
 'min_data_in_leaf': 57,
 'min_sum_hessian_in_leaf': 26.49884100602067,
 'num_leaves': 13,
 'subsample': 0.5196592552731106,
 'objective': 'multiclass',
 'metric': 'multi_logloss',
 'is_unbalance': True,
 'boost_from_average': False}

In [122]:
X_train

array([[8.4200000e+02, 0.0000000e+00, 2.1992188e+00, ..., 0.0000000e+00,
        0.0000000e+00, 1.0000000e+00],
       [1.0210000e+03, 1.0000000e+00, 5.0000000e-01, ..., 1.0000000e+00,
        1.0000000e+00, 0.0000000e+00],
       [5.6300000e+02, 1.0000000e+00, 5.0000000e-01, ..., 1.0000000e+00,
        1.0000000e+00, 0.0000000e+00],
       ...,
       [1.9110000e+03, 0.0000000e+00, 8.9990234e-01, ..., 1.0000000e+00,
        1.0000000e+00, 0.0000000e+00],
       [1.5120000e+03, 0.0000000e+00, 8.9990234e-01, ..., 1.0000000e+00,
        1.0000000e+00, 1.0000000e+00],
       [5.1000000e+02, 1.0000000e+00, 2.0000000e+00, ..., 1.0000000e+00,
        1.0000000e+00, 1.0000000e+00]], dtype=float32)

In [125]:
train_df

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.199219,0,1,0,7,0.600098,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.500000,1,0,1,53,0.700195,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.500000,1,2,1,41,0.899902,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.500000,0,0,0,10,0.799805,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.200195,0,13,1,44,0.600098,141,2,...,1208,1212,1411,8,2,15,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,1,0.500000,1,0,1,2,0.799805,106,6,...,1222,1890,668,13,4,19,1,1,0,0
1996,1965,1,2.599609,1,0,0,39,0.199951,187,4,...,915,1965,2032,11,10,16,1,1,1,2
1997,1911,0,0.899902,1,1,1,36,0.700195,108,8,...,868,1632,3057,9,1,5,1,1,0,3
1998,1512,0,0.899902,0,4,1,46,0.099976,145,5,...,336,670,869,18,10,19,1,1,1,0


In [145]:
train_df.shape

(2000, 21)

In [170]:
train_df

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.199219,0,1,0,7,0.600098,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.500000,1,0,1,53,0.700195,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.500000,1,2,1,41,0.899902,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.500000,0,0,0,10,0.799805,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.200195,0,13,1,44,0.600098,141,2,...,1208,1212,1411,8,2,15,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,1,0.500000,1,0,1,2,0.799805,106,6,...,1222,1890,668,13,4,19,1,1,0,0
1996,1965,1,2.599609,1,0,0,39,0.199951,187,4,...,915,1965,2032,11,10,16,1,1,1,2
1997,1911,0,0.899902,1,1,1,36,0.700195,108,8,...,868,1632,3057,9,1,5,1,1,0,3
1998,1512,0,0.899902,0,4,1,46,0.099976,145,5,...,336,670,869,18,10,19,1,1,1,0


In [178]:
test_df

Unnamed: 0,id,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,...,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,1,1043,1,1.8,1,14,0,5,0.1,193,...,16,226,1412,3476,12,7,2,0,1,0
1,2,841,1,0.5,1,4,1,61,0.8,191,...,12,746,857,3895,6,0,7,1,0,0
2,3,1807,1,2.8,0,1,0,27,0.9,186,...,4,1270,1366,2396,17,10,10,0,1,1
3,4,1546,0,0.5,1,18,1,25,0.5,96,...,20,295,1752,3893,10,0,7,1,1,0
4,5,1434,0,1.4,0,11,1,49,0.5,108,...,18,749,810,1773,15,8,7,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,1700,1,1.9,0,0,1,54,0.5,170,...,17,644,913,2121,14,8,15,1,1,0
996,997,609,0,1.8,1,0,0,13,0.9,186,...,2,1152,1632,1933,8,1,19,0,1,1
997,998,1185,0,1.4,0,1,1,8,0.5,80,...,12,477,825,1223,5,0,14,1,0,0
998,999,1533,1,0.5,1,0,0,50,0.4,171,...,12,38,832,2509,15,11,6,0,1,0


In [179]:
features = [c for c in train_df.columns[:-1]]

folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=31416)
oof = np.zeros(len(train_df))
predictions = np.zeros(len(test_df))
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df, y_train)):
    print("Fold {}".format(fold_))
    trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=y_train[trn_idx])
    val_data = lgb.Dataset(train_df.iloc[val_idx][features], label=y_train[val_idx])

    num_round = 15000
    clf = lgb.train(opt_params, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=500, early_stopping_rounds = 250)
    oof[val_idx] = clf.predict(train_df.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions += clf.predict(test_df[features], num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(roc_auc_score_multiclass(y_train, oof)))

Fold 0
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 352
[LightGBM] [Info] Number of data points in the train set: 1800, number of used features: 20
Training until validation scores don't improve for 250 rounds


















[500]	training's multi_logloss: 1.06771	valid_1's multi_logloss: 1.08686


























[1000]	training's multi_logloss: 0.873972	valid_1's multi_logloss: 0.907144






























[1500]	training's multi_logloss: 0.74649	valid_1's multi_logloss: 0.793569
































[2000]	training's multi_logloss: 0.641742	valid_1's multi_logloss: 0.696723
































[2500]	training's multi_logloss: 0.564835	valid_1's multi_logloss: 0.625525






























[3000]	training's multi_logloss: 0.509781	valid_1's multi_logloss: 0.575468
































[3500]	training's multi_logloss: 0.466745	valid_1's multi_logloss: 0.538243






























[4000]	training's multi_logloss: 0.430423	valid_1's multi_logloss: 0.506734
































[4500]	training's multi_logloss: 0.399441	valid_1's multi_logloss: 0.480728






























[5000]	training's multi_logloss: 0.372286	valid_1's multi_logloss: 0.45765






























[5500]	training's multi_logloss: 0.349798	valid_1's multi_logloss: 0.438556






























[6000]	training's multi_logloss: 0.330063	valid_1's multi_logloss: 0.422151






























[6500]	training's multi_logloss: 0.312256	valid_1's multi_logloss: 0.407285






























[7000]	training's multi_logloss: 0.295398	valid_1's multi_logloss: 0.39299




























[7500]	training's multi_logloss: 0.281542	valid_1's multi_logloss: 0.381945




























[8000]	training's multi_logloss: 0.269988	valid_1's multi_logloss: 0.372008


























[8500]	training's multi_logloss: 0.259554	valid_1's multi_logloss: 0.363123


























[9000]	training's multi_logloss: 0.250064	valid_1's multi_logloss: 0.355586




























[9500]	training's multi_logloss: 0.241335	valid_1's multi_logloss: 0.348111
























[10000]	training's multi_logloss: 0.233743	valid_1's multi_logloss: 0.341684


























[10500]	training's multi_logloss: 0.227168	valid_1's multi_logloss: 0.336179


























[11000]	training's multi_logloss: 0.220555	valid_1's multi_logloss: 0.33077


























[11500]	training's multi_logloss: 0.214908	valid_1's multi_logloss: 0.326665


























[12000]	training's multi_logloss: 0.20955	valid_1's multi_logloss: 0.322986
























[12500]	training's multi_logloss: 0.204506	valid_1's multi_logloss: 0.319795
























[13000]	training's multi_logloss: 0.200303	valid_1's multi_logloss: 0.316912
























[13500]	training's multi_logloss: 0.196225	valid_1's multi_logloss: 0.313938






















[14000]	training's multi_logloss: 0.192103	valid_1's multi_logloss: 0.311845


























[14500]	training's multi_logloss: 0.188401	valid_1's multi_logloss: 0.309451
























[15000]	training's multi_logloss: 0.184876	valid_1's multi_logloss: 0.307265
Did not meet early stopping. Best iteration is:
[15000]	training's multi_logloss: 0.184876	valid_1's multi_logloss: 0.307265


ValueError: shape mismatch: value array of shape (200,4) could not be broadcast to indexing result of shape (200,)