In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb

# Problem dimensions.
num_inputs = 21      # not referenced again in the visible cells
num_outputs = 39     # forwarded to LightGBM as 'num_class'

# Pre-computed feature / label arrays stored as .npy files.
train_features = np.load('./data/train_features.npy')
train_labels = np.load('./data/train_labels.npy')
test_features = np.load('./data/test_features.npy')

# Training set wrapper that every lgb.cv / lgb.train call below reuses.
data_train = lgb.Dataset(train_features, label=train_labels)

# Starting hyper-parameters; the trailing notes name the step that tunes each one.
params = {
    # --- fixed for the whole notebook ---
    'boosting': 'gbdt',
    'objective': 'multiclass',
    'metrics': 'multi_logloss',
    'num_class': num_outputs,
    'verbosity': 1,
    # --- tuned in the steps below ---
    'max_depth': 6,
    'num_leaves': 50,            # commonly used value; tuned in step 2
    'min_data_in_leaf': 20,      # default value; tuned in step 3
    'feature_fraction': 0.8,     # commonly used value; tuned in step 4
    'learning_rate': 0.1,        # default value; tuned in step 5
}

Step1: num_boost_round

In [5]:
# Step 1: 3-fold CV with early stopping to pick the number of boosting rounds.
# The length of the returned metric history is reported as the best round count.
cv_options = dict(
    num_boost_round=1000,
    nfold=3,
    shuffle=True,
    early_stopping_rounds=40,
    verbose_eval=20,
)
results = lgb.cv(params, data_train, **cv_options)

print('best num_boost_round:', len(results['multi_logloss-mean']))
# Last entry of each history: mean and standard deviation across the folds.
for label, key in (('last mean:', 'multi_logloss-mean'),
                   ('last stdv:', 'multi_logloss-stdv')):
    print(label, results[key][-1])

[20]	cv_agg's multi_logloss: 2.50468 + 0.031806
[40]	cv_agg's multi_logloss: 2.43431 + 0.00721752
[60]	cv_agg's multi_logloss: 2.42023 + 0.00977275
[80]	cv_agg's multi_logloss: 2.4096 + 0.00525619
[100]	cv_agg's multi_logloss: 2.39575 + 0.00471342
[120]	cv_agg's multi_logloss: 2.39878 + 0.0136678
[140]	cv_agg's multi_logloss: 2.39512 + 0.0103291
[160]	cv_agg's multi_logloss: 2.38855 + 0.00680853
[180]	cv_agg's multi_logloss: 2.39569 + 0.0110776
[200]	cv_agg's multi_logloss: 2.3862 + 0.00131608
[220]	cv_agg's multi_logloss: 2.39397 + 0.0182124
[240]	cv_agg's multi_logloss: 2.39446 + 0.0215042
best num_boost_round: 214
last mean: 2.379645842199627
last stdv: 0.0024214570330422813


In [None]:
# Baseline model (v1): the same settings that were cross-validated in step 1.
params = {
    # --- fixed for the whole notebook ---
    'boosting': 'gbdt',
    'objective': 'multiclass',
    'metrics': 'multi_logloss',
    'num_class': num_outputs,
    'verbosity': 1,
    # --- still to be tuned ---
    'max_depth': 6,
    'num_leaves': 50,            # commonly used value; tuned in step 2
    'min_data_in_leaf': 20,      # default value; tuned in step 3
    'feature_fraction': 0.8,     # commonly used value; tuned in step 4
    'learning_rate': 0.1,        # default value; tuned in step 5
}

# Fit on the full training set for the 214 rounds reported by the step-1 CV run,
# then persist the booster.
gbm = lgb.train(params, data_train, num_boost_round=214)
gbm.save_model('../working/gbm(v1).txt')

# Predict on the test features.
testResult = gbm.predict(test_features)

# The sample submission supplies the row index and the output column names
# (every column after the first one).
sampleSubmission = pd.read_csv('../input/sf-crime/sampleSubmission.csv.zip')
Result_pd = pd.DataFrame(
    testResult,
    index=sampleSubmission.index,
    columns=sampleSubmission.columns[1:],
)
Result_pd.to_csv('../working/sampleSubmission(gbmv1).csv', index_label='Id')

提交

Step2: max_depth 和 num_leaves

In [17]:
# Step 2, coarse grid: max_depth in {3, 5, 7} x num_leaves in {20, 60}.
for max_depth in (3, 5, 7):            # same values as range(3, 8, 2)
    for num_leaves in (20, 60):        # same values as range(20, 70, 40)
        # Overwrite the two entries under test in the shared params dict.
        params.update(max_depth=max_depth, num_leaves=num_leaves)
        print('max_depth = %d , num_leaves = %d' %
              (params['max_depth'], params['num_leaves']))
        # 3-fold CV capped at the 214 rounds found in step 1.
        results = lgb.cv(
            params,
            data_train,
            num_boost_round=214,
            nfold=3,
            shuffle=True,
            early_stopping_rounds=40,
            verbose_eval=100,
        )
        print('last mean:', results['multi_logloss-mean'][-1])
        print('last stdv:', results['multi_logloss-stdv'][-1])

max_depth = 3 , num_leaves = 20
[100]	cv_agg's multi_logloss: 2.44552 + 0.00163549
[200]	cv_agg's multi_logloss: 2.42769 + 0.00962192
last mean: 2.4220655707599197
last stdv: 0.0065027223149882645
max_depth = 3 , num_leaves = 60
[100]	cv_agg's multi_logloss: 2.44552 + 0.00163549
[200]	cv_agg's multi_logloss: 2.42769 + 0.00962192
last mean: 2.4220655707599197
last stdv: 0.0065027223149882645
max_depth = 5 , num_leaves = 20
[100]	cv_agg's multi_logloss: 2.40996 + 0.00366558
[200]	cv_agg's multi_logloss: 2.39024 + 0.00372836
last mean: 2.3893025015374247
last stdv: 0.003230273930546185
max_depth = 5 , num_leaves = 60
[100]	cv_agg's multi_logloss: 2.41023 + 0.00888953
[200]	cv_agg's multi_logloss: 2.39048 + 0.00908936
last mean: 2.387051839466166
last stdv: 0.009956365968283616
max_depth = 7 , num_leaves = 20
last mean: 2.577312549960696
last stdv: 0.016977703463318776
max_depth = 7 , num_leaves = 60
last mean: 2.4587145258610956
last stdv: 0.008400881474445983


接着细调：

In [11]:
# Step 2, refinement: hold max_depth at 6 and try num_leaves 35 / 45 / 55.
max_depth = 6
for num_leaves in (35, 45, 55):
    # Overwrite the two entries under test in the shared params dict.
    params.update(max_depth=max_depth, num_leaves=num_leaves)
    print('max_depth = %d , num_leaves = %d' %
          (params['max_depth'], params['num_leaves']))
    # 3-fold CV capped at the 214 rounds found in step 1.
    results = lgb.cv(
        params,
        data_train,
        num_boost_round=214,
        nfold=3,
        shuffle=True,
        early_stopping_rounds=40,
        verbose_eval=100,
    )
    print('last mean:', results['multi_logloss-mean'][-1])
    print('last stdv:', results['multi_logloss-stdv'][-1])

max_depth = 6 , num_leaves = 35
last mean: 2.4891922482617757
last stdv: 0.010804137678622594
max_depth = 6 , num_leaves = 45
last mean: 2.4337760882358275
last stdv: 0.023151302306342825
max_depth = 6 , num_leaves = 55
last mean: 2.4854409810898215
last stdv: 0.0020869982223483607


In [7]:
# Step 2, refinement: max_depth 6 then 5, each with num_leaves 40 and 50.
depth_leaf_pairs = [(depth, leaves) for depth in (6, 5) for leaves in (40, 50)]
for max_depth, num_leaves in depth_leaf_pairs:
    # Overwrite the two entries under test in the shared params dict.
    params.update(max_depth=max_depth, num_leaves=num_leaves)
    print('max_depth = %d , num_leaves = %d' %
          (params['max_depth'], params['num_leaves']))
    # 3-fold CV capped at the 214 rounds found in step 1.
    results = lgb.cv(
        params,
        data_train,
        num_boost_round=214,
        nfold=3,
        shuffle=True,
        early_stopping_rounds=40,
        verbose_eval=100,
    )
    print('last mean:', results['multi_logloss-mean'][-1])
    print('last stdv:', results['multi_logloss-stdv'][-1])

max_depth = 6 , num_leaves = 40
[100]	cv_agg's multi_logloss: 2.39696 + 0.0011884
last mean: 2.3864925333355216
last stdv: 0.004396506644347127
max_depth = 6 , num_leaves = 50
[100]	cv_agg's multi_logloss: 2.39575 + 0.00471342
[200]	cv_agg's multi_logloss: 2.3862 + 0.00131608
last mean: 2.379645842199627
last stdv: 0.0024214570330422813
max_depth = 5 , num_leaves = 40
[100]	cv_agg's multi_logloss: 2.41023 + 0.00888953
[200]	cv_agg's multi_logloss: 2.39048 + 0.00908936
last mean: 2.387051839466166
last stdv: 0.009956365968283616
max_depth = 5 , num_leaves = 50
[100]	cv_agg's multi_logloss: 2.41023 + 0.00888953
[200]	cv_agg's multi_logloss: 2.39048 + 0.00908936
last mean: 2.387051839466166
last stdv: 0.009956365968283616


In [9]:
# Step 2, refinement: max_depth 6 with num_leaves on either side of 40 and 50.
max_depth = 6
for num_leaves in (38, 42, 48, 52):
    # Overwrite the two entries under test in the shared params dict.
    params.update(max_depth=max_depth, num_leaves=num_leaves)
    print('max_depth = %d , num_leaves = %d' %
          (params['max_depth'], params['num_leaves']))
    # 3-fold CV capped at the 214 rounds found in step 1.
    results = lgb.cv(
        params,
        data_train,
        num_boost_round=214,
        nfold=3,
        shuffle=True,
        early_stopping_rounds=40,
        verbose_eval=100,
    )
    print('last mean:', results['multi_logloss-mean'][-1])
    print('last stdv:', results['multi_logloss-stdv'][-1])

max_depth = 6 , num_leaves = 38
[100]	cv_agg's multi_logloss: 2.3998 + 0.00411068
last mean: 2.3915816256398634
last stdv: 0.002051365366130591
max_depth = 6 , num_leaves = 42
[100]	cv_agg's multi_logloss: 2.41089 + 0.00653975
last mean: 2.3941764198379887
last stdv: 0.005884772657223358
max_depth = 6 , num_leaves = 48
[100]	cv_agg's multi_logloss: 2.40125 + 0.00905401
[200]	cv_agg's multi_logloss: 2.41403 + 0.0266957
last mean: 2.388492100373433
last stdv: 0.004900892114415105
max_depth = 6 , num_leaves = 52
last mean: 2.432784516485041
last stdv: 0.007515011426455112


In [3]:
# Step 2, refinement: max_depth 6 with the immediate neighbours of 50 and 40.
max_depth = 6
for num_leaves in (49, 51, 39, 41):
    # Overwrite the two entries under test in the shared params dict.
    params.update(max_depth=max_depth, num_leaves=num_leaves)
    print('max_depth = %d , num_leaves = %d' %
          (params['max_depth'], params['num_leaves']))
    # 3-fold CV capped at the 214 rounds found in step 1.
    results = lgb.cv(
        params,
        data_train,
        num_boost_round=214,
        nfold=3,
        shuffle=True,
        early_stopping_rounds=40,
        verbose_eval=100,
    )
    print('last mean:', results['multi_logloss-mean'][-1])
    print('last stdv:', results['multi_logloss-stdv'][-1])

max_depth = 6 , num_leaves = 49
[100]	cv_agg's multi_logloss: 2.45417 + 0.0735405
last mean: 2.440617942865168
last stdv: 0.037959388186554255
max_depth = 6 , num_leaves = 51
[100]	cv_agg's multi_logloss: 2.3997 + 0.00450754
[200]	cv_agg's multi_logloss: 2.38764 + 0.0061293
last mean: 2.3793165805708867
last stdv: 0.0016149213685509095
max_depth = 6 , num_leaves = 39
[100]	cv_agg's multi_logloss: 2.48038 + 0.070169
last mean: 2.428458342356551
last stdv: 0.01570288199424567
max_depth = 6 , num_leaves = 41
last mean: 2.4222211399544835
last stdv: 0.007000683981481878


In [5]:
# Step 2, gap filling: max_depth 5 with num_leaves 10 and 30.
max_depth = 5
for num_leaves in (10, 30):
    # Overwrite the two entries under test in the shared params dict.
    params.update(max_depth=max_depth, num_leaves=num_leaves)
    print('max_depth = %d , num_leaves = %d' %
          (params['max_depth'], params['num_leaves']))
    # 3-fold CV capped at the 214 rounds found in step 1.
    results = lgb.cv(
        params,
        data_train,
        num_boost_round=214,
        nfold=3,
        shuffle=True,
        early_stopping_rounds=40,
        verbose_eval=100,
    )
    print('last mean:', results['multi_logloss-mean'][-1])
    print('last stdv:', results['multi_logloss-stdv'][-1])

max_depth = 5 , num_leaves = 10
last mean: 2.4980897550813435
last stdv: 0.0004637598353433645
max_depth = 5 , num_leaves = 30
[100]	cv_agg's multi_logloss: 2.41008 + 0.0121798
[200]	cv_agg's multi_logloss: 2.38782 + 0.00929667
last mean: 2.3878041620756467
last stdv: 0.008733961375833862


得到下表：

max_depth | num_leaves | loss-mean | loss-stdv
:-: | :-: | :-: | :-:
3 | 20 | 2.422065570759919 | 0.00650272231498826
3 | 60 | 2.422065570759919 | 0.00650272231498826
5 | 10 | 2.498089755081343 | 0.00046375983534336
5 | 20 | **2.389302501537424** | 0.00323027393054618
5 | 30 | **2.387804162075646** | 0.00873396137583386
5 | 40 | **2.387051839466166** | 0.00995636596828361
5 | 50 | **2.387051839466166** | 0.00995636596828361
5 | 60 | **2.387051839466166** | 0.00995636596828361
6 | 35 | 2.489192248261775 | 0.01080413767862259
6 | 38 | **2.391581625639863** | 0.00205136536613059
6 | 39 | 2.428458342356551 | 0.01570288199424567
6 | 40 | **2.386492533335521** | 0.00439650664434712
6 | 41 | 2.422221139954483 | 0.00700068398148187
6 | 42 | **2.394176419837988** | 0.00588477265722335
6 | 45 | 2.433776088235827 | 0.02315130230634282
6 | 48 | **2.388492100373433** | 0.00490089211441510
6 | 49 | 2.440617942865168 | 0.03795938818655425
6 | 50 | **2.379645842199627** | 0.00242145703304228
**6** | **51** | **2.379316580570886** | 0.00161492136855090
6 | 52 | 2.432784516485041 | 0.00751501142645511
6 | 55 | 2.485440981089821 | 0.00208699822234836
7 | 20 | 2.577312549960696 | 0.01697770346331877
7 | 60 | 2.458714525861095 | 0.00840088147444598

更新params

In [3]:
# Parameters after step 2: max_depth = 6 and num_leaves = 51 are now fixed.
params = {
    # --- no longer tuned ---
    'boosting': 'gbdt',
    'objective': 'multiclass',
    'metrics': 'multi_logloss',
    'num_class': num_outputs,
    'verbosity': 1,
    'max_depth': 6,
    'num_leaves': 51,
    # --- still to be tuned ---
    'min_data_in_leaf': 20,      # default value; tuned in step 3
    'feature_fraction': 0.8,     # commonly used value; tuned in step 4
    'learning_rate': 0.1,        # default value; tuned in step 5
}

Step3: min_data_in_leaf

In [6]:
# Step 3, coarse pass: bracket the current min_data_in_leaf (20) with 10 and 30.
for min_data_in_leaf in (10, 30):
    # Overwrite the entry under test in the shared params dict.
    params.update(min_data_in_leaf=min_data_in_leaf)
    print('min_data_in_leaf = %d' % params['min_data_in_leaf'])
    # 3-fold CV capped at the 214 rounds found in step 1.
    results = lgb.cv(
        params,
        data_train,
        num_boost_round=214,
        nfold=3,
        shuffle=True,
        early_stopping_rounds=40,
        verbose_eval=100,
    )
    print('last mean:', results['multi_logloss-mean'][-1])
    print('last stdv:', results['multi_logloss-stdv'][-1])

min_data_in_leaf = 10
[100]	cv_agg's multi_logloss: 2.41101 + 0.00927644
last mean: 2.3988022363060515
last stdv: 0.008656683290247618
min_data_in_leaf = 30
[100]	cv_agg's multi_logloss: 2.39748 + 0.0100026
last mean: 2.383482452281396
last stdv: 0.0011820715015853234


In [7]:
# Step 3, refinement: values immediately around 20.
for min_data_in_leaf in (18, 19, 21, 22):
    # Overwrite the entry under test in the shared params dict.
    params.update(min_data_in_leaf=min_data_in_leaf)
    print('min_data_in_leaf = %d' % params['min_data_in_leaf'])
    # 3-fold CV capped at the 214 rounds found in step 1.
    results = lgb.cv(
        params,
        data_train,
        num_boost_round=214,
        nfold=3,
        shuffle=True,
        early_stopping_rounds=40,
        verbose_eval=100,
    )
    print('last mean:', results['multi_logloss-mean'][-1])
    print('last stdv:', results['multi_logloss-stdv'][-1])

min_data_in_leaf = 18
[100]	cv_agg's multi_logloss: 2.39957 + 0.003926
last mean: 2.3869818226693784
last stdv: 0.0033857389768613276
min_data_in_leaf = 19
[100]	cv_agg's multi_logloss: 2.46381 + 0.102631
last mean: 2.3956323643380633
last stdv: 0.002191965018621247
min_data_in_leaf = 21
[100]	cv_agg's multi_logloss: 2.39539 + 0.00630973
last mean: 2.3835612834535924
last stdv: 0.004011287921746836
min_data_in_leaf = 22
last mean: 2.4720458242466674
last stdv: 0.010899544962079137


In [9]:
# Step 3, refinement: two values between 22 and 30.
for min_data_in_leaf in (25, 28):
    # Overwrite the entry under test in the shared params dict.
    params.update(min_data_in_leaf=min_data_in_leaf)
    print('min_data_in_leaf = %d' % params['min_data_in_leaf'])
    # 3-fold CV capped at the 214 rounds found in step 1.
    results = lgb.cv(
        params,
        data_train,
        num_boost_round=214,
        nfold=3,
        shuffle=True,
        early_stopping_rounds=40,
        verbose_eval=100,
    )
    print('last mean:', results['multi_logloss-mean'][-1])
    print('last stdv:', results['multi_logloss-stdv'][-1])

min_data_in_leaf = 25
[100]	cv_agg's multi_logloss: 2.38841 + 0.00131585
[200]	cv_agg's multi_logloss: 2.38613 + 0.0149716
last mean: 2.375107782694854
last stdv: 0.004941879444657266
min_data_in_leaf = 28
last mean: 2.424026943571746
last stdv: 0.001265288744226837


In [11]:
# Step 3, refinement: the immediate neighbours of 25.
for min_data_in_leaf in (24, 26):
    # Overwrite the entry under test in the shared params dict.
    params.update(min_data_in_leaf=min_data_in_leaf)
    print('min_data_in_leaf = %d' % params['min_data_in_leaf'])
    # 3-fold CV capped at the 214 rounds found in step 1.
    results = lgb.cv(
        params,
        data_train,
        num_boost_round=214,
        nfold=3,
        shuffle=True,
        early_stopping_rounds=40,
        verbose_eval=100,
    )
    print('last mean:', results['multi_logloss-mean'][-1])
    print('last stdv:', results['multi_logloss-stdv'][-1])

min_data_in_leaf = 24
[100]	cv_agg's multi_logloss: 2.39916 + 0.00999138
[200]	cv_agg's multi_logloss: 2.39929 + 0.0225062
last mean: 2.3819529347976385
last stdv: 0.002575471097658755
min_data_in_leaf = 26
[100]	cv_agg's multi_logloss: 2.41997 + 0.0279787
[200]	cv_agg's multi_logloss: 2.4086 + 0.0314241
last mean: 2.3791775270888826
last stdv: 0.004678329123338282


min_data_in_leaf | loss-mean | loss-stdv
:-: | :-: | :-: 
10 | 2.398802236306051 | 0.008656683290247618
18 | 2.386981822669378 | 0.003385738976861327
19 | 2.395632364338063 | 0.002191965018621247
20 | **2.379316580570886** | **0.001614921368550909**
21 | 2.383561283453592 | 0.004011287921746836
22 | 2.472045824246667 | 0.010899544962079137
24 | 2.381952934797638 | 0.002575471097658755
25 | **2.375107782694854** | 0.004941879444657266
26 | **2.379177527088882** | 0.004678329123338282
28 | 2.424026943571746 | **0.001265288744226837**
30 | 2.383482452281396 | **0.001182071501585323**

更新params

In [17]:
# Parameters after step 3: min_data_in_leaf = 25 is now fixed as well.
params = {
    # --- no longer tuned ---
    'boosting': 'gbdt',
    'objective': 'multiclass',
    'metrics': 'multi_logloss',
    'num_class': num_outputs,
    'verbosity': 1,
    'max_depth': 6,
    'num_leaves': 51,
    'min_data_in_leaf': 25,
    # --- still to be tuned ---
    'feature_fraction': 0.8,     # commonly used value; tuned in step 4
    'learning_rate': 0.1,        # default value; tuned in step 5
}

Step4: feature_fraction

feature_fraction | loss-mean | loss-stdv
:-: | :-: | :-:
0.60 | 2.385215748483773 | 0.012141673640713
0.70 | 2.389174530713541 | 0.002666629435758
0.76 | 2.384927671009105 | 0.003871886355262
0.78 | 2.384927671009105 | 0.003871886355262
0.79 | 2.375107782694854 | 0.004941879444657
0.80 | 2.375107782694854 | 0.004941879444657
0.81 | 2.375107782694854 | 0.004941879444657
0.82 | 2.375107782694854 | 0.004941879444657
0.84 | 2.402827931259069 | 0.012815010197265
0.90 | 2.470756387332203 | 0.092283616674221

In [33]:
# Parameters after step 4: feature_fraction = 0.79 is now fixed as well.
params = {
    # --- no longer tuned ---
    'boosting': 'gbdt',
    'objective': 'multiclass',
    'metrics': 'multi_logloss',
    'num_class': num_outputs,
    'verbosity': 1,
    'max_depth': 6,
    'num_leaves': 51,
    'min_data_in_leaf': 25,
    'feature_fraction': 0.79,
    # --- still to be tuned ---
    'learning_rate': 0.1,        # default value; tuned in step 5
}

Step5: learning_rate

In [34]:
# Step 5: keep the tuned tree parameters and lower learning_rate from 0.1 to 0.01.
params = {
    'boosting': 'gbdt',
    'objective': 'multiclass',
    'metrics': 'multi_logloss',
    'num_class': num_outputs,
    'verbosity': 1,
    'max_depth': 6,
    'num_leaves': 51,            # chosen in step 2
    'min_data_in_leaf': 25,      # chosen in step 3
    'feature_fraction': 0.79,    # chosen in step 4
    'learning_rate': 0.01,       # value under test in this step
}

# Larger round budget and longer early-stopping patience than in the earlier steps.
cv_options = dict(
    num_boost_round=10000,
    nfold=3,
    shuffle=True,
    early_stopping_rounds=100,
    verbose_eval=50,
)
results = lgb.cv(params, data_train, **cv_options)

print('best num_boost_round:', len(results['multi_logloss-mean']))
# Last entry of each history: mean and standard deviation across the folds.
for label, key in (('last mean:', 'multi_logloss-mean'),
                   ('last stdv:', 'multi_logloss-stdv')):
    print(label, results[key][-1])

[50]	cv_agg's multi_logloss: 2.57307 + 0.00056697
[100]	cv_agg's multi_logloss: 2.51931 + 0.000638326
[150]	cv_agg's multi_logloss: 2.48507 + 0.000721345
[200]	cv_agg's multi_logloss: 2.46164 + 0.00061995
[250]	cv_agg's multi_logloss: 2.44478 + 0.00061603
[300]	cv_agg's multi_logloss: 2.43207 + 0.000663855
[350]	cv_agg's multi_logloss: 2.42213 + 0.000679853
[400]	cv_agg's multi_logloss: 2.41438 + 0.000705551
[450]	cv_agg's multi_logloss: 2.40821 + 0.000819548
[500]	cv_agg's multi_logloss: 2.40324 + 0.00086405
[550]	cv_agg's multi_logloss: 2.39902 + 0.000895936
[600]	cv_agg's multi_logloss: 2.39527 + 0.000906648
[650]	cv_agg's multi_logloss: 2.39197 + 0.000893011
[700]	cv_agg's multi_logloss: 2.38895 + 0.000808116
[750]	cv_agg's multi_logloss: 2.38624 + 0.00080451
[800]	cv_agg's multi_logloss: 2.38366 + 0.000755893
[850]	cv_agg's multi_logloss: 2.38128 + 0.000807969
[900]	cv_agg's multi_logloss: 2.37916 + 0.000790989
[950]	cv_agg's multi_logloss: 2.37711 + 0.00076141
[1000]	cv_agg's mul

KeyboardInterrupt: 

In [2]:
# Final model (v2): tuned tree parameters with learning_rate = 0.01.
params = {
    'boosting': 'gbdt',
    'objective': 'multiclass',
    'metrics': 'multi_logloss',
    'num_class': num_outputs,
    'verbosity': 1,
    'max_depth': 6,
    'num_leaves': 51,
    'min_data_in_leaf': 25,
    'feature_fraction': 0.79,
    'learning_rate': 0.01,
}

# NOTE(review): the step-5 CV output ends in a KeyboardInterrupt near round 1000,
# so 5000 rounds is not a CV-selected value — confirm before relying on it.
gbm = lgb.train(params, data_train, num_boost_round=5000)
gbm.save_model('../working/gbm(v2).txt')

# Load the test features again in this cell, then predict on them.
test_features = np.load('./data/test_features.npy')
testResult = gbm.predict(test_features)

# The sample submission supplies the row index and the output column names
# (every column after the first one).
sampleSubmission = pd.read_csv('../input/sf-crime/sampleSubmission.csv.zip')
Result_pd = pd.DataFrame(
    testResult,
    index=sampleSubmission.index,
    columns=sampleSubmission.columns[1:],
)
Result_pd.to_csv('../working/sampleSubmission(gbmv2).csv', index_label='Id')