<b> Coding by python2.7
   
authors: Qixiang PENG, Zizhao LI

In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import log_loss
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import GridSearchCV  

<b>lode data set

In [15]:
train = pd.read_csv('./data/train.csv')
# class between [1,9]
train_y = train['target'].apply(lambda s: int(s[-1:])).values
train_X = train.drop('id', axis=1)
train_X = train_X.drop('target', axis=1)
X_test = pd.read_csv('./data/test.csv')
X_test = X_test.drop('id', axis=1).values
# split train set into 2 parts with same distribution: 80% train, 20% validation
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
for train_index, test_index in sss.split(train_X.values, train_y):
    X_train = train_X.values[train_index]
    X_val = train_X.values[test_index]

    y_train = train_y[train_index]
    y_val = train_y[test_index]

<b>original xgboost model

In [6]:
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
pred = xgb.predict_proba(X_val)
score = log_loss(y_val, pred)
print "The log loss is: " + str(score)

The log loss is: 0.641349608103


<b> another xgboost

In [10]:
xgb_original = XGBClassifier(max_depth=10, learning_rate=0.0825, subsample=0.85, colsample_bytree=0.8, 
                             min_child_weight=5.2475, objective='multi:softprob')
xgb_original.fit(X_train, y_train)
pred = xgb_original.predict_proba(X_val)
score = log_loss(y_val, pred)
print "The log loss is: " + str(score)

The log loss is: 0.49081880946


<b>fine-tune hyper-parameters of xgb

In [113]:
# the default setting is

# XGBClassifier(
#  learning_rate =0.1,
#  n_estimators=1000,
#  max_depth=5,
#  min_child_weight=1,
#  gamma=0,
#  subsample=0.8,
#  colsample_bytree=0.8,
#  nthread=4,
#  scale_pos_weight=1,
#  seed=27)

In [3]:
# firstly we set learning-rate as 0.1, and search the best n_estimator
param_test1 = {
 'n_estimators':range(100,1200,200)
 }
gsearch1 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, max_depth=5,
min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, nthread=4, scale_pos_weight=1, seed=27), 
param_grid = param_test1, scoring='neg_log_loss',n_jobs=4,iid=False, cv=3)
gsearch1.fit(train_X,train_y)

GridSearchCV(cv=3, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=100, nthread=4,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=27, silent=True, subsample=0.8),
       fit_params=None, iid=False, n_jobs=4,
       param_grid={'n_estimators': [100, 300, 500, 700, 900, 1100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_log_loss', verbose=0)

In [7]:
print gsearch1.best_params_
print gsearch1.best_score_
print gsearch1.grid_scores_

{'n_estimators': 700}
-0.47375621873
[mean: -0.56585, std: 0.00399, params: {'n_estimators': 100}, mean: -0.49265, std: 0.00492, params: {'n_estimators': 300}, mean: -0.47698, std: 0.00545, params: {'n_estimators': 500}, mean: -0.47376, std: 0.00546, params: {'n_estimators': 700}, mean: -0.47718, std: 0.00476, params: {'n_estimators': 900}, mean: -0.48339, std: 0.00481, params: {'n_estimators': 1100}]




In [6]:
# then, fix learning-rate as 0.1, n_estimator = 700(if we have time, we can fine-tune it more precisely), fine-tune max_depth and min_weight
param_test2 = {
 'max_depth':range(7,14,2),
 'min_child_weight':range(1,6,2)
}
gsearch2 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=700, gamma=0, subsample=0.8, colsample_bytree=0.8, nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test2, scoring='neg_log_loss',n_jobs=-1,iid=False, cv=3)
gsearch2.fit(train_X,train_y)

GridSearchCV(cv=3, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=700, nthread=4,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=27, silent=True, subsample=0.8),
       fit_params=None, iid=False, n_jobs=-1,
       param_grid={'max_depth': [7, 9, 11, 13], 'min_child_weight': [1, 3, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_log_loss', verbose=0)

In [9]:
print gsearch2.best_params_
print gsearch2.grid_scores_

{'max_depth': 7, 'min_child_weight': 5}
[mean: -0.48923, std: 0.00547, params: {'max_depth': 7, 'min_child_weight': 1}, mean: -0.47768, std: 0.00529, params: {'max_depth': 7, 'min_child_weight': 3}, mean: -0.47311, std: 0.00620, params: {'max_depth': 7, 'min_child_weight': 5}, mean: -0.52020, std: 0.00630, params: {'max_depth': 9, 'min_child_weight': 1}, mean: -0.49668, std: 0.00543, params: {'max_depth': 9, 'min_child_weight': 3}, mean: -0.48558, std: 0.00643, params: {'max_depth': 9, 'min_child_weight': 5}, mean: -0.54164, std: 0.00535, params: {'max_depth': 11, 'min_child_weight': 1}, mean: -0.51054, std: 0.00599, params: {'max_depth': 11, 'min_child_weight': 3}, mean: -0.49757, std: 0.00608, params: {'max_depth': 11, 'min_child_weight': 5}, mean: -0.55157, std: 0.00511, params: {'max_depth': 13, 'min_child_weight': 1}, mean: -0.51923, std: 0.00503, params: {'max_depth': 13, 'min_child_weight': 3}, mean: -0.50647, std: 0.00610, params: {'max_depth': 13, 'min_child_weight': 5}]




In [18]:
xgb_optimal = XGBClassifier(learning_rate =0.1, n_estimators=700, gamma=0, max_depth=7, min_child_weight=3, subsample=0.8, colsample_bytree=0.8, nthread=4, scale_pos_weight=1, seed=27, objective='multi:softprob')
xgb_optimal.fit(X_train, y_train)
pred = xgb_optimal.predict_proba(X_val)
score = log_loss(y_val, pred)
print "The log loss is: " + str(score)

The log loss is: 0.444085652611


<b>export

In [21]:
result = xgb_optimal.predict_proba(X_test)
columns = ["id","Class_1","Class_2","Class_3","Class_4","Class_5","Class_6","Class_7","Class_8","Class_9"]
rlt = np.zeros(result.shape[0] * 9).reshape((result.shape[0],9)).astype(float)

i = 0
for class_i in result:
    rlt[i] = class_i
    i += 1
    
r = []
i = 1
for class_i in rlt:
    p = [i] + list(map(str, class_i.tolist()))
    i += 1
    r.append(p)
r = np.array(r)

out = pd.DataFrame(r,columns = columns)
out.to_csv('result_Xgboost.csv',index = False)