<b> Coding by python2.7
   
authors: Qixiang PENG, Zizhao LI

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import log_loss
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier
%matplotlib inline



<b>load data

In [2]:
train = pd.read_csv('./data/train.csv')
# class between [1,9]
train_y = train['target'].apply(lambda s: int(s[-1:])).values
train_X = train.drop('id', axis=1)
train_X = train_X.drop('target', axis=1)
X_test = pd.read_csv('./data/test.csv')
X_test = X_test.drop('id', axis=1)

<b>use random forest

In [3]:
# split train set into 2 parts with same distribution: 80% train, 20% validation
sss1 = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
for train_index, test_index in sss1.split(train_X.values, train_y):
    X_train = train_X.values[train_index]
    X_val = train_X.values[test_index]

    y_train = train_y[train_index]
    y_val = train_y[test_index]

In [4]:
rf = RandomForestClassifier(n_estimators=900, max_depth=50, max_features=0.3, n_jobs=-1)
rf.fit(X_train, y_train)

carf = CalibratedClassifierCV(rf, method="isotonic", cv=5)
carf.fit(X_train, y_train)
pred1 = carf.predict_proba(X_val)
score = log_loss(y_val, pred1)
print "The log loss of random forest is: " + str(score)

The log loss of random forest is: 0.485162164708


<b>use xgboost

In [5]:
# split train set into 2 parts with same distribution: 80% train, 20% validation
sss2 = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
for train_index, test_index in sss2.split(train_X.values, train_y):
    X_train = train_X.values[train_index]
    X_val = train_X.values[test_index]

    y_train = train_y[train_index]
    y_val = train_y[test_index]

In [6]:
xgb = XGBClassifier(max_depth=10, learning_rate=0.0825, subsample=0.85, colsample_bytree=0.8, 
                             min_child_weight=5.2475, objective='multi:softprob')
xgb.fit(X_train, y_train)
pred2 = xgb.predict_proba(X_val)
score = log_loss(y_val, pred2)
print "The log loss of xgb is: " + str(score)

The log loss of xgb is: 0.49081880946


<b>use extra-tree

In [7]:
# split train set into 2 parts with same distribution: 80% train, 20% validation
sss3 = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
for train_index, test_index in sss3.split(train_X.values, train_y):
    X_train = train_X.values[train_index]
    X_val = train_X.values[test_index]

    y_train = train_y[train_index]
    y_val = train_y[test_index]

In [8]:
et = ExtraTreesClassifier(n_estimators=1000, max_depth=80, max_features=0.7, n_jobs=-1)
et.fit(X_train, y_train)
caet = CalibratedClassifierCV(et, method="isotonic", cv=5)
caet.fit(X_train, y_train)
pred3 = caet.predict_proba(X_val)
score = log_loss(y_val, pred3)
print "The log loss extra-tree is: " + str(score)

The log loss extra-tree is: 0.471821131449


<b>bagging

In [9]:
pred = (pred1 + pred1 + pred2) * 1.0 / 3
score = log_loss(y_val, pred)
print "The log loss bagging is: " + str(score)

The log loss bagging is: 0.473403001831


<b> expot

In [11]:
result1 = carf.predict_proba(X_test)
result2 = xgb.predict_proba(X_test.values)
result3 = caet.predict_proba(X_test)
result = (result1 + result2 + result3) * 1.0 / 3 
columns = ["id","Class_1","Class_2","Class_3","Class_4","Class_5","Class_6","Class_7","Class_8","Class_9"]
rlt = np.zeros(result.shape[0] * 9).reshape((result.shape[0],9)).astype(float)

i = 0
for class_i in result:
    rlt[i] = class_i
    i += 1
    
r = []
i = 1
for class_i in rlt:
    p = [i] + list(map(str, class_i.tolist()))
    i += 1
    r.append(p)
r = np.array(r)

out = pd.DataFrame(r,columns = columns)
out.to_csv('result_Bagging.csv',index = False)