In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn import preprocessing 
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import cross_validation
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

  from numpy.core.umath_tests import inner1d


In [2]:
# Load the pre-extracted feature stores for the train and test splits.
# `.item()` unwraps the single object stored in each .npy file; the resulting
# objects are later indexed by sample hash name.
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects -- only
# load files that come from a trusted source.
train_set, test_set = (
    np.load(features_path, allow_pickle=True).item()
    for features_path in (
        '/home/fesian/contest_workspace/chunyu/dataset/features/resnet18_512d/train_wo_eval/features_mat.npy',
        '/home/fesian/contest_workspace/chunyu/dataset/features/resnet18_512d/test_wo_eval/features_mat.npy',
    )
)

In [3]:
# Load the per-split sample index arrays, in the order train, test, eval.
# Train/eval entries are later read as (hash name, label) via [0] and [1];
# test entries are used directly as hash names.
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects -- only
# load files that come from a trusted source.
train_index, test_index, eval_index = (
    np.load(index_path, allow_pickle=True)
    for index_path in (
        '/home/fesian/contest_workspace/chunyu/dataset/ctc_labels/index_files/train_index.npy',
        '/home/fesian/contest_workspace/chunyu/dataset/ctc_labels/index_files/test_index.npy',
        '/home/fesian/contest_workspace/chunyu/dataset/ctc_labels/index_files/eval_index.npy',
    )
)

In [11]:
def _stack_features(index, feature_store, with_labels=True, feature_dim=512):
    """Assemble a dense feature matrix (and optional label vector) for one split.

    Parameters
    ----------
    index : sequence
        One entry per sample. When ``with_labels`` is true each entry is read
        as ``entry[0]`` (hash name) and ``entry[1]`` (label); otherwise the
        entry itself is the hash name.
    feature_store : mapping
        Maps a hash name to a container whose element ``[0]`` is that sample's
        feature vector. The vector must fit a row of width ``feature_dim``
        (presumably it has exactly that length -- TODO confirm).
    with_labels : bool, optional
        Whether ``index`` carries labels. Defaults to True.
    feature_dim : int, optional
        Number of columns in the feature matrix. Defaults to 512.

    Returns
    -------
    tuple
        ``(data, labels)`` where ``data`` is a float array of shape
        ``(len(index), feature_dim)`` and ``labels`` is a float array of shape
        ``(len(index),)``, or ``None`` when ``with_labels`` is false.

    Raises
    ------
    KeyError
        If a hash name is missing from ``feature_store`` (for dict-like stores).
    """
    data = np.zeros(shape=(len(index), feature_dim))
    labels = np.zeros(shape=(len(index))) if with_labels else None
    for row, entry in enumerate(index):
        if with_labels:
            hashname = entry[0]
            labels[row] = entry[1]
        else:
            hashname = entry
        # Element [0] of the stored value is the feature vector for this sample.
        data[row, :] = feature_store[hashname][0]
    return data, labels


traindata, trainlabel = _stack_features(train_index, train_set)

# NOTE(review): eval features are looked up in `train_set`, exactly as before;
# confirm that the train feature store really contains the eval hash names.
evaldata, evallabel = _stack_features(eval_index, train_set)

# The test index carries bare hash names and no labels.
testdata = _stack_features(test_index, test_set, with_labels=False)[0]

In [12]:
# Wrap the feature matrices in XGBoost DMatrix containers.
dtrain = xgb.DMatrix(traindata, label=trainlabel)
deval = xgb.DMatrix(evaldata, label=evallabel)
# No `testlabel` is defined anywhere in the visible notebook (the test split
# is built from features only), so passing `label=testlabel` raised NameError
# and `dtest` was never created. Labels are optional for a DMatrix that is
# only used for prediction.
dtest = xgb.DMatrix(testdata)

In [6]:
# Train one multi-class gradient-boosted tree model at a fixed depth and record
# its accuracy on the eval split as an (accuracy, depth) pair in `accs`.
accs = []
depth = 6

dtrain = xgb.DMatrix(traindata, label=trainlabel)
deval = xgb.DMatrix(evaldata, label=evallabel)

params = {
    'booster': 'gbtree',
    # Multi-class classification problem.
    'objective': 'multi:softmax',
    # Number of classes; used together with multi:softmax.
    'num_class': 313,
    # Controls post-pruning; larger is more conservative (typically 0.1 or 0.2).
    'gamma': 0.2,
    # Depth of each tree; larger values overfit more easily.
    'max_depth': depth,
    # L2 regularisation on the weights; larger values make overfitting less likely.
    'lambda': 10,
    # Random subsampling of the training samples.
    'subsample': 0.5,
    # Column subsampling applied when each tree is built.
    'colsample_bytree': 0.7,
    'min_child_weight': 5,
    # 1 suppresses run-time messages; setting it to 0 is preferable.
    # NOTE(review): newer XGBoost releases use `verbosity` instead -- confirm
    # against the installed version.
    'silent': 1,
    # Plays the role of a learning rate.
    'eta': 0.01,
    'seed': 200,
    # Number of CPU threads.
    'nthread': 4,
    # GPU histogram training settings.
    'gpu_id': 0,
    'max_bin': 25,
    'tree_method': 'gpu_hist',
}

# Both splits are evaluated after every boosting round.
watchlist = [(deval, 'eval'), (dtrain, 'train')]
n_round = 120
booster = xgb.train(
    params,
    dtrain,
    num_boost_round=n_round,
    evals=watchlist,
)

# Fraction of eval samples whose predicted class equals the true label.
y_predicted = booster.predict(deval)
acc = (y_predicted == evallabel).sum() / len(evallabel)
accs.append((acc, depth))

[0]	eval-merror:0.987	train-merror:0.9831
[1]	eval-merror:0.974	train-merror:0.940516
[2]	eval-merror:0.923	train-merror:0.839893
[3]	eval-merror:0.845	train-merror:0.72515
[4]	eval-merror:0.819	train-merror:0.631977
[5]	eval-merror:0.793	train-merror:0.580387
[6]	eval-merror:0.761	train-merror:0.52724
[7]	eval-merror:0.746	train-merror:0.460084
[8]	eval-merror:0.714	train-merror:0.388259
[9]	eval-merror:0.696	train-merror:0.352791
[10]	eval-merror:0.678	train-merror:0.296976
[11]	eval-merror:0.644	train-merror:0.258172
[12]	eval-merror:0.608	train-merror:0.236602
[13]	eval-merror:0.598	train-merror:0.211808
[14]	eval-merror:0.578	train-merror:0.183344
[15]	eval-merror:0.56	train-merror:0.171114
[16]	eval-merror:0.555	train-merror:0.152657
[17]	eval-merror:0.555	train-merror:0.13598
[18]	eval-merror:0.543	train-merror:0.129197
[19]	eval-merror:0.527	train-merror:0.12386
[20]	eval-merror:0.515	train-merror:0.124416
[21]	eval-merror:0.517	train-merror:0.110518
[22]	eval-merror:0.506	trai

In [36]:
# Re-score the trained booster on the eval split: accuracy is the fraction of
# predictions that equal the true label.
y_predicted = booster.predict(deval)
acc = (y_predicted == evallabel).sum() / len(evallabel)

In [37]:
# Show the most recently computed eval accuracy as the cell output.
acc

0.6

In [25]:
# Load the stored codebook and invert it in one step, so that `codebook` maps
# each stored value back to its key. The inverted mapping is later looked up
# with integer class predictions (presumably yielding the word for a class id).
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects -- only
# load files that come from a trusted source.
codebook = {
    class_id: entry
    for entry, class_id in np.load(
        '/home/fesian/contest_workspace/chunyu/dataset/ctc_labels/word_codebook.npy',
        allow_pickle=True,
    ).item().items()
}

In [15]:
# Predict one value per test sample with the trained booster. The booster is
# trained with the 'multi:softmax' objective, so each value is presumably a
# class index (it is cast with int() before the codebook lookup).
pred = booster.predict(dtest)

In [29]:
# Build the submission rows: one (test index entry, codebook entry) pair per
# prediction, with the prediction cast to int for the codebook lookup.
filecsv = [
    (sample_id, codebook[int(class_id)])
    for sample_id, class_id in zip(test_index, pred)
]

In [31]:
# Convert the list of row tuples to a 2-D array, then wrap it in a DataFrame
# ready for CSV export.
filecsv = pd.DataFrame(np.array(filecsv))

In [34]:
# Write the predictions as a bare two-column CSV: no header row and no index
# column.
filecsv.to_csv(
    "./predict_ensemble_2.csv",
    index=False,
    index_label=False,
    header=False,
)