In [1]:
import os, math
import numpy as np
import pandas as pd
import seaborn as sns
import helpers
import xgboost as xgb

%matplotlib inline
import time
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd

from sklearn import svm
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import log_loss


In [2]:
y = pd.read_csv('./data/training_variants')
X = pd.read_csv('./data/training_text', sep="\|\|", engine="python", skiprows=1, names=["ID", "Text"])
text_train, text_test, variants_train, variants_test = train_test_split(X, y, test_size=0.2, \
                                                                        random_state=0, stratify=None)

train_full = variants_train.merge(text_train, how='inner', on='ID')
test_full = variants_test.merge(text_test, how='inner', on='ID')

print(train_full.shape)
print(test_full.shape)

  X = pd.read_csv('./data/training_text', sep="\|\|", engine="python", skiprows=1, names=["ID", "Text"])


(2656, 5)
(665, 5)


Use TF-IDF to Vectorize the texts， a feature selector, then SVM one-vs-all classification. 

In [4]:
start_time = time.time()

##Pipeline
tfidf = TfidfVectorizer(
    min_df=1, max_features=16000, strip_accents='unicode',lowercase =True,
    analyzer='word', use_idf=True, 
    smooth_idf=True, sublinear_tf=True, stop_words = 'english')
ffilter = SelectKBest(mutual_info_classif, k=3000)
clf = OneVsRestClassifier(svm.SVC(C=1.0, kernel='linear', probability=True,
                                 random_state=0))
##Data and labels
y_train = train_full["Class"]
X_train = ffilter.fit_transform(tfidf.fit_transform(train_full["Text"]), y_train)

y_test = test_full["Class"]
X_test = ffilter.transform(tfidf.transform(test_full["Text"]))

y_train_bi = label_binarize(train_full["Class"], classes=range(1, 10))
y_test_bi = label_binarize(test_full["Class"], classes=range(1, 10))

print("--- %s seconds ---" % (time.time() - start_time))

--- 86.05518007278442 seconds ---


In [5]:
params = {
        'eta': 0.03333,
        'max_depth': 4,
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'num_class': 9,
        'seed': 0,
        'silent': True
    }

watchlist = [(xgb.DMatrix(X_train, y_train-1), 'train'), (xgb.DMatrix(X_test, y_test-1), 'valid')]
model = xgb.train(params, xgb.DMatrix(X_train, y_train-1), 1000,  watchlist, verbose_eval=50, early_stopping_rounds=100)

XGBoostError: b'[16:43:41] src/objective/multiclass_obj.cc:75: Check failed: label_error >= 0 && label_error < nclass SoftmaxMultiClassObj: label must be in [0, num_class), num_class=9 but found 9 in label.\n\nStack trace returned 2 entries:\n[bt] (0) 0   libxgboost.dylib                    0x000000011b921939 _ZN4dmlc15LogMessageFatalD1Ev + 41\n[bt] (1) 1   libstdc++.6.dylib                   0x000000011bc75fc0 _ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE + 16\n'

In [None]:
#Evaluate
y_test_prob = model.predict(xgb.DMatrix(X_test), ntree_limit=model.best_ntree_limit), labels = list(range(9))
log_loss(y_test, y_test_prob, eps=1e-15, normalize=True, labels=range(1, 10))

In [None]:
helpers.plot_roc_curve(y_test_bi, y_test_prob)

In [None]:
start_time = time.time()

X_submit = ffilter.transform(tfidf.transform(pd.read_csv('./data/test_text', sep="\|\|", engine="python", skiprows=1, names=["ID", "Text"])["Text"])
y_submit_prob = model.predict(xgb.DMatrix(X_submit), ntree_limit=model.best_ntree_limit), labels = list(range(9))

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
with open('./data/submission_xgb.csv', 'w') as f:
    f.write('ID,class1,class2,class3,class4,class5,class6,class7,class8,class9\n')
    for i in range(y_submit_prob.shape[0]):
        f.write(str(i)+',')
        for j in range(y_submit_prob.shape[1]):
            f.write(str(y_submit_prob[i][j]))
            if j<8:
                f.write(',')
        f.write('\n')