In [3]:
import os, math
import numpy as np
import pandas as pd
import seaborn as sns
import helpers

%matplotlib inline
import time
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd

from sklearn import svm
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, mutual_info_classif, chi2, f_classif
from sklearn.metrics import log_loss

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV


In [4]:
text, variants = helpers.get_training('./input/training_variants', './input/training_text')
#train_full, val_full = train_test_split(variants.merge(text, how='inner', on='ID'))
train_full = variants.merge(text, how='inner', on='ID')

#this is 20% of the labeled data
text, variants = helpers.get_test('./input/training_variants', './input/training_text')
test_full = variants.merge(text, how='inner', on='ID')

print(train_full.shape)
#print(val_full.shape)
print(test_full.shape)

(2656, 5)
(665, 5)


Use TF-IDF to Vectorize the texts， a feature selector, then SVM one-vs-all classification. 

In [5]:
start_time = time.time()

##Pipeline
tfidf = TfidfVectorizer(
    min_df=1, max_features=16000, strip_accents='unicode',lowercase =True,
    analyzer='word', use_idf=True, 
    smooth_idf=True, sublinear_tf=True, stop_words = 'english')
ffilter = SelectKBest(mutual_info_classif, k=500)
#ffilter = SelectKBest(chi2, k=500)
#ffilter = SelectKBest(f_classif, k=500)
#ffilter = TruncatedSVD(n_components=100)
#ffilter = LinearDiscriminantAnalysis(n_components=100)

##Data and labels
y_train = train_full["Class"]-1
X_train = ffilter.fit_transform(tfidf.fit_transform(train_full["Text"]), y_train)

#y_val = val_full["Class"]-1
#X_val = ffilter.transform(tfidf.transform(val_full["Text"]))

y_test = test_full["Class"]-1
X_test = ffilter.transform(tfidf.transform(test_full["Text"]))

y_train_bi = label_binarize(y_train, classes=range(9))
#y_val_bi = label_binarize(y_val, classes=range(9))
y_test_bi = label_binarize(y_test, classes=range(9))


In [7]:
##Fitting
parameters = {
    "estimator__C": [10],
    "estimator__kernel": ['linear']
    #"estimator__degree": [2, 3]
}

clf = GridSearchCV(OneVsRestClassifier(svm.SVC(probability=True, class_weight='balanced')), param_grid=parameters, scoring='neg_log_loss', n_jobs=-1)
clf.fit(X_train, y_train_bi)

GridSearchCV(cv=None, error_score='raise',
       estimator=OneVsRestClassifier(estimator=SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          n_jobs=1),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'estimator__C': [10], 'estimator__kernel': ['linear']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_log_loss', verbose=0)

In [None]:
pd.DataFrame.from_dict(clf.cv_results_)

In [None]:
#Evaluate
print(log_loss(y_train, clf.predict_proba(X_train), eps=1e-15, normalize=True, labels=range(9)))
#print(log_loss(y_val, clf.predict_proba(X_val), eps=1e-15, normalize=True, labels=range(9)))
print(log_loss(y_test, clf.predict_proba(X_test), eps=1e-15, normalize=True, labels=range(9)))

In [None]:
helpers.plot_roc_curve(y_test_bi, y_test_prob)

In [None]:
start_time = time.time()


X_submit = ffilter.transform(tfidf.transform(\
    pd.read_csv('./data/test_text', sep="\|\|", engine="python", skiprows=1, names=["ID", "Text"])["Text"]))

y_submit_prob = clf.predict_proba(X_submit) 

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
with open('./data/submission.csv', 'w') as f:
    f.write('ID,class1,class2,class3,class4,class5,class6,class7,class8,class9\n')
    for i in range(y_submit_prob.shape[0]):
        f.write(str(i)+',')
        for j in range(y_submit_prob.shape[1]):
            f.write(str(y_submit_prob[i][j]))
            if j<8:
                f.write(',')
        f.write('\n')

In [None]:
Xtr_test = tfidf.transform(X_test)

X_train_tfidf = np.array(ffilter.transform(Xtr_train).todense())
X_test_tfidf = np.array(ffilter.transform(Xtr_test).todense())

In [None]:
np.save('./data/X_train_tfidf', X_train_tfidf)
np.save('./data/X_test_tfidf', X_test_tfidf)
np.save('./data/y_train_tfidf', y_train_bi)
np.save('./data/y_test_tfidf', y_test_bi)