In [2]:
import numpy as np
import matplotlib.pyplot as plt
import json
import pandas as pd
%matplotlib inline

# Concatenation of TF-IDF weighed WordVecs and DocVecs

## TF-IDF weighed WordVecs

In [8]:
Text_wordvecs = np.loadtxt('./model/X.txt')

In [9]:
Text_wordvecs = Text_wordvecs[1:, 100:]
Text_wordvecs.shape

(6335, 200)

## DocVecs

In [5]:
from gensim.models import Doc2Vec
dv = Doc2Vec.load('./model/dv_text')
Doc_wordvecs = np.asarray(dv.docvecs)

## Concatenate them

In [10]:
X = np.column_stack((Text_wordvecs, Doc_wordvecs))\
y = np.loadtxt('./model/y.txt')

In [13]:
X.shape, y.shape

((6335, 400), (6335,))

train : test : validation = 6 : 2 : 2

In [68]:
from sklearn.model_selection import train_test_split
seed = 3
X_train, X_test_valid, y_train, y_test_valid = train_test_split(X, y, test_size=0.4, random_state=seed)
X_test, X_valid, y_test, y_valid = train_test_split(X_test_valid, y_test_valid,
                                                     test_size=0.5, random_state=seed)
print('Training set size: {}, testing set size: {}, validation set size: {}'.format(X_train.shape[0],
                                                                                   X_test.shape[0],
                                                                                   X_valid.shape[0]))

Training set size: 3801, testing set size: 1267, validation set size: 1267


## Use basic classifier and tune their parameters

In [21]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from scipy.stats import randint as sp_randint, uniform

In [69]:
svc = SVC()
rf = RandomForestClassifier()
mlp = MLPClassifier(hidden_layer_sizes=(50, 100))

In [18]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

### 1) SVM

In [102]:
svc = SVC()

In [110]:
param = {
    'C': stats.uniform(1e-4, 10),
    'gamma': stats.uniform(0.002, 0.005)
}
n_iter_search = 50
seed = 3
scoring = {
    'accuracy': metrics.make_scorer(metrics.accuracy_score),
    'precision': metrics.make_scorer(metrics.precision_score),
    'recall': metrics.make_scorer(metrics.recall_score),
    'f1': metrics.make_scorer(metrics.f1_score)
}
random_search = RandomizedSearchCV(svc, param_distributions=param,
                                   n_iter=n_iter_search, return_train_score=True, random_state=seed,
                                scoring=scoring, refit='f1', n_jobs=4)
random_search.fit(X_valid, y_valid)

RandomizedSearchCV(cv=None, error_score='raise',
          estimator=SVC(C=5.508079025745754, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.005540739113090524,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
          fit_params=None, iid=True, n_iter=50, n_jobs=4,
          param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002229AA67240>, 'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002229AA67550>},
          pre_dispatch='2*n_jobs', random_state=3, refit='f1',
          return_train_score=True,
          scoring={'accuracy': make_scorer(accuracy_score), 'precision': make_scorer(precision_score), 'recall': make_scorer(recall_score), 'f1': make_scorer(f1_score)},
          verbose=0)

In [112]:
from operator import itemgetter
def report(grid_scores, n_top=5):
    # sort scores based on metric so we can grab the n_top models
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    # iterate over the n_top models
    for i in range(n_top):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              grid_scores['mean_test_score'][i],
              grid_scores['std_test_score'][i]))
        print("Parameters: {0}".format(grid_scores['params'][i]))
        print("")
report(random_search.cv_results_)

Model with rank: 1


KeyError: 'mean_test_score'

In [113]:
random_search.cv_results_

{'mean_fit_time': array([ 0.65796479,  1.05169725,  1.05017813,  1.0977122 ,  1.14495063,
         1.18200533,  1.12596854,  1.04374695,  1.06359529,  1.04174153,
         1.03673609,  1.03706964,  1.06058613,  1.06775673,  1.07376083,
         1.04974349,  1.17476678,  1.08521557,  1.05975024,  1.02906291,
         1.0389111 ,  1.05374742,  1.05858231,  1.06625644,  1.04981518,
         1.03990237,  1.03089801,  1.03973564,  1.05141346,  1.06325324,
         1.06742374,  1.07559474,  1.05258044,  0.9763755 ,  1.081146  ,
         1.22220008,  1.08343498,  1.07976405,  1.04807591,  1.03590266,
         1.04142237,  1.08532174,  1.03973961,  1.04774245,  1.04707829,
         1.06675879,  1.03640056,  1.06208507,  1.04107618,  1.01839073]),
 'mean_score_time': array([ 1.51123977,  1.8199176 ,  1.78826753,  1.90168516,  1.94037851,
         2.11316299,  1.71237961,  1.78005203,  1.78176483,  1.67418758,
         1.77159206,  1.59513903,  1.71355025,  1.63466024,  1.75324893,
         1.71

In [105]:
svc = SVC(C=5.5080790257457544, gamma=0.0055407391130905242).fit(X_train, y_train)
y_predict = svc.predict(X_test)

In [106]:
from sklearn import metrics
accuracy = metrics.accuracy_score(y_test, y_predict)
precision = metrics.precision_score(y_test, y_predict)
recall = metrics.recall_score(y_test, y_predict)
f1 = metrics.f1_score(y_test, y_predict)
print(accuracy, precision, recall, f1)

0.782951854775 0.942065491184 0.597444089457 0.731182795699


In [35]:
from scipy import stats

In [37]:
stats.uniform(0.1, 0.5)

<scipy.stats._distn_infrastructure.rv_frozen at 0x2229a775518>

array([  1.00000000e-02,   3.83118685e-02,   1.46779927e-01,
         5.62341325e-01,   2.15443469e+00,   8.25404185e+00,
         3.16227766e+01,   1.21152766e+02,   4.64158883e+02,
         1.77827941e+03,   6.81292069e+03,   2.61015722e+04,
         1.00000000e+05])

In [1]:
Data augementation

SyntaxError: invalid syntax (<ipython-input-1-ddbdf58eb61b>, line 1)