In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import gensim
from operator import itemgetter
from scipy import stats
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
%matplotlib inline
%load_ext rpy2.ipython



Using the hypertuned Doc2Vec model, five classifiers are used here－SVM, Random Forests, XGBoost, Multiple-Layer Perceptron and Logistic Regression. Each of the five models is tuned independently. The final label of the test data is determined by voting among the five models.

In [35]:
def report_classify_result(y_pred, y_test):
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    print('Accuracy: {:.3f} | Precision: {:.3f} | Recall: {:.3f} | F1: {:.3f}'.format(accuracy,
                                                                                     precision,
                                                                                     recall,
                                                                                     f1))
def vote(votes):
    if np.sum(votes) >= 0.5:
        return 1
    return 0

def report(grid_scores, n_top=5):
    # sort scores based on metric so we can grab the n_top models
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    # iterate over the n_top models
    for i in range(n_top):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              grid_scores['mean_test_score'][i],
              grid_scores['std_test_score'][i]))
        print("Parameters: {0}".format(grid_scores['params'][i]))
        print("")

In [7]:
with open('./model/cleaned_df.json', 'r', encoding='utf8') as f:
    cleaned_df = json.loads(f.read())
y = np.loadtxt('./model/y.txt')
cleaned_df = pd.DataFrame(cleaned_df)

In [8]:
## Load the trained model
from gensim.models import Doc2Vec
dv_text = Doc2Vec.load('./model/dv_text')
dv_title = Doc2Vec.load('./model/dv_title')

In [9]:
X = np.column_stack((dv_title.docvecs, dv_text.docvecs))
X.shape

(6335, 300)

In [10]:
from sklearn.model_selection import train_test_split
seed = 3
X_train, X_test_valid, y_train, y_test_valid = train_test_split(X, y, test_size=0.3, random_state=seed)
X_test, X_valid, y_test, y_valid = train_test_split(X_test_valid, y_test_valid,
                                                    test_size=0.33, random_state=seed)

### SVM

In [23]:
from sklearn.svm import SVC
svc = SVC()

In [24]:
param = {
    'C': 10.0 ** np.arange(-2, 9),
    'gamma': 10.0 ** np.arange(-5, 7)
}
random_search = GridSearchCV(svc, param_grid=param, return_train_score=True,
                             n_jobs=4, scoring=metrics.make_scorer(metrics.f1_score))
random_search.fit(X_valid, y_valid)

KeyboardInterrupt: 

In [65]:
print(random_search.best_estimator_)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [25]:
best_params = {'C': 1, 'gamma': 0.001}
svc.set_params(**best_params)
svc.fit(X_train, y_train)
y_pred_svc = svc.predict(X_test)

In [26]:
report_classify_result(y_pred_svc, y_test)

Accuracy: 0.879 | Precision: 0.857 | Recall: 0.906 | F1: 0.881


### XGBoost

In [27]:
from xgboost import XGBClassifier
XGBmodel = XGBClassifier(booster='gbtree', n_jobs=2, objective='binary:logistic', learning_rate = 0.05, 
                         max_depth = 3, min_child_weight = 2, n_estimators = 3700)
XGBmodel.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=3, min_child_weight=2, missing=None, n_estimators=3700,
       n_jobs=2, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [28]:
y_pred_xbg = XGBmodel.predict(X_test)
report_classify_result(y_pred_xbg, y_test)

Accuracy: 0.905 | Precision: 0.897 | Recall: 0.912 | F1: 0.904


### MLP

In [29]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(max_iter=500)

In [76]:
param = {
    'hidden_layer_sizes': [(800, 1000), (1000, 1200)],
    'alpha': np.logspace(-4, 0, 5)
}
random_search = GridSearchCV(mlp, param_grid=param, return_train_score=True,
                             n_jobs=4, scoring=metrics.make_scorer(metrics.f1_score))
random_search.fit(X_valid, y_valid)

GridSearchCV(cv=None, error_score='raise',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'hidden_layer_sizes': [(800, 1000), (1000, 1200)], 'alpha': array([  1.00000e-04,   1.00000e-03,   1.00000e-02,   1.00000e-01,
         1.00000e+00])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(f1_score), verbose=0)

In [78]:
print(random_search.best_estimator_)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(800, 1000), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)


In [30]:
best_params = {
    'alpha': 0.0001,
    'hidden_layer_sizes': (800, 1000)
}
mlp.set_params(**best_params)
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_test)

In [31]:
report_classify_result(y_pred_mlp, y_test)

Accuracy: 0.906 | Precision: 0.894 | Recall: 0.917 | F1: 0.906


### Random Forests

In [3]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()

In [15]:
param = {
    'n_estimators': stats.randint(5, 50),
    'max_depth': stats.randint(1, 8),
    'max_features': stats.randint(1, 10)
}
n_iter_search = 100
seed = 3
random_search = RandomizedSearchCV(rfc, param_distributions=param,
                                   n_iter=n_iter_search, return_train_score=True, random_state=seed,
                                   scoring=metrics.make_scorer(metrics.f1_score), n_jobs=4)
random_search.fit(X_valid, y_valid)

RandomizedSearchCV(cv=None, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=100, n_jobs=4,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001E10CE79940>, 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001E10CE79240>, 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001E10CC70828>},
          pre_dispatch='2*n_jobs', random_state=3, refit=True,
          return_train_score=True, scoring=make_scorer(f1_score),
          verbos

In [16]:
print(random_search.best_estimator_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features=6, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=44, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [17]:
best_params = {
    'max_depth': 6,
    'max_features': 6,
    'n_estimators': 44
}
rfc.fit(X_train, y_train)
y_pred_rfc = rfc.predict(X_test)
report_classify_result(y_pred_rfc, y_test)

Accuracy: 0.797 | Precision: 0.831 | Recall: 0.738 | F1: 0.782


### Logistic Regression

In [18]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

In [20]:
param = {
    'C': np.logspace(-4, 4, 9, base=2)
}
grid_search = GridSearchCV(lr, param_grid=param, return_train_score=True,
                           scoring=metrics.make_scorer(metrics.f1_score), n_jobs=4)
grid_search.fit(X_valid, y_valid)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'C': array([  0.0625,   0.125 ,   0.25  ,   0.5   ,   1.    ,   2.    ,
         4.    ,   8.    ,  16.    ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(f1_score), verbose=0)

In [21]:
print(grid_search.best_estimator_)

LogisticRegression(C=0.0625, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)


In [22]:
best_params = {
    'C': 0.0625
}
lr.set_params(**best_params)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
report_classify_result(y_pred_lr, y_test)

Accuracy: 0.889 | Precision: 0.877 | Recall: 0.901 | F1: 0.889


## Vote

In [37]:
weights = np.asarray([0.889, 0.906, 0.782, 0.881, 0.904])
weights = weights / np.sum(weights)
weights

array([ 0.20380559,  0.20770289,  0.17927556,  0.20197157,  0.20724438])

In [38]:
Y_pred = np.asarray([0.20380559*y_pred_lr, 0.20770289*y_pred_mlp,
                     0.17927556*y_pred_rfc, 0.20197157*y_pred_svc, 0.20724438*y_pred_xbg])
Y_pred.shape

(5, 1273)

In [39]:
y_pred = np.zeros(Y_pred.shape[1])
for i in range(Y_pred.shape[1]):
    y_pred[i] = vote(Y_pred[:, i])

In [40]:
report_classify_result(y_pred, y_test)

Accuracy: 0.909 | Precision: 0.905 | Recall: 0.911 | F1: 0.908
