In [91]:
from text_embedder import TextEmbedder
from gensim import corpora, models
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
import random
import pickle
import nltk
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from datetime import datetime
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

In [92]:
# load pre-trained data
business = pd.read_csv('../data/chinese_business_clean.csv')
reviews = pd.read_csv('../data/chinese_reviews_clean_offsets.csv')
reviews['date_tuple'] = [eval(i) for i in reviews['date_tuple']]

# load gensim model
lda =  models.LdaModel.load('../data/gensim/lda.model')
dictionary = corpora.Dictionary.load('../data/gensim/chinsese_dict.dict')

# load idf matrices
with open('../data/u_idf.pickle', 'rb') as f:
    uidf_data = pickle.load(f)
with open('../data/b_idf.pickle', 'rb') as f:
    bidf_data = pickle.load(f)

In [34]:
embedder = TextEmbedder(model = lda, dictionary = dictionary, user_idf = uidf_data, business_idf = bidf_data)

In [35]:
# label data, try to predict simple labels -- positive(1), negative(-1) or average(0)
def labels(offsets):
    if offsets < 0.0:
        return -1
    else:
        return int(offsets > 0.0)

In [430]:
# try basic methods with January of 2013 - 2017
def modified_split(df, year_month, embedder, t_size = 0.2, enum = 0, binary = False):
    # select regions
    data = df[df['date_tuple'] == year_month]
    if binary:
        data = data[data['offset'] != 0]
    # create labels
    label = data['offset'].values
    if enum == 0: 
        embed = np.array([embedder.augmented_embed_text(t) for t in data['text'].values])
    elif enum == 1: 
        embed = np.array([embedder.user_tfidf_embed(t, u) for t, u in zip(data['test'].values, data['user_id'].values)])
    elif enum == 2: 
        embed = np.array([embedder.user_tf_business_idf(t, b) for t in zip(data['test'].values, data['business_id'].values)])
    elif enum == 3: 
        embed = np.array([embedder.user_tfidf_business_idf(t, u, b) for t, u, b in zip(data['test'].values, data['user_id'].values, data['business_id'].values)])
    elif enum == 4: 
        embed = np.array([embedder.embed(t) for t in zip(data['test'].values)])
    elif enum == 5:
        embed = np.array([embedder.embed_sent(t) for t in zip(data['test'].values)])
    else:
        print ('enum {} is not supported'.format(enum))
        return None
    return embed, label

In [431]:
%time x0, y0 = modified_split(reviews, (2013, 1), embedder, 0)

CPU times: user 19.9 s, sys: 89.4 ms, total: 20 s
Wall time: 18.9 s


In [332]:
%time x1, y1 = modified_split(reviews, (2013, 1), embedder, 1)

CPU times: user 20 s, sys: 80 ms, total: 20 s
Wall time: 18.9 s


In [333]:
%time x2, y2 = modified_split(reviews, (2013, 1), embedder, 2)

CPU times: user 19.4 s, sys: 59.8 ms, total: 19.4 s
Wall time: 18.3 s


In [334]:
%time x3, y3 = modified_split(reviews, (2013, 1), embedder, 3)

CPU times: user 19 s, sys: 29.6 ms, total: 19 s
Wall time: 17.8 s


## Classification - xgb

In [93]:
labels_v = np.vectorize(labels)

In [456]:
def cmat_to_accuracy(mat):
    size = len(mat)
    total = sum(sum(mat))
    correct = 0
    for i in range(size):
        correct += mat[i,i]
    return correct/total

### Baseline - enbed

In [449]:
%time base_x, base_y = modified_split(reviews, (2013, 1), embedder, 4)

CPU times: user 19.8 s, sys: 86.5 ms, total: 19.9 s
Wall time: 18.8 s


In [450]:
X_train, X_test, y_train, y_test = train_test_split(base_x, base_y, test_size=0.2)

In [451]:
y_train = labels_v(y_train)
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [452]:
y_pred = model.predict(X_test)

In [453]:
confusion_matrix(y_train, model.predict(X_train))

array([[187, 106,  14],
       [  6, 481,  13],
       [  9, 100, 198]])

In [454]:
confusion_matrix(y_pred, labels_v(y_test))

array([[ 11,  22,  20],
       [ 33, 105,  53],
       [  7,  14,  14]])

In [457]:
cmat_to_accuracy(confusion_matrix(y_pred, labels_v(y_test)))

0.46594982078853048

In [424]:
%time base_x_b, base_y_b = modified_split(reviews, (2013, 1), embedder, 4, binary = True)

CPU times: user 11.1 s, sys: 13.9 ms, total: 11.1 s
Wall time: 10.5 s


In [458]:
X_train, X_test, y_train, y_test = train_test_split(base_x_b, base_y_b, test_size=0.2)

In [459]:
y_train = labels_v(y_train)
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [460]:
y_pred = model.predict(X_test)

In [461]:
confusion_matrix(y_train, model.predict(X_train))

array([[253,  29],
       [ 20, 299]])

In [462]:
confusion_matrix(y_pred, labels_v(y_test))

array([[35, 28],
       [41, 47]])

In [463]:
cmat_to_accuracy(confusion_matrix(y_pred, labels_v(y_test)))

0.54304635761589404

### Baseline - enbed_sent

In [433]:
%time base_s_x, base_s_y = modified_split(reviews, (2013, 1), embedder, 5)

CPU times: user 20 s, sys: 101 ms, total: 20.1 s
Wall time: 19 s


In [464]:
X_train, X_test, y_train, y_test = train_test_split(base_s_x, base_s_y, test_size=0.2)

In [465]:
y_train = labels_v(y_train)
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [466]:
y_pred = model.predict(X_test)

In [467]:
confusion_matrix(y_train, model.predict(X_train))

array([[144, 106,  26],
       [  3, 512,   9],
       [  3, 105, 206]])

In [468]:
confusion_matrix(y_pred, labels_v(y_test))

array([[ 4, 11,  3],
       [57, 93, 60],
       [21, 13, 17]])

In [469]:
cmat_to_accuracy(confusion_matrix(y_pred, labels_v(y_test)))

0.40860215053763443

In [440]:
%time base_s_x_b, base_s_y_b = modified_split(reviews, (2013, 1), embedder, 5, binary = True)

CPU times: user 11.8 s, sys: 48.4 ms, total: 11.8 s
Wall time: 11.2 s


In [470]:
X_train, X_test, y_train, y_test = train_test_split(base_s_x_b, base_s_y_b, test_size=0.2)

In [471]:
y_train = labels_v(y_train)
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [472]:
y_pred = model.predict(X_test)

In [473]:
confusion_matrix(y_train, model.predict(X_train))

array([[261,  28],
       [ 24, 288]])

In [474]:
confusion_matrix(y_pred, labels_v(y_test))

array([[43, 28],
       [26, 54]])

In [475]:
cmat_to_accuracy(confusion_matrix(y_pred, labels_v(y_test)))

0.64238410596026485

### Embedding 1

In [476]:
X_train, X_test, y_train, y_test = train_test_split(x0, y0, test_size=0.2)

In [477]:
y_train = labels_v(y_train)
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [478]:
y_pred = model.predict(X_test)

In [479]:
confusion_matrix(y_train, model.predict(X_train))

array([[173,  94,  13],
       [  7, 514,   2],
       [  3, 108, 200]])

In [480]:
confusion_matrix(y_pred, labels_v(y_test))

array([[16, 17, 16],
       [49, 84, 56],
       [13, 17, 11]])

In [481]:
cmat_to_accuracy(confusion_matrix(y_pred, labels_v(y_test)))

0.39784946236559138

In [482]:
y_train = labels_v(y_train)
model = XGBClassifier(subsample = 0.5, colsample_bytree = 0.5)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.5, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.5)

In [483]:
y_pred = model.predict(X_test)

In [484]:
confusion_matrix(y_train, model.predict(X_train))

array([[172,  95,  13],
       [  5, 512,   6],
       [  5, 104, 202]])

In [485]:
confusion_matrix(y_pred, labels_v(y_test))

array([[17, 12, 11],
       [48, 87, 56],
       [13, 19, 16]])

In [486]:
cmat_to_accuracy(confusion_matrix(y_pred, labels_v(y_test)))

0.43010752688172044

### Embedding 2

In [487]:
X_train, X_test, y_train, y_test = train_test_split(x1, y1, test_size=0.2)

In [488]:
y_train = labels_v(y_train)
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [489]:
y_pred = model.predict(X_test)

In [490]:
confusion_matrix(y_train, model.predict(X_train))

array([[179,  92,  25],
       [  3, 483,   8],
       [  9, 105, 210]])

In [491]:
confusion_matrix(y_pred, labels_v(y_test))

array([[ 10,  17,  18],
       [ 41, 103,  37],
       [ 11,  27,  15]])

In [492]:
cmat_to_accuracy(confusion_matrix(y_pred, labels_v(y_test)))

0.45878136200716846

In [493]:
y_train = labels_v(y_train)
model = XGBClassifier(subsample = 0.5, colsample_bytree = 0.5)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.5, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.5)

In [494]:
y_pred = model.predict(X_test)

In [495]:
confusion_matrix(y_train, model.predict(X_train))

array([[197,  75,  24],
       [ 15, 469,  10],
       [ 11,  91, 222]])

In [496]:
confusion_matrix(y_pred, labels_v(y_test))

array([[ 18,  21,  14],
       [ 31, 104,  35],
       [ 13,  22,  21]])

In [497]:
cmat_to_accuracy(confusion_matrix(y_pred, labels_v(y_test)))

0.51254480286738346

### Embedding 3

In [498]:
X_train, X_test, y_train, y_test = train_test_split(x2, y2, test_size=0.2)

In [499]:
y_train = labels_v(y_train)
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [500]:
y_pred = model.predict(X_test)

In [501]:
confusion_matrix(y_train, model.predict(X_train))

array([[175,  97,  11],
       [  1, 509,   8],
       [  9, 105, 199]])

In [502]:
confusion_matrix(y_pred, labels_v(y_test))

array([[13, 12, 11],
       [45, 87, 49],
       [17, 24, 21]])

In [503]:
cmat_to_accuracy(confusion_matrix(y_pred, labels_v(y_test)))

0.43369175627240142

In [504]:
y_train = labels_v(y_train)
model = XGBClassifier(subsample = 0.5, colsample_bytree = 0.5)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.5, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.5)

In [505]:
confusion_matrix(y_train, model.predict(X_train))

array([[171,  98,  14],
       [  5, 506,   7],
       [  7,  99, 207]])

In [506]:
confusion_matrix(y_pred, labels_v(y_test))

array([[13, 12, 11],
       [45, 87, 49],
       [17, 24, 21]])

In [507]:
cmat_to_accuracy(confusion_matrix(y_pred, labels_v(y_test)))

0.43369175627240142

### Embedding 4

In [508]:
X_train, X_test, y_train, y_test = train_test_split(x3, y3, test_size=0.2)

In [509]:
y_train = labels_v(y_train)
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [510]:
y_pred = model.predict(X_test)

In [511]:
confusion_matrix(y_train, model.predict(X_train))

array([[158,  94,  13],
       [  4, 521,   4],
       [  6, 122, 192]])

In [512]:
confusion_matrix(y_pred, labels_v(y_test))

array([[13, 11,  8],
       [63, 92, 45],
       [17,  9, 21]])

In [513]:
cmat_to_accuracy(confusion_matrix(y_pred, labels_v(y_test)))

0.45161290322580644

In [514]:
y_train = labels_v(y_train)
model = XGBClassifier(subsample = 0.5, colsample_bytree = 0.5)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.5, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.5)

In [515]:
confusion_matrix(y_train, model.predict(X_train))

array([[157,  91,  17],
       [  5, 518,   6],
       [ 10, 108, 202]])

In [516]:
confusion_matrix(y_pred, labels_v(y_test))

array([[13, 11,  8],
       [63, 92, 45],
       [17,  9, 21]])

In [517]:
cmat_to_accuracy(confusion_matrix(y_pred, labels_v(y_test)))

0.45161290322580644

## Classification with binary - xgb

In [360]:
%time x0_b, y0_b = modified_split(reviews, (2013, 1), embedder, 0, binary = True)

CPU times: user 11.7 s, sys: 49.6 ms, total: 11.8 s
Wall time: 11.2 s


In [361]:
%time x1_b, y1_b = modified_split(reviews, (2013, 1), embedder, 1, binary = True)

CPU times: user 11.4 s, sys: 24.6 ms, total: 11.5 s
Wall time: 10.9 s


In [362]:
%time x2_b, y2_b = modified_split(reviews, (2013, 1), embedder, 2, binary = True)

CPU times: user 11.2 s, sys: 18.5 ms, total: 11.2 s
Wall time: 10.6 s


In [363]:
%time x3_b, y3_b = modified_split(reviews, (2013, 1), embedder, 3, binary = True)

CPU times: user 11.6 s, sys: 33.6 ms, total: 11.6 s
Wall time: 11 s


### Embedding 1

In [518]:
X_train, X_test, y_train, y_test = train_test_split(x0_b, y0_b, test_size=0.2)

In [519]:
y_train = labels_v(y_train)
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [520]:
y_pred = model.predict(X_test)

In [521]:
confusion_matrix(y_train, model.predict(X_train))

array([[257,  28],
       [ 25, 291]])

In [522]:
confusion_matrix(y_pred, labels_v(y_test))

array([[44, 26],
       [29, 52]])

In [523]:
cmat_to_accuracy(confusion_matrix(y_pred, labels_v(y_test)))

0.63576158940397354

In [524]:
y_train = labels_v(y_train)
model = XGBClassifier(subsample = 0.5, colsample_bytree = 0.5)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.5, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.5)

In [525]:
y_pred = model.predict(X_test)

In [526]:
confusion_matrix(y_train, model.predict(X_train))

array([[258,  27],
       [ 33, 283]])

In [527]:
confusion_matrix(y_pred, labels_v(y_test))

array([[41, 28],
       [32, 50]])

In [528]:
cmat_to_accuracy(confusion_matrix(y_pred, labels_v(y_test)))

0.60264900662251653

### Embedding 2

In [596]:
X_train, X_test, y_train, y_test = train_test_split(x1_b, y1_b, test_size=0.2)

In [597]:
y_train = labels_v(y_train)
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [598]:
y_pred = model.predict(X_test)

In [599]:
confusion_matrix(y_pred, labels_v(y_test))

array([[33, 34],
       [30, 54]])

In [600]:
cmat_to_accuracy(confusion_matrix(y_pred, labels_v(y_test)))

0.57615894039735094

In [601]:
y_train = labels_v(y_train)
model = XGBClassifier(subsample = 0.5, colsample_bytree = 0.5)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.5, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.5)

In [602]:
confusion_matrix(y_train, model.predict(X_train))

array([[262,  33],
       [ 33, 273]])

In [603]:
confusion_matrix(y_pred, labels_v(y_test))

array([[33, 34],
       [30, 54]])

In [604]:
cmat_to_accuracy(confusion_matrix(y_pred, labels_v(y_test)))

0.57615894039735094

### Embedding 3

In [605]:
X_train, X_test, y_train, y_test = train_test_split(x2_b, y2_b, test_size=0.2)

In [606]:
y_train = labels_v(y_train)
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [607]:
y_pred = model.predict(X_test)

In [608]:
confusion_matrix(y_train, model.predict(X_train))

array([[255,  29],
       [ 26, 291]])

In [609]:
confusion_matrix(y_pred, labels_v(y_test))

array([[36, 29],
       [38, 48]])

In [610]:
cmat_to_accuracy(confusion_matrix(y_pred, labels_v(y_test)))

0.55629139072847678

In [611]:
y_train = labels_v(y_train)
model = XGBClassifier(subsample = 0.5, colsample_bytree = 0.5)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.5, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.5)

In [612]:
y_pred = model.predict(X_test)

In [613]:
confusion_matrix(y_train, model.predict(X_train))

array([[245,  39],
       [ 25, 292]])

In [614]:
confusion_matrix(y_pred, labels_v(y_test))

array([[41, 29],
       [33, 48]])

In [615]:
cmat_to_accuracy(confusion_matrix(y_pred, labels_v(y_test)))

0.58940397350993379

### Embedding 4

In [616]:
X_train, X_test, y_train, y_test = train_test_split(x3_b, y3_b, test_size=0.2)

In [617]:
y_train = labels_v(y_train)
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [618]:
y_pred = model.predict(X_test)

In [619]:
confusion_matrix(y_train, model.predict(X_train))

array([[264,  27],
       [ 26, 284]])

In [620]:
confusion_matrix(y_pred, labels_v(y_test))

array([[41, 32],
       [26, 52]])

In [621]:
cmat_to_accuracy(confusion_matrix(y_pred, labels_v(y_test)))

0.61589403973509937

In [622]:
y_train = labels_v(y_train)
model = XGBClassifier(subsample = 0.5, colsample_bytree = 0.5)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.5, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.5)

In [623]:
y_pred = model.predict(X_test)

In [624]:
confusion_matrix(y_train, model.predict(X_train))

array([[254,  37],
       [ 28, 282]])

In [625]:
confusion_matrix(y_pred, labels_v(y_test))

array([[40, 32],
       [27, 52]])

In [626]:
cmat_to_accuracy(confusion_matrix(y_pred, labels_v(y_test)))

0.60927152317880795

## Comparison Result

3 labels  

| Embedding     | Acuracy       | 
| ------------- | ------------- | 
| random guess  | 0.33          | 
| Embed         |  0.4659       |   
| Embed_sent    | 0.408         |
| Embed 1       | 0.4301        | 
| Embed 2       |  **0.5125**   |   
| Embed 3       | 0.4337        |
| Embed 4       | 0.4516        | 

2 labels  

| Embedding     | Acuracy       | 
| ------------- | ------------- | 
| random guess  | 0.50          | 
| Embed         |  0.5430       |   
| Embed_sent    | **0.6424**     |
| Embed 1       | 0.6358        | 
| Embed 2       |  0.5761   |   
| Embed 3       | 0.5894        |
| Embed 4       | 0.6159        | 