In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [3]:
data_x = np.load("./Doc2VecArray.npy")
data_x_pos = data_x[:100000]
data_x_neg = data_x[100000:]
data = np.append(data_x_pos,data_x_neg,axis=0)

print("data shape： ", data.shape)

label_list = ([1] * len(data_x_pos) + [0] * len(data_x_neg))
labels = np.array(label_list)
print("label shape： ", labels.shape)

train_x,val_x,train_y,val_y = train_test_split(data, labels, test_size=0.2)

print("training set shape： ", train_x.shape)
print("validation set shape： ", val_x.shape)

data shape：  (200000, 300)
label shape：  (200000,)
training set shape：  (160000, 300)
validation set shape：  (40000, 300)


In [27]:
from sklearn.linear_model import LogisticRegression as LR


grid_values = {'C': [1e-2,1e-1,1,2]}

clf = GridSearchCV(LR(penalty='l2', random_state=0, max_iter=3000,),
                           grid_values, scoring='roc_auc', cv=20,n_jobs=4)

clf.fit(train_x, train_y)
print("using LR, Best: %f using %s" %
      (clf.best_score_, clf.best_params_))

clf = clf.best_estimator_

pred_y = clf.predict(val_x)

print('--- report ---')
print(classification_report(val_y, pred_y))

print('--- auc ---')
print(roc_auc_score(val_y, pred_y))

using LR, Best: 0.802397 using {'C': 1}
--- report ---
              precision    recall  f1-score   support

           0       0.73      0.74      0.73     20078
           1       0.73      0.72      0.72     19922

    accuracy                           0.73     40000
   macro avg       0.73      0.73      0.73     40000
weighted avg       0.73      0.73      0.73     40000

--- auc ---
0.7289031716172403


In [28]:
from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn import preprocessing

def min_max_normalization(np_array):
    min_max_scaler = preprocessing.MinMaxScaler()
    ret = min_max_scaler.fit_transform(np_array)
    return ret

clf = MNB()
X_train = min_max_normalization(train_x)
clf.fit(X_train, train_y)
print("20 Fold CV Score for Multinomial Naive Bayes: %f" % (np.mean(cross_val_score
                                                                    (clf, X_train, train_y, cv=20, scoring='roc_auc'))))
pred_y = clf.predict(min_max_normalization(val_x))

print('--- report ---')
print(classification_report(val_y, pred_y))

print('--- auc ---')
print(roc_auc_score(val_y, pred_y))

20 Fold CV Score for Multinomial Naive Bayes: 0.749234
--- report ---
              precision    recall  f1-score   support

           0       0.69      0.67      0.68     20078
           1       0.68      0.70      0.69     19922

    accuracy                           0.69     40000
   macro avg       0.69      0.69      0.69     40000
weighted avg       0.69      0.69      0.69     40000

--- auc ---
0.6870984757678165


In [31]:
from sklearn.linear_model import SGDClassifier as SGD

# Find out which regularization parameter works the best.
sgd_params = {'alpha': [1e-1,0.5,1,1.5]}

clf = GridSearchCV(SGD(max_iter=10, random_state=0,loss='modified_huber',n_jobs=4),sgd_params, scoring='roc_auc', cv=20)  

clf.fit(train_x, train_y)
print("using SGD, Best: %f using %s" %
      (clf.best_score_, clf.best_params_))

clf = clf.best_estimator_

pred_y = clf.predict(val_x)

print('--- report ---')
print(classification_report(val_y, pred_y))

print('--- auc ---')
print(roc_auc_score(val_y, pred_y))

using SGD, Best: 0.775810 using {'alpha': 0.1}
--- report ---
              precision    recall  f1-score   support

           0       0.70      0.74      0.72     20078
           1       0.72      0.68      0.70     19922

    accuracy                           0.71     40000
   macro avg       0.71      0.71      0.71     40000
weighted avg       0.71      0.71      0.71     40000

--- auc ---
0.7066859486932797


In [29]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=800,max_depth=20)
clf.fit(train_x, train_y)
pred_y = clf.predict(val_x)

print('--- report ---')
print(classification_report(val_y, pred_y))

print('--- auc ---')
print(roc_auc_score(val_y, pred_y))

--- report ---
              precision    recall  f1-score   support

           0       0.73      0.72      0.73     20078
           1       0.72      0.73      0.73     19922

    accuracy                           0.73     40000
   macro avg       0.73      0.73      0.73     40000
weighted avg       0.73      0.73      0.73     40000

--- auc ---
0.7270013651907645


In [None]:
from sklearn.svm import SVC

parameters = [{'kernel': ['rbf'], 'gamma': [1e-2,0.005,1e-3],
                       'C': [0.5,1,1.5,2]},
                      {'kernel': ['linear'], 'C': [1e-2,0.1,1]}]
clf = GridSearchCV(
    SVC(probability=True),
    parameters,
    cv=5,
    scoring="roc_auc",
    n_jobs=4
)
clf.fit(train_x, train_y)
print("using SVM, Best: %f using %s" %
      (clf.best_score_, clf.best_params_))

clf = clf.best_estimator_

pred_y = clf.predict(val_x)

print('--- report ---')
print(classification_report(val_y, pred_y))

print('--- auc ---')
print(roc_auc_score(val_y, pred_y))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(n_estimators=800,max_depth=20)
clf.fit(train_x, train_y)
pred_y = clf.predict(val_x)

print('--- report ---')
print(classification_report(val_y, pred_y))

print('--- auc ---')
print(roc_auc_score(val_y, pred_y))

In [None]:
import xgboost as xgb

cv_params = {'max_depth': [7,9,10], 'min_child_weight': [1, 3, 5]}
ind_params = {'learning_rate': 0.1, 'n_estimators': 700, 'seed': 0, 'subsample': 0.8, 'colsample_bytree': 0.8,
              'objective': 'binary:logistic'}
clf = GridSearchCV(xgb.XGBClassifier(**ind_params),
                   cv_params,
                   scoring='roc_auc', cv=5, n_jobs=4,verbose=True)
clf.fit(train_x, train_y)

print("using sgdboot, Best: %f using %s" %
      (clf.best_score_, clf.best_params_))

clf = clf.best_estimator_

pred_y = clf.predict(val_x)

print('--- report ---')
print(classification_report(val_y, pred_y))

print('--- auc ---')
print(roc_auc_score(val_y, pred_y))

In [None]:
from keras.utils import to_categorical
from keras import layers
from keras import models

def build_model():
    model = models.Sequential()
    model.add(layers.Dense(64, activation='relu', input_shape=(300,)))
    model.add(layers.Dense(128, activation='relu'))
    model.add(layers.Dense(16, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

model_FC = build_model()

In [None]:
FC = model_FC.fit(train_x,
                    train_y,
                    epochs=30,
                    batch_size=512,
                    validation_split=0.2)

In [None]:
import matplotlib.pyplot as plt

epochs = 30
loss = FC.history['loss']
val_loss = FC.history['val_loss']

plt.plot(range(epochs), loss, label="loss")
plt.plot(range(epochs), val_loss, label="val loss")
plt.legend()
plt.savefig('loss.png')