In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import scipy
import sklearn
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA, FastICA
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC, SVC

# plot_roc_curve was removed in scikit-learn 1.2; guard the import so this
# cell still runs on newer releases (the name is None when unavailable).
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    plot_roc_curve = None

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and
# later dropped the old aliases, so prefer the new name when it is available
# and fall back to the legacy one on older installs.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
%matplotlib inline

In [2]:
# Directory containing the .npy arrays loaded by this notebook. Set the
# ALC_FEATURES_DIR environment variable to point somewhere else; the original
# location is kept as the default so existing setups behave the same.
PATH = os.environ.get(
    "ALC_FEATURES_DIR",
    "/Users/mazeyu/Desktop/CMU/20fall/18797/project/features",
)

In [3]:
# Load the feature/label arrays for every split from PATH, in the order
# train, d1, d2, test (x before y for each).
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, so
# only point PATH at files you trust.
(train_x, train_y,
 dev1_x, dev1_y,
 dev2_x, dev2_y,
 test_x, test_y) = (
    np.load(os.path.join(PATH, split + '_' + part + '.npy'), allow_pickle=True)
    for split in ('train', 'd1', 'd2', 'test')
    for part in ('x', 'y')
)

In [None]:
# Fold both dev splits into the training set.
# NOTE(review): train_x / train_y are overwritten from themselves, so running
# this cell twice appends the dev splits a second time.
train_x = np.concatenate((train_x, dev1_x, dev2_x), axis=0)
train_y = np.concatenate((train_y, dev1_y, dev2_y), axis=0)

# Oversample with SMOTE to obtain a class-balanced copy of the training set.
# NOTE(review): this runs on the features before the StandardScaler cell, so
# SMOTE sees unscaled features - confirm that is intended.
smote = SMOTE(random_state=0)
balance_x, balance_y = smote.fit_resample(train_x, train_y)

In [5]:
# Report the (features, labels) shapes of each data set used below.
for _tag, _features, _labels in (
    ('train:', train_x, train_y),
    ('test:', test_x, test_y),
    ('balance:', balance_x, balance_y),
):
    print(_tag, _features.shape, _labels.shape)

train: (5400, 4368) (5400,)
dev1: (3960, 4368) (3960,)
dev2: (1500, 4368) (1500,)
test: (3000, 4368) (3000,)
balance: (7200, 4368) (7200,)


In [None]:
# Fit the standardisation on train_x only, then apply the same transform to
# the training, balanced and test features.
scaler = StandardScaler().fit(train_x)
train_x, balance_x, test_x = (
    scaler.transform(features) for features in (train_x, balance_x, test_x)
)

In [6]:
# Reduce the features to 50 principal components. The projection is fit on
# train_x and then applied unchanged to the balanced and test features.
# random_state is fixed because PCA may choose a randomized SVD solver for
# inputs of this size, which would otherwise make the components differ from
# run to run.
pca = PCA(n_components=50, random_state=0)
pca.fit(train_x)
train_x = pca.transform(train_x)
balance_x = pca.transform(balance_x)
test_x = pca.transform(test_x)

In [None]:
# Unmix the features into 50 independent components. The model is fit on
# train_x and then applied unchanged to the balanced and test features.
# FastICA initialises its un-mixing matrix randomly when w_init is not given,
# so random_state is fixed to make the components reproducible across runs.
ica = FastICA(n_components=50, random_state=0)
ica.fit(train_x)
train_x = ica.transform(train_x)
balance_x = ica.transform(balance_x)
test_x = ica.transform(test_x)

In [16]:
class ALCModel:
    """Build a scikit-learn classifier by name and evaluate it.

    The wrapped estimator is stored on ``self.clf``. Supported ``method``
    names are listed in ``METHODS``; every extra keyword argument is passed
    straight to the estimator's constructor.
    """

    # Names accepted by __init__, validated before any estimator is built.
    METHODS = ('logistic', 'neighbor', 'svm', 'forest', 'adaboost', 'gradboost')

    def __init__(self, method, **kwarg):
        """Create the estimator selected by ``method``.

        Args:
            method: One of the names in ``METHODS``.
            **kwarg: Forwarded unchanged to the estimator constructor.

        Raises:
            NotImplementedError: If ``method`` is not a supported name.
        """
        if method not in self.METHODS:
            raise NotImplementedError(
                'unknown method {!r}; expected one of: {}'.format(
                    method, ', '.join(self.METHODS)))

        estimators = {
            # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
            'logistic': LogisticRegression,
            # https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
            'neighbor': KNeighborsClassifier,
            # https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
            'svm': SVC,
            # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
            'forest': RandomForestClassifier,
            # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
            'adaboost': AdaBoostClassifier,
            # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
            'gradboost': GradientBoostingClassifier,
        }
        self.clf = estimators[method](**kwarg)

    def fit(self, x, y):
        """Fit the wrapped estimator on ``x``/``y`` and return ``self``."""
        self.clf.fit(x, y)
        # Returning self allows ALCModel(...).fit(x, y) chaining; callers that
        # ignore the return value are unaffected.
        return self

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)

    def evaluate(self, x, label, roc=True):
        """Score the fitted estimator on ``x`` against ``label``.

        Args:
            x: Feature matrix to predict on.
            label: Ground-truth labels for ``x``.
            roc: When true, also draw and show the ROC curve.

        Returns:
            Tuple ``(acc, report)``: the fraction of predictions equal to
            ``label`` and the text produced by ``classification_report``.
        """
        pred = self.predict(x)
        acc = np.mean(pred == label)
        report = classification_report(label, pred)
        if roc:
            self._plot_roc(x, label)
            plt.show()
        return acc, report

    def _plot_roc(self, x, label):
        """Draw the ROC curve with whichever API this scikit-learn provides."""
        try:
            # Available from scikit-learn 1.0; the only option from 1.2 on.
            from sklearn.metrics import RocCurveDisplay
            draw = RocCurveDisplay.from_estimator
        except (ImportError, AttributeError):
            # Older scikit-learn: fall back to the original helper.
            from sklearn.metrics import plot_roc_curve as draw
        return draw(self.clf, x, label)

In [None]:
# Logistic regression: train on the SMOTE-balanced set, report on the test set.
logistic_kwargs = dict(penalty='l2', C=1.0, n_jobs=8)
model = ALCModel('logistic', **logistic_kwargs)
model.fit(balance_x, balance_y)
acc, report = model.evaluate(test_x, test_y, roc=True)
print(report)

In [None]:
# k-nearest neighbours: train on the SMOTE-balanced set, report on the test set.
neighbor_kwargs = dict(n_neighbors=5, leaf_size=30, n_jobs=8)
model = ALCModel('neighbor', **neighbor_kwargs)
model.fit(balance_x, balance_y)
acc, report = model.evaluate(test_x, test_y, roc=True)
print(report)

In [None]:
# RBF-kernel SVC: train on the SMOTE-balanced set, report on the test set.
svm_kwargs = dict(C=1.0, kernel='rbf')
model = ALCModel('svm', **svm_kwargs)
model.fit(balance_x, balance_y)
acc, report = model.evaluate(test_x, test_y, roc=True)
print(report)

In [None]:
# Random forest: train on the SMOTE-balanced set, report on the test set.
forest_kwargs = dict(n_estimators=100, n_jobs=8)
model = ALCModel('forest', **forest_kwargs)
model.fit(balance_x, balance_y)
acc, report = model.evaluate(test_x, test_y, roc=True)
print(report)

In [None]:
# AdaBoost: train on the SMOTE-balanced set, report on the test set.
adaboost_kwargs = dict(n_estimators=100, learning_rate=1.0)
model = ALCModel('adaboost', **adaboost_kwargs)
model.fit(balance_x, balance_y)
acc, report = model.evaluate(test_x, test_y, roc=True)
print(report)

In [None]:
# Gradient boosting (verbose training log): train on the SMOTE-balanced set,
# report on the test set.
gradboost_kwargs = dict(n_estimators=100, learning_rate=0.1, verbose=True)
model = ALCModel('gradboost', **gradboost_kwargs)
model.fit(balance_x, balance_y)
acc, report = model.evaluate(test_x, test_y, roc=True)
print(report)