In [89]:
import numpy as np
import os
import pickle
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import utils

feature_extracted_files = "extracted"
original_files = "processed"


In [90]:

def load_data(file_dir):
    all_files = os.listdir(file_dir)
    train_files, test_files = train_test_split(all_files, test_size=0.1)
    X_train, y_train, X_test, y_test = [], [], [], []

    for data_file in tqdm(train_files, desc="Extracting train file"):
        data_file_path = os.path.join(file_dir, data_file)
        with open(data_file_path, "r") as f:
            for line in f.readlines():
                tokens = line.split("\t")
                features = np.fromstring(tokens[1], sep=" ")
                label = tokens[2].strip()
                X_train.append(features)
                y_train.append(label)

    for data_file in tqdm(test_files, desc="Extracting test file"):
        data_file_path = os.path.join(file_dir, data_file)
        with open(data_file_path, "r") as f:
            for line in f.readlines():
                tokens = line.split("\t")
                X_test.append(np.fromstring(tokens[1], sep=" "))
                y_test.append(tokens[2].strip())
    return X_train, y_train, X_test, y_test

X_train, y_train, X_test, y_test = load_data(original_files)

Extracting train file: 100%|██████████| 180/180 [00:02<00:00, 78.19it/s]
Extracting test file: 100%|██████████| 20/20 [00:00<00:00, 134.16it/s]


In [91]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.preprocessing import StandardScaler


def standardise_dataset(X_train, X_test):
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.fit_transform(X_test)
    return X_train, X_test

X_train, X_test = standardise_dataset(X_train, X_test)

In [92]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def eval_model(y_test, y_pred_rfc):
    accuracy = accuracy_score(y_test, y_pred_rfc)
    precision = precision_score(y_test, y_pred_rfc, average="macro")
    recall = recall_score(y_test, y_pred_rfc, average="macro")
    f1 = f1_score(y_test, y_pred_rfc, average="macro")
    print("Accuracy: {}".format(accuracy))
    print("Precision: {}".format(precision))
    print("Recall: {}".format(recall))
    print("F1: {}".format(f1))

In [93]:
rfc = RandomForestClassifier(n_estimators=50) 
 
# param_grid = { 
#     'n_estimators': [200, 400, 600, 800],
# }

# CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=ShuffleSplit(test_size=0.20, n_splits=1))
# CV_rfc.fit(X_train, y_train)
# y_pred = CV_rfc.predict(X_test)
rfc.fit(X_train, y_train)
y_pred_rfc = rfc.predict(X_test)
eval_model(y_test, y_pred_rfc)

with open("rf.p", "wb") as f:
    pickle.dump(rfc, f)

Accuracy: 0.7992895204262878
Precision: 0.6156880813200104
Recall: 0.4762147468446122
F1: 0.5112062346091091


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [94]:
from sklearn.svm import SVC
svm = SVC(gamma='auto')

svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
eval_model(y_test, y_pred_svm)

with open("svm.p", "wb") as f:
    pickle.dump(svm, f)

Accuracy: 0.9966449575685811
Precision: 0.7961538461538462
Recall: 0.7978898929746387
F1: 0.79697523926379


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [95]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(80, 50))
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_test)
eval_model(y_test, y_pred_mlp)

with open("mlp.p", "wb") as f:
    pickle.dump(mlp, f)

Accuracy: 0.5279257943556345
Precision: 0.1717360239960605
Recall: 0.12599938867105748
F1: 0.10392516694551085


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
