In [50]:
import numpy as np
import os
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import utils

feature_extracted_files = os.listdir("extracted")
original_files = os.listdir("processed")


In [51]:

def load_data(file_dir):
    train_files, test_files = train_test_split(all_files, test_size=0.1)
    X_train, y_train, X_test, y_test = [], [], [], []

    for data_file in tqdm(train_files, desc="Extracting train file"):
        data_file_path = os.path.join("extracted", data_file)
        with open(data_file_path, "r") as f:
            for line in f.readlines():
                tokens = line.split("\t")
                X_train.append(np.fromstring(tokens[1], sep=" "))
                y_train.append(tokens[2].strip())

    for data_file in tqdm(test_files, desc="Extracting test file"):
        data_file_path = os.path.join("extracted", data_file)
        with open(data_file_path, "r") as f:
            for line in f.readlines():
                tokens = line.split("\t")
                X_test.append(np.fromstring(tokens[1], sep=" "))
                y_test.append(tokens[2].strip())
    return X_train, y_train, X_test, y_test

X_train, y_train, X_test, y_test = load_data(original_files)

Extracting train file: 100%|██████████| 180/180 [00:00<00:00, 478.97it/s]
Extracting test file: 100%|██████████| 20/20 [00:00<00:00, 399.07it/s]


In [52]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.preprocessing import StandardScaler


def standardise_dataset(X_train, X_test):
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.fit_transform(X_test)
    return X_train, X_test

X_train, X_test = standardise_dataset(X_train, X_test)

In [53]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def eval_model(y_test, y_pred_rfc):
    accuracy = accuracy_score(y_test, y_pred_rfc)
    precision = precision_score(y_test, y_pred_rfc, average="macro")
    recall = recall_score(y_test, y_pred_rfc, average="macro")
    f1 = f1_score(y_test, y_pred_rfc, average="macro")
    print("Accuracy: {}".format(accuracy))
    print("Precision: {}".format(precision))
    print("Recall: {}".format(recall))
    print("F1: {}".format(f1))

In [54]:
rfc = RandomForestClassifier(n_estimators=50) 
 
# param_grid = { 
#     'n_estimators': [200, 400, 600, 800],
# }

# CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=ShuffleSplit(test_size=0.20, n_splits=1))
# CV_rfc.fit(X_train, y_train)
# y_pred = CV_rfc.predict(X_test)
rfc.fit(X_train, y_train)
y_pred_rfc = rfc.predict(X_test)
eval_model(y_test, y_pred_rfc)

Accuracy: 0.8028747433264887
Precision: 0.2082598316513286
Recall: 0.16349761010074576
F1: 0.1660895316748018


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [55]:
from sklearn.svm import SVC
svm = SVC(gamma='auto')

svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
eval_model(y_test, y_pred_svm)

Accuracy: 0.8709298914637724
Precision: 0.22770743999116352
Recall: 0.17681185542561215
F1: 0.1883566371223618


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [56]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2))
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_test)
eval_model(y_test, y_pred_mlp)

Accuracy: 0.5855089469052508
Precision: 0.09906536751249011
Recall: 0.07300653636709302
F1: 0.08080505900664132


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
