In [25]:
import numpy as np
import os
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import utils

DATA_DIR = "extracted"
SPLIT_SEED = 42  # fixed so the train/test file split is identical on every run


def _load_split(file_names, desc):
    """Read every file in ``file_names`` and return ``(features, labels)``.

    Each non-blank line is split on tabs: field 1 is parsed as a
    space-separated numeric vector and field 2 (whitespace-stripped) is kept
    as the label string. Field 0 is not used.

    Args:
        file_names: names of files located directly inside ``DATA_DIR``.
        desc: label shown on the tqdm progress bar.

    Returns:
        A pair of parallel lists: one numpy vector and one label per line.

    Raises:
        IndexError: if a non-blank line has fewer than three tab-separated
            fields.
    """
    features, labels = [], []
    for file_name in tqdm(file_names, desc=desc):
        with open(os.path.join(DATA_DIR, file_name), "r") as f:
            # Iterate the handle lazily instead of materialising the whole
            # file with readlines().
            for line in f:
                if not line.strip():
                    continue  # tolerate blank / trailing empty lines
                tokens = line.split("\t")
                features.append(np.fromstring(tokens[1], sep=" "))
                labels.append(tokens[2].strip())
    return features, labels


# os.listdir returns entries in arbitrary order; sort first so that the seeded
# split below assigns the same files to train/test on every machine and run.
all_files = sorted(os.listdir(DATA_DIR))

# The split is made at file level (10% of the files are held out), so all
# lines of one file end up on the same side of the split.
train_files, test_files = train_test_split(
    all_files, test_size=0.1, random_state=SPLIT_SEED
)

X_train, y_train = _load_split(train_files, desc="Extracting train file")
X_test, y_test = _load_split(test_files, desc="Extracting test file")

Extracting train file: 100%|██████████| 180/180 [00:01<00:00, 105.16it/s]
Extracting test file: 100%|██████████| 20/20 [00:00<00:00, 133.04it/s]


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.preprocessing import StandardScaler


def standardise_dataset(X_train, X_test):
    """Standardise both splits using statistics learned from the train split.

    The scaler is fitted on ``X_train`` only and the same fitted
    transformation is then applied to ``X_test``. Fitting a second time on the
    test split would scale the two sets with different statistics and leak
    test-set information into preprocessing.

    Args:
        X_train: training feature matrix (array-like, one row per sample).
        X_test: test feature matrix with the same number of columns.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled)`` as returned by the scaler.
    """
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    # transform only: reuse the statistics fitted on the train split
    X_test = sc.transform(X_test)
    return X_train, X_test

# Fixed seed so the forest and the validation split (and therefore the chosen
# hyper-parameters and y_pred) are reproducible across kernel restarts.
MODEL_SEED = 42

# Standardise the feature matrices before model selection.
X_train, X_test = standardise_dataset(X_train, X_test)

# n_estimators=50 is only a placeholder: the grid search below overrides it
# with each candidate value from param_grid.
rfc = RandomForestClassifier(n_estimators=50, random_state=MODEL_SEED)

param_grid = {
    'n_estimators': [200, 400, 600, 800],
}

# A single shuffled 80/20 hold-out split of the training data is used to
# score each candidate (n_splits=1), rather than k-fold cross-validation.
CV_rfc = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    cv=ShuffleSplit(test_size=0.20, n_splits=1, random_state=MODEL_SEED),
)
CV_rfc.fit(X_train, y_train)

# With GridSearchCV's default refit=True, predict() uses the best candidate
# refitted on the whole training set.
y_pred = CV_rfc.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Score the test-set predictions against the ground-truth test labels.
# The labels are held in y_test; no variable named y_true is defined in the
# visible cells, so referencing it raised NameError.
# average="macro" takes the unweighted mean of the per-class scores.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="macro")
recall = recall_score(y_test, y_pred, average="macro")
f1 = f1_score(y_test, y_pred, average="macro")
print("Accuracy: {}".format(accuracy))
print("Precision: {}".format(precision))
print("Recall: {}".format(recall))
print("F1: {}".format(f1))
