In [None]:
# Import packages
from gensim.models.fasttext import load_facebook_model
from gensim.models.fasttext import load_facebook_vectors
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
import sklearn.ensemble
import sklearn
import optuna

In [None]:
# Load the FastText vectors stored in the Facebook-format .bin file
fasttext_path = '../../Downloads/CHEME DIRECT/NLP Project/cc.en.300.bin'
fasttext_model = load_facebook_vectors(fasttext_path)

In [None]:
# Define a class for embedding
class FastTextVectorizer:
    """Embed documents by averaging the FastText vectors of their words.

    Parameters
    ----------
    fasttext_model
        Word-vector lookup that supports ``word in model``, ``model[word]``
        and exposes ``vector_size`` (e.g. the object returned by
        ``load_facebook_vectors``).
    """

    def __init__(self, fasttext_model):
        self.fasttext_model = fasttext_model

    @staticmethod
    def _tokens(text):
        """Split one document on whitespace; None/NaN yield no tokens."""
        # Empty spreadsheet cells arrive as None or float NaN and have no
        # ``.split``; treat them as empty documents instead of crashing.
        if text is None or (isinstance(text, float) and np.isnan(text)):
            return []
        return str(text).split()

    def _embed(self, text):
        """Return the mean vector of the known words in ``text``.

        A document with no known words maps to an all-zero vector.
        """
        model = self.fasttext_model
        vectors = [model[w] for w in self._tokens(text) if w in model]
        if not vectors:
            return np.zeros(model.vector_size)
        return np.mean(vectors, axis=0)

    def transform(self, X):
        """Convert an iterable of documents into a 2-D array of embeddings.

        Parameters
        ----------
        X : iterable
            Documents (strings). Missing values (None / NaN) are treated as
            empty documents; other non-string values are converted with ``str``.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(n_documents, vector_size)``, one row per document.
            An empty ``X`` gives shape ``(0, vector_size)``.
        """
        rows = [self._embed(text) for text in X]
        if not rows:
            # Keep the result two-dimensional even when there is nothing to embed.
            return np.empty((0, self.fasttext_model.vector_size))
        return np.array(rows)

In [None]:
# Load dataset
# All three sheets live in the same workbook, so open and parse it once:
# passing a list of sheet names makes read_excel return a dict keyed by sheet name.
dataset_path = '../../Downloads/CHEME DIRECT/NLP Project/PSE Dataset.xlsx'
dataset_sheets = pd.read_excel(
    dataset_path,
    sheet_name=['SamePerson Report', 'Multiple People Report', 'Multiple People Less Details'],
)
df1 = dataset_sheets['SamePerson Report']
df2 = dataset_sheets['Multiple People Report']
df3 = dataset_sheets['Multiple People Less Details']

In [None]:
# Hold out 20% of each sheet as a test set, using the same seed for every split
(dev_1, test_1), (dev_2, test_2), (dev_3, test_3) = (
    train_test_split(sheet, test_size=0.2, random_state=26)
    for sheet in (df1, df2, df3)
)

In [None]:
# Pull the 'Report' text and the 'Level' label out of every split as (n, 1) column arrays
(dev_1_text, test_1_text,
 dev_2_text, test_2_text,
 dev_3_text, test_3_text) = (
    split['Report'].values.reshape(-1, 1)
    for split in (dev_1, test_1, dev_2, test_2, dev_3, test_3)
)

(dev_1_label, test_1_label,
 dev_2_label, test_2_label,
 dev_3_label, test_3_label) = (
    split['Level'].values.reshape(-1, 1)
    for split in (dev_1, test_1, dev_2, test_2, dev_3, test_3)
)

In [None]:
# Flatten a 2-D array of texts or labels into a plain Python list
def to_list(data):
    """Return the elements of a 2-D array as one flat list, row by row."""
    flattened = []
    for row in data.tolist():
        flattened.extend(row)
    return flattened

In [None]:
# Flatten every column array into a plain list (development splits first, then test splits)
dev_1_text_str, dev_2_text_str, dev_3_text_str = map(
    to_list, (dev_1_text, dev_2_text, dev_3_text))
dev_1_label_str, dev_2_label_str, dev_3_label_str = map(
    to_list, (dev_1_label, dev_2_label, dev_3_label))

test_1_text_str, test_2_text_str, test_3_text_str = map(
    to_list, (test_1_text, test_2_text, test_3_text))
test_1_label_str, test_2_label_str, test_3_label_str = map(
    to_list, (test_1_label, test_2_label, test_3_label))

In [None]:
# Merge the three sheets into one development set and one test set (sheet order 1, 2, 3)
dev_text = [*dev_1_text_str, *dev_2_text_str, *dev_3_text_str]
test_text = [*test_1_text_str, *test_2_text_str, *test_3_text_str]

dev_label = [*dev_1_label_str, *dev_2_label_str, *dev_3_label_str]
test_label = [*test_1_label_str, *test_2_label_str, *test_3_label_str]

In [None]:
# Embed the raw text of both sets with the averaged FastText vectors
vectorizer = FastTextVectorizer(fasttext_model)
dev_text_ft, test_text_ft = (
    vectorizer.transform(texts) for texts in (dev_text, test_text)
)

In [None]:
# Carve a 20% validation set out of the development data for hyperparameter search
train_x, val_x, train_y, val_y = train_test_split(
    dev_text_ft,
    dev_label,
    test_size=0.2,
    random_state=26,
)

In [None]:
# Hyperparameter optimization using Optuna
def train_evaluate_hyperparameters(n_estimators, max_depth, min_weight_fraction_leaf,
                                   random_state=26, n_jobs=-1):
    """Fit a random forest on the training split and score it on the validation split.

    Parameters
    ----------
    n_estimators, max_depth, min_weight_fraction_leaf
        Passed straight through to ``RandomForestClassifier``.
    random_state : int or None, default 26
        Seed for the forest. A fixed seed makes the score a deterministic
        function of the hyperparameters, so trials can be compared fairly
        and the search can be reproduced.
    n_jobs : int or None, default -1
        Number of parallel jobs for fitting and predicting (-1 uses all cores).

    Returns
    -------
    float
        ``model.score(val_x, val_y)`` for the fitted model.
    """
    model = sklearn.ensemble.RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        random_state=random_state,
        n_jobs=n_jobs,
    )
    # train on the training split, evaluate on the held-out validation split
    model.fit(train_x, train_y)
    return model.score(val_x, val_y)

In [None]:
def objective(trial):
    """Optuna objective: sample random-forest hyperparameters and return their validation score.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        The value returned by ``train_evaluate_hyperparameters`` for the sampled values.
    """
    # define hyperparameter space
    n_estimators = trial.suggest_int('n_estimators', 100, 5000)
    max_depth = trial.suggest_int('max_depth', 1, 600)
    # suggest_float replaces the deprecated suggest_uniform (same uniform range)
    min_weight_fraction_leaf = trial.suggest_float(
        'min_weight_fraction_leaf', 0.0, 0.5)

    # get the score for the hyperparameters chosen
    return train_evaluate_hyperparameters(n_estimators, max_depth, min_weight_fraction_leaf)

# Seed the sampler so the sequence of suggested hyperparameters is reproducible
# across kernel restarts (same seed value as the data splits).
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=26), direction='maximize')
study.optimize(objective, n_trials=50)
print('Best params: ', study.best_params)

In [None]:
# Train a Random Forest model
# NOTE(review): the hyperparameters below are hard-coded — presumably copied from the
# 'Best params' output of an earlier Optuna run; confirm they are still the intended values.
model = sklearn.ensemble.RandomForestClassifier(
    n_estimators=871,
    min_weight_fraction_leaf=0.001978389039761175,
    max_depth=472,
    random_state=26,  # fixed seed so the reported test metrics are reproducible
    n_jobs=-1,        # fit/predict trees in parallel; does not change the result
)

# Fit on the full development set (training + validation data)
model.fit(dev_text_ft, dev_label)

In [None]:
# Evaluate on the held-out test set: overall accuracy, then the per-class report
predict_label = model.predict(test_text_ft)
test_accuracy = model.score(test_text_ft, test_label)

print("model accuracy:", test_accuracy)

report = classification_report(test_label, predict_label, digits=4)
print(report)

In [None]:
# Plot a confusion matrix
# Pass the fitted random forest (`model`); no `clf` is defined in the cells shown,
# so the original call raised NameError on a fresh kernel.
sklearn.metrics.ConfusionMatrixDisplay.from_estimator(model, test_text_ft, test_label)