In [None]:
# Import packages
import os
import urllib.request
import zipfile

import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import sklearn.ensemble
import sklearn.metrics
import sklearn.model_selection
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

sns.set_style('whitegrid')
sns.set_context('talk')

In [None]:
# Download pre-trained GloVe vectors from the website.
# Every expensive step below is skipped when its output file already exists,
# so re-running the notebook does not download, unzip or convert again.
URL = 'https://nlp.stanford.edu/data/glove.6B.zip'
FILE_NAME = 'glove.6B.zip'
GLOVE_FILE = 'glove.6B.100d.txt'
WORD2VEC_FILE = GLOVE_FILE + '.word2vec'

if not os.path.exists(GLOVE_FILE):
    if not os.path.exists(FILE_NAME):
        # Download to a temporary name first so that an interrupted transfer
        # is never mistaken for a complete archive on the next run.
        download_tmp = FILE_NAME + '.part'
        urllib.request.urlretrieve(URL, download_tmp)
        os.replace(download_tmp, FILE_NAME)

    with zipfile.ZipFile(FILE_NAME, 'r') as zip_ref:
        zip_ref.extractall()

# Convert pre-trained GloVe vectors into word2vec format
if not os.path.exists(WORD2VEC_FILE):
    # Same temporary-file trick: only a finished conversion gets the final name.
    convert_tmp = WORD2VEC_FILE + '.part'
    glove2word2vec(GLOVE_FILE, convert_tmp)
    os.replace(convert_tmp, WORD2VEC_FILE)

# Load pre-trained GloVe vectors
glove_model = KeyedVectors.load_word2vec_format(WORD2VEC_FILE, binary=False)

In [None]:
# Load dataset
# The three report sheets live in the same workbook, so the path is defined
# once and the file is parsed in a single call. Passing a list to `sheet_name`
# makes read_excel return a dict of DataFrames keyed by sheet name.
DATASET_PATH = '../../../Downloads/CHEME DIRECT/NLP Project/PSE Dataset.xlsx'
SHEET_NAMES = [
    'SamePerson Report',
    'Multiple People Report',
    'Multiple People Less Details',
]

sheets = pd.read_excel(DATASET_PATH, sheet_name=SHEET_NAMES)
df1, df2, df3 = (sheets[sheet_name] for sheet_name in SHEET_NAMES)

In [None]:
# Split training and testing dataset
# Each sheet is split on its own: 20% is held out for testing and the same
# seed (26) is used for all three splits.
(dev_1, test_1), (dev_2, test_2), (dev_3, test_3) = [
    sklearn.model_selection.train_test_split(sheet, test_size=0.2, random_state=26)
    for sheet in (df1, df2, df3)
]

In [None]:
# Get text and label from each dataset
# Each column is pulled out as an (n, 1) array; every line unpacks the
# (dev, test) pair of one sheet.
dev_1_text, test_1_text = [part['Report'].values.reshape(-1, 1) for part in (dev_1, test_1)]
dev_1_label, test_1_label = [part['Level'].values.reshape(-1, 1) for part in (dev_1, test_1)]

dev_2_text, test_2_text = [part['Report'].values.reshape(-1, 1) for part in (dev_2, test_2)]
dev_2_label, test_2_label = [part['Level'].values.reshape(-1, 1) for part in (dev_2, test_2)]

dev_3_text, test_3_text = [part['Report'].values.reshape(-1, 1) for part in (dev_3, test_3)]
dev_3_label, test_3_label = [part['Level'].values.reshape(-1, 1) for part in (dev_3, test_3)]

In [None]:
# Define a vectorizer that averages word embeddings into one vector per text
class Word2VecVectorizer:
    """Turn texts into fixed-length vectors by averaging their word embeddings.

    Parameters
    ----------
    model_vec : gensim ``KeyedVectors``-like object
        Must expose ``vector_size`` and ``get_vector(word)``; ``get_vector``
        is expected to raise ``KeyError`` for out-of-vocabulary words.
    lowercase : bool, default False
        When True, every token is lower-cased before the lookup. Useful with
        embeddings whose vocabulary is entirely lower-case. The default keeps
        the exact-match lookup.
    """

    def __init__(self, model_vec, lowercase=False):
        print("Loading in word vectors...")
        self.word_vectors = model_vec
        self.lowercase = lowercase
        print("Finished loading in word vectors")

    def fit(self, data):
        """Do nothing: the embeddings are pre-trained, so nothing is learned.

        Returns the vectorizer itself so calls can be chained.
        """
        return self

    def transform(self, data):
        """Embed every text in ``data`` as the mean of its known word vectors.

        Parameters
        ----------
        data : sequence
            One entry per sample. String entries are split on whitespace;
            any non-string entry (``None``, NaN, numbers) is treated as a
            sample without words.

        Returns
        -------
        numpy.ndarray of shape ``(len(data), vector_size)``
            Row ``i`` is the average embedding of sample ``i``, or all zeros
            when none of its tokens is in the vocabulary.
        """
        # Read the dimensionality from the model instead of probing a
        # hard-coded word that may be missing from the vocabulary.
        self.D = self.word_vectors.vector_size

        x_vec = np.zeros((len(data), self.D))
        emptycount = 0

        for row, sentence in enumerate(data):
            # Missing cells arrive as float NaN / None and have no .split().
            tokens = sentence.split() if isinstance(sentence, str) else []

            vecs = []
            for word in tokens:
                if self.lowercase:
                    word = word.lower()
                try:
                    vecs.append(self.word_vectors.get_vector(word))
                except KeyError:
                    # Out-of-vocabulary words are deliberately skipped.
                    continue

            if vecs:
                x_vec[row] = np.array(vecs).mean(axis=0)
            else:
                emptycount += 1

        print("Numer of samples with no words found: %s / %s" % (emptycount, len(data)))
        return x_vec

    def fit_transform(self, data):
        """Call ``fit`` and then return ``transform(data)``."""
        self.fit(data)

        return self.transform(data)

In [None]:
# Define a helper that flattens the (n, 1) text and label arrays into plain Python lists
def to_list(data):
    """Flatten one nesting level of ``data.tolist()`` into a single list.

    ``data`` must provide ``tolist()``; each element of the resulting list is
    iterated and its items are appended in order.
    """
    flattened = []
    for row in data.tolist():
        flattened.extend(row)

    return flattened

In [None]:
# Convert to lists
# Flatten each (text, label) pair of arrays: development sets first, then test sets.
dev_1_text_str, dev_1_label_str = map(to_list, (dev_1_text, dev_1_label))
dev_2_text_str, dev_2_label_str = map(to_list, (dev_2_text, dev_2_label))
dev_3_text_str, dev_3_label_str = map(to_list, (dev_3_text, dev_3_label))

test_1_text_str, test_1_label_str = map(to_list, (test_1_text, test_1_label))
test_2_text_str, test_2_label_str = map(to_list, (test_2_text, test_2_label))
test_3_text_str, test_3_label_str = map(to_list, (test_3_text, test_3_label))

In [None]:
# Concatenate three sets together
# Sheet order (1, 2, 3) is the same for texts and labels so they stay aligned.
dev_text = [*dev_1_text_str, *dev_2_text_str, *dev_3_text_str]
dev_label = [*dev_1_label_str, *dev_2_label_str, *dev_3_label_str]

test_text = [*test_1_text_str, *test_2_text_str, *test_3_text_str]
test_label = [*test_1_label_str, *test_2_label_str, *test_3_label_str]

In [None]:
# Build the vectorizer on top of the loaded GloVe model
vectorizer = Word2VecVectorizer(glove_model)

# Development set: averaged embeddings as features, labels passed through as-is
dev_x, dev_y = vectorizer.fit_transform(dev_text), dev_label

# Test set: same vectorizer, transform only
test_x, test_y = vectorizer.transform(test_text), test_label

In [None]:
# Split training set to have a validation set
# 20% of the development data is set aside for validation, with seed 26.
validation_split = sklearn.model_selection.train_test_split(
    dev_x, dev_y, test_size=0.2, random_state=26)
train_x, val_x, train_y, val_y = validation_split

In [None]:
# Hyperparameter optimization using Optuna
def train_evaluate_hyperparameters(n_estimators, max_depth, min_weight_fraction_leaf,
                                   random_state=26):
    """Fit a random forest on the training split and score it on the validation split.

    Parameters
    ----------
    n_estimators, max_depth, min_weight_fraction_leaf
        Forwarded unchanged to ``RandomForestClassifier``.
    random_state : int, default 26
        Seed for the forest. Fixing it makes the score a deterministic
        function of the hyperparameters, so trials are comparable.

    Returns
    -------
    float
        ``model.score(val_x, val_y)`` for the fitted model.

    Notes
    -----
    Reads the notebook-level ``train_x``, ``train_y``, ``val_x`` and ``val_y``.
    """
    model = sklearn.ensemble.RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        random_state=random_state,
        # Trees are independent; build them on all cores. With a fixed seed
        # this does not change the fitted model.
        n_jobs=-1,
    )
    # train the model on the training set
    model.fit(train_x, train_y)
    # evaluate the model on the validation set
    return model.score(val_x, val_y)

In [None]:
def objective(trial):
    """Optuna objective: sample random-forest hyperparameters and return their validation score.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw ``n_estimators``, ``max_depth`` and
        ``min_weight_fraction_leaf``.

    Returns
    -------
    float
        The value returned by ``train_evaluate_hyperparameters`` for the
        sampled hyperparameters.
    """
    # define hyperparameter space
    n_estimators = trial.suggest_int('n_estimators', 100, 5000)
    max_depth = trial.suggest_int('max_depth', 1, 600)
    # suggest_float replaces the deprecated suggest_uniform and samples
    # uniformly from the same [0.0, 0.5] range.
    min_weight_fraction_leaf = trial.suggest_float(
        'min_weight_fraction_leaf', 0.0, 0.5)

    # get the score for the hyperparameters chosen
    return train_evaluate_hyperparameters(n_estimators, max_depth, min_weight_fraction_leaf)

# Seed the TPE sampler so the sequence of suggested hyperparameters is the
# same on every run of the notebook.
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=26), direction='maximize')
study.optimize(objective, n_trials=50)
print('Best params: ', study.best_params)

In [None]:
# Train a Random Forest model
# NOTE(review): the hyperparameters below are pinned by hand and look like the
# output of an earlier Optuna run; confirm they match the search you want to report.
# random_state makes the fitted forest (and the metrics computed from it)
# reproducible; n_jobs=-1 builds the trees on all cores.
model = sklearn.ensemble.RandomForestClassifier(
    n_estimators=4974,
    min_weight_fraction_leaf=0.008440029412940517,
    max_depth=18,
    random_state=26,
    n_jobs=-1,
)

model.fit(dev_x, dev_y)

In [None]:
# Accuracy report
# Predict once and reuse the predictions for both metrics. For a classifier,
# model.score is the accuracy of model.predict, so the printed number is the
# same without running the forest over the test set twice.
predict_y = model.predict(test_x)

print("model accuracy:", sklearn.metrics.accuracy_score(test_y, predict_y))

print(sklearn.metrics.classification_report(test_y, predict_y, digits=4))

In [None]:
# Plot a confusion matrix
# Drawn from the fitted model's predictions on the held-out test set.
sklearn.metrics.ConfusionMatrixDisplay.from_estimator(
    estimator=model,
    X=test_x,
    y=test_y,
)