<h1> Import libraries and datasets </h1>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split as TTS,  GridSearchCV  
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB as NB


import nltk
from nltk.corpus import stopwords, sentiwordnet, wordnet
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

import spacy

from typing import List
from pprint import pprint


import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora
from gensim.models import CoherenceModel
import pyLDAvis.gensim

import tqdm

import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras.callbacks import EarlyStopping


import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
import warnings
warnings.filterwarnings("ignore")

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jakob\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
train = pd.read_csv('data/lotr_train.csv')
test = pd.read_csv('data/lotr_test.csv')

imp_char = ["FRODO", "SAM", "GANDALF", "PIPPIN", "MERRY", "GOLLUM", "GIMLI", "THEODEN", "FARAMIR", "SAURON", "ARAGORN", "SMEAGOL"]

FileNotFoundError: [Errno 2] No such file or directory: 'data/lotr_train.csv'

## Character prediction
### Divide and conquer

In [None]:
# Creating a common label for the characters not of interest. 
# Aware that this will impact the model, unsure if it would be positive or negative


def common_label_removal(data):
    mask = data["char"].isin(imp_char)
    data.loc[~ mask, "char"] = "Rest"
    mask2 = data['char'] == 'Rest'
    data = data[~mask2]
    return data

train = common_label_removal(train)
test = common_label_removal(test)


def x_y_split(data):
    y_data = data['char']
    x_data = data.drop(columns=['char', 'dialog'])
    return x_data, y_data

X_train_org, y_train_org = x_y_split(train)
X_test_org, y_test_org = x_y_split(test)

def char_2_num(y_data):
    encoder = LabelEncoder()
    y_data = y_data.values.reshape(-1, 1)
    encoded_data = encoder.fit_transform(y_data)
    names = list(encoder.inverse_transform(np.unique(encoded_data)))
    print(names)
    print(np.unique(encoded_data))
    return encoded_data, names

y_train_org, names = char_2_num(y_train_org)
y_test_org = char_2_num(y_test_org)[0]


['ARAGORN', 'FARAMIR', 'FRODO', 'GANDALF', 'GIMLI', 'GOLLUM', 'MERRY', 'PIPPIN', 'SAM', 'SAURON', 'SMEAGOL', 'THEODEN']
[ 0  1  2  3  4  5  6  7  8  9 10 11]
['ARAGORN', 'FARAMIR', 'FRODO', 'GANDALF', 'GIMLI', 'GOLLUM', 'MERRY', 'PIPPIN', 'SAM', 'SAURON', 'SMEAGOL', 'THEODEN']
[ 0  1  2  3  4  5  6  7  8  9 10 11]


### Naive Benchmark model

In [None]:
eval_methods = [ f1_score, precision_score, recall_score]

def naive_model(x_data, y_data):
    pred = np.random.randint(0, 12, size=len(x_data))
    print(classification_report(y_data, pred))
    print("Accuracy ", round(accuracy_score(y_data, pred), 4)) 
    
    for e in eval_methods:
        print(str(e.__name__), round(e(y_data, pred, average='weighted'), 4)) 
    return pred   

naive_predicitons = naive_model(X_train_org, y_train_org)

              precision    recall  f1-score   support

           0       0.09      0.07      0.08       136
           1       0.04      0.08      0.05        51
           2       0.08      0.05      0.06       167
           3       0.14      0.09      0.11       138
           4       0.04      0.05      0.05        84
           5       0.09      0.09      0.09       102
           6       0.10      0.08      0.09       102
           7       0.17      0.13      0.15       119
           8       0.13      0.08      0.10       143
           9       0.01      0.20      0.02         5
          10       0.01      0.03      0.02        31
          11       0.06      0.08      0.07        74

    accuracy                           0.08      1152
   macro avg       0.08      0.09      0.07      1152
weighted avg       0.10      0.08      0.09      1152

Accuracy  0.079
f1_score 0.0856
precision_score 0.0992
recall_score 0.079


### Random Forest Classifier
ON DATASET "A"

In [None]:

def optimize_model_parameters(X, y, model, param_grid, cv=5):
 
    rfc = model()

    grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=cv, scoring='accuracy', n_jobs=-1, error_score='raise')
    grid_search.fit(X, y)

    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print("Best Parameters:", best_params)
    print("Best Accuracy Score:", best_score)


    optimized = model(**best_params)
    optimized.fit(X, y)
    return optimized


param_grid = {
    'n_estimators': [30,35,45,55,65,75,85,95],
    'max_depth': [6,9,12,15,18,21,24,27,30],
    'random_state':[42]
    # 'min_samples_split': [ 5, 10, 15],
    # #'min_samples_leaf': [1, 2, 3, 4],
    # 'criterion': ['gini', 'entropy']
}

# Optimize parameters
optimized_rf = optimize_model_parameters(X_train_org, y_train_org, RFC, param_grid)

Best Parameters: {'max_depth': 15, 'n_estimators': 85, 'random_state': 42}
Best Accuracy Score: 0.22484848484848485


In [None]:
def evaluate_model(x_data, y_data, model):

    pred = model.predict(x_data)
    print(classification_report(y_data, pred))
    print("Accuracy ", round(accuracy_score(y_data, pred), 4)) 
    
    for e in eval_methods:
        print(str(e.__name__), round(e(y_data, pred, average='weighted'), 4)) 
    return pred  
rfc_predictions = evaluate_model(lotr_test_X, lotr_test_Y, optimized_rf) 

              precision    recall  f1-score   support

           0       0.13      0.17      0.15        53
           1       0.00      0.00      0.00        14
           2       0.25      0.35      0.29        74
           3       0.29      0.36      0.32        76
           4       0.12      0.10      0.11        31
           5       0.05      0.03      0.04        32
           6       0.15      0.14      0.14        35
           7       0.11      0.11      0.11        44
           8       0.25      0.23      0.24        75
           9       0.00      0.00      0.00         2
          10       0.17      0.06      0.08        18
          11       0.05      0.03      0.04        36

    accuracy                           0.19       490
   macro avg       0.13      0.13      0.13       490
weighted avg       0.18      0.19      0.18       490

Accuracy  0.1939
f1_score 0.1817
precision_score 0.1766
recall_score 0.1939


In [None]:
def conf_matrix(y, pred):
    cm = confusion_matrix(y, pred)
    fig, ax = plt.subplots(figsize=(10,10)) 
    sns.heatmap(cm/np.sum(cm), annot=True, 
                fmt='.1%', cmap='Blues', ax=ax, 
                xticklabels=names, yticklabels=names)
    plt.xlabel('Predicted Label')
    plt.ylabel('Actual Label')
    plt.show()

# conf_matrix(lotr_test_Y, rfc_predictions)

ON DATASET "A+B"

In [None]:
test_w2v = pd.read_csv('data/test_w2v.csv')
train_w2v = pd.read_csv('data/train_w2v.csv')


X_train_org.reset_index(drop=True, inplace=True)
X_test_org.reset_index(drop=True, inplace=True)

X_train_all = pd.concat([X_train_org, train_w2v], axis=1)
X_test_all = pd.concat([X_test_org, test_w2v], axis=1)

In [None]:
optimized_rf2 = optimize_model_parameters(X_train_org, y_train_org, RFC, param_grid)
rfc2_predictions = evaluate_model(X_test_org, y_test_org, optimized_rf2) 
# conf_matrix(lotr_test_Y, rfc2_predictions)

Best Parameters: {'max_depth': 15, 'n_estimators': 85, 'random_state': 42}
Best Accuracy Score: 0.22484848484848485
              precision    recall  f1-score   support

           0       0.13      0.17      0.15        53
           1       0.00      0.00      0.00        14
           2       0.25      0.35      0.29        74
           3       0.29      0.36      0.32        76
           4       0.12      0.10      0.11        31
           5       0.05      0.03      0.04        32
           6       0.15      0.14      0.14        35
           7       0.11      0.11      0.11        44
           8       0.25      0.23      0.24        75
           9       0.00      0.00      0.00         2
          10       0.17      0.06      0.08        18
          11       0.05      0.03      0.04        36

    accuracy                           0.19       490
   macro avg       0.13      0.13      0.13       490
weighted avg       0.18      0.19      0.18       490

Accuracy  0.1939


<h1> Feedforward neural network

In [None]:
from keras import layers
from keras.layers import LSTM, Dense, Dropout, BatchNormalization
from keras import Sequential, layers, Input, callbacks

In [None]:
y1 = np.eye(12)[y_train_org]
y2 = np.eye(12)[y_test_org]

In [None]:
model = keras.Sequential([
    layers.Dense(8, activation='relu',input_dim=190),
    layers.BatchNormalization(),
    layers.Dropout(rate=0.3),
    layers.Dense(8, activation='selu'),
    # layers.BatchNormalization(),
    # layers.Dropout(0.3),
    # layers.Dense(254, activation='softmax'),
    layers.Dense(12)
])
model.compile(optimizer='adam',
              loss = 'categorical_crossentropy',
              metrics=['accuracy']
              )

# early_stopping = callbacks.EarlyStopping(
    min_delta=0.001, # minimium amount of change to count as an improvement
    patience=35, # how many epochs to wait before stopping
    restore_best_weights=True,
)
model.fit(X_train_all, y1, 
          validation_data= (X_test_all, y2),
        # validation_split=0.3,
          epochs=200, batch_size=5, 
          callbacks=early_stopping
          )

IndentationError: unexpected indent (759619564.py, line 17)

In [None]:
model.evaluate(X_test_all, y2)

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.0976 - loss: 7.4321 


[7.598530292510986, 0.11224489659070969]