<h1> Modelling and evaluation </h1>
<h2> 1. Import and download </h2>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score as ACC
from sklearn.ensemble import RandomForestClassifier

from tensorflow import keras
from keras import layers
from keras.layers import RNN, Dense, Dropout, BatchNormalization
from keras import Sequential, layers, Input, callbacks

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Importing all the datasets
train_A = pd.read_csv('data/train_A.csv')
train_B = pd.read_csv('data/train_B.csv')
train_C = pd.read_csv('data/train_C.csv')

val_A = pd.read_csv('data/val_A.csv')
val_B = pd.read_csv('data/val_B.csv')
val_C = pd.read_csv('data/val_C.csv')

test_A = pd.read_csv('data/test_A.csv')
test_B = pd.read_csv('data/test_B.csv')
test_C = pd.read_csv('data/test_C.csv')

train_D = pd.read_csv('data/train_D.csv')
val_D= pd.read_csv('data/val_D.csv')
test_D = pd.read_csv('data/test_D.csv')

<h2> 2. Data preprocessing </h2>

In [3]:
datasets = [
            train_A, val_A, test_A, 
            # train_B, val_B, test_B, 
            train_C, val_C, test_C,
            train_D, val_D, test_D]


In [33]:
train_A

Unnamed: 0,char,dialog,word_len,character_len,stopword_count,verb_count,adj_count,propn_count,uinque_words,dialog_sentiment,...,fear,anger,positive,negative,disgust,surprise,joy,anticipation,sadness,avg_tf-idf
0,Rest,"Grond, Grond, Grond, Grond!",4,29,0,0,0,4,2,0.000,...,0,0,0,0,0,0,0,0,0,0.000000
1,FRODO,Smeagol?,1,10,0,0,0,0,1,0.000,...,0,0,0,0,0,0,0,0,0,0.000000
2,SAM,Look!The gate.It's opening!I can see a way down.,8,49,2,1,0,0,8,0.125,...,0,0,0,0,0,0,0,0,0,1.286746
3,PIPPIN,"Well, that's good news.",4,24,0,0,1,0,4,0.500,...,0,0,0,0,0,0,0,1,0,1.231094
4,PIPPIN,Frodo.,1,11,0,0,0,1,1,0.000,...,0,0,0,0,0,0,0,0,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1667,Rest,Your words are poison.,4,27,2,0,0,0,4,-0.125,...,0,2,0,0,0,0,0,0,0,1.800083
1668,Rest,A new power is rising.Its victory is at hand.,9,46,4,1,1,0,8,1.500,...,0,0,0,0,0,0,0,1,0,0.817054
1669,Rest,"Come on then , come on!",6,31,2,2,0,0,6,0.375,...,0,0,0,0,0,0,0,0,0,2.175262
1670,SAM,Mr. Frodo!,2,11,0,0,0,2,2,0.000,...,0,0,0,0,0,0,0,0,0,0.000000


In [4]:
imp_char = ["FRODO", "SAM", "GANDALF", "PIPPIN", "MERRY", "GOLLUM", "GIMLI", "THEODEN", "FARAMIR", "ARAGORN"]

# Creating a common label for the characters not of interest
def common_label_removal(data):
    mask = data["char"].isin(imp_char)
    data.loc[~ mask, "char"] = "Rest"
    mask2 = data['char'] == 'Rest'
    data = data[~mask2]
    return data

def x_y_split(data):
    y_data = data['char']
    x_data = data.drop(columns=['char', 'dialog'])
    return x_data, y_data

def char_2_num(y_data):
    encoder = LabelEncoder()
    y_data = y_data.values.reshape(-1, 1)
    encoded_data = encoder.fit_transform(y_data)
    names = list(encoder.inverse_transform(np.unique(encoded_data)))
    print(names)
    print(np.unique(encoded_data))
    return encoded_data, names

def preprocessing(data):
    data = common_label_removal(data)
    x_data, y_data = x_y_split(data)
    y_data = char_2_num(y_data)
    return x_data, y_data

for i in range(len(datasets)):
    datasets[i] = preprocessing(datasets[i])

['ARAGORN', 'FARAMIR', 'FRODO', 'GANDALF', 'GIMLI', 'GOLLUM', 'MERRY', 'PIPPIN', 'SAM', 'THEODEN']
[0 1 2 3 4 5 6 7 8 9]
['ARAGORN', 'FARAMIR', 'FRODO', 'GANDALF', 'GIMLI', 'GOLLUM', 'MERRY', 'PIPPIN', 'SAM', 'THEODEN']
[0 1 2 3 4 5 6 7 8 9]
['ARAGORN', 'FARAMIR', 'FRODO', 'GANDALF', 'GIMLI', 'GOLLUM', 'MERRY', 'PIPPIN', 'SAM', 'THEODEN']
[0 1 2 3 4 5 6 7 8 9]
['ARAGORN', 'FARAMIR', 'FRODO', 'GANDALF', 'GIMLI', 'GOLLUM', 'MERRY', 'PIPPIN', 'SAM', 'THEODEN']
[0 1 2 3 4 5 6 7 8 9]
['ARAGORN', 'FARAMIR', 'FRODO', 'GANDALF', 'GIMLI', 'GOLLUM', 'MERRY', 'PIPPIN', 'SAM', 'THEODEN']
[0 1 2 3 4 5 6 7 8 9]
['ARAGORN', 'FARAMIR', 'FRODO', 'GANDALF', 'GIMLI', 'GOLLUM', 'MERRY', 'PIPPIN', 'SAM', 'THEODEN']
[0 1 2 3 4 5 6 7 8 9]
['ARAGORN', 'FARAMIR', 'FRODO', 'GANDALF', 'GIMLI', 'GOLLUM', 'MERRY', 'PIPPIN', 'SAM', 'THEODEN']
[0 1 2 3 4 5 6 7 8 9]
['ARAGORN', 'FARAMIR', 'FRODO', 'GANDALF', 'GIMLI', 'GOLLUM', 'MERRY', 'PIPPIN', 'SAM', 'THEODEN']
[0 1 2 3 4 5 6 7 8 9]
['ARAGORN', 'FARAMIR', 'FRODO', 

In [5]:
A_tra_X =datasets[0][0]
A_tra_y =datasets[0][1][0]
A_val_X =datasets[1][0]
A_val_y=datasets[1][1][0]
A_tar_X=datasets[2][0]
A_tar_y=datasets[2][1][0]

C_tra_X =datasets[3][0]
C_tra_y =datasets[3][1][0]
C_val_X =datasets[4][0]
C_val_y=datasets[4][1][0]
C_tar_X=datasets[5][0]
C_tar_y=datasets[5][1][0]

D_tra_X =datasets[6][0]
D_tra_y =datasets[6][1][0]
D_val_X =datasets[7][0]
D_val_y=datasets[7][1][0]
D_tar_X=datasets[8][0]
D_tar_y=datasets[8][1][0]

In [6]:
D_val_y.shape

(254,)

<h2> 2. Benchmarks </h2>
<h3> 2.1 Naive Benchmark, Monte Carlo Method </h3>
<p> Using 1000 simulations with random guesses on target labels. </p>

In [7]:
def naive_benchmark_MonC(y):
    accuracy_list = []
    for i in range(0,1000,1):
        naive_rand_pred = np.random.randint(0,12,size=(len(y)))
        accuracy_sel = ACC(naive_rand_pred, y)
        accuracy_list.append(accuracy_sel)
    return np.mean(accuracy_list)

In [8]:
naive_benchmark_MonC(A_tar_y)

0.08334188034188034

<h3> 2.2 Naive Benchmark, Majority Class Method </h3>
<p> Using Frodo, which equals label 2, as guess </p>

In [9]:
def naive_benchmark_MajC(y):
    pred_MCNB =np.repeat(2,len(y))
    return ACC(pred_MCNB, y)

In [10]:
naive_benchmark_MajC(A_tar_y)

0.1752136752136752

<h2> 3. Modelling  </h2>
<h3> 3.1 ANN on dataset A</h3>
<p> Dataset A contains various numerical retrieved from the characters. </p>
<p> The feedforward neural network has a relative simple architecture.

In [11]:
scaler = StandardScaler()
# A1 = scaler.fit_transform(D_tra_X)
# A2 = scaler.transform(D_val_X)
# A3 = scaler.transform(D_tar_X)
A1 = D_tra_X.reset_index(drop=True)
A2 = D_val_X.reset_index(drop=True)
A3 = D_tar_X.reset_index(drop=True)

Y1 = np.eye(10)[D_tra_y]
Y2 = np.eye(10)[D_val_y]
Y3 = np.eye(10)[D_tar_y]

In [12]:
# ann_model = keras.Sequential([
#     layers.Dense(8, activation='relu',input_dim=20),
#     layers.BatchNormalization(),
#     layers.Dropout(rate=0.3),
#     # layers.Dense(16, activation='selu'),
#     # layers.BatchNormalization(),
#     # layers.Dropout(0.3),
#     layers.Dense(10, activation='softmax'),
#     layers.Dense(10)
# ])

# optimizer = keras.optimizers.Adam(learning_rate=0.01)
# ann_model.compile(optimizer=optimizer,
#               loss = 'categorical_crossentropy',
#               metrics=['accuracy']
#               )

# early_stopping = callbacks.EarlyStopping(
#     min_delta=0.001, # minimium amount of change to count as an improvement
#     patience=35, # how many epochs to wait before stopping
#     restore_best_weights=True,
# )
# ann_model.fit(A1, Y1, 
#           validation_data= (A2, Y2),
#           epochs=200, batch_size=10, 
#           callbacks=early_stopping,
#           verbose=0
#           )

# print('Accuracy train: ',ann_model.evaluate(A1, Y1))
# print('Accuracy validation: ',ann_model.evaluate(A2, Y2))
# print('Accuracy test: ',ann_model.evaluate(A3, Y3))

In [13]:
from xgboost import XGBClassifier 
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [14]:
p_g = {
    'objective':['multi:softprob'],
    'alpha': hp.uniform('alpha',0,1),
    'gamma': hp.uniform('gamma',0,9),
    'reg_lambda':hp.quniform('reg_lamda',0,3,1),
    'max_depth':hp.quniform('max_depth',6,12,1),
    'learning_rate': hp.uniform('learning_rate',0.001,0.05),
    'n_estimators': hp.quniform('n_estimators', 5,500,1),
    'min_child_weight': hp.quniform('min_child_weight',0,5,1),
    'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
    'seed':42
    }

In [15]:
from sklearn.model_selection import cross_val_score

In [16]:
# A3

In [17]:
A1.shape

(1147, 2697)

In [18]:
A_tar_y.shape

(234,)

In [19]:
def bayopt_xgb(p_g):
    internal_model = XGBClassifier(
                     objective='multi:softprob',
                     alpha=p_g['alpha'],
                     gamma=p_g['gamma'],
                     reg_lambda= p_g['reg_lambda'],
                    #  colsample_bytree= p_q['colsample_bytree'],
                     max_depth = int(p_g['max_depth']),
                     n_estimator = (p_g['n_estimators']),
                     learning_rate=p_g['learning_rate'],
                    #  min_child_weight=p_g['min_child_weight'],
                     seed =p_g['seed'],
                     )
    # evaluation = [(A2, A_val_y)]

    internal_model.fit(A1, D_tra_y,
                     eval_set = [(A2, D_val_y)],
                     eval_metric = 'mlogloss',
                     early_stopping_rounds=25,verbose=False)
    
    # pred_valid = internal_model.predict(A2)
    # score = ACC(pred_valid, A_tra_y)

    score =np.mean(cross_val_score(internal_model, A1, D_tra_y, scoring='accuracy', cv=5))
    print('Score:', score)
    return {'loss':-score, 'status':STATUS_OK}

def tune():
    trials = Trials()
    best_tune = fmin(fn=bayopt_xgb, 
                    space=p_g,
                    algo= tpe.suggest,
                    max_evals=5,
                    trials=trials)
    return best_tune


ntune = tune()
ntune['n_estimators'] =  int(ntune['n_estimators'])
ntune['max_depth'] =  int(ntune['max_depth'])
xmodel = XGBClassifier(**ntune)

Score:                                               
0.25978355800265807                                  
Score:                                                                           
0.25803303588380483                                                              
Score:                                                                           
0.2981469527245111                                                               
Score:                                                                           
0.2615264856654642                                                              
Score:                                                                          
0.26501613821910003                                                             
100%|██████████| 5/5 [01:49<00:00, 21.95s/trial, best loss: -0.2981469527245111]


In [20]:
xmodel.fit(A1, D_tra_y)

In [21]:
print('Accuracy train: ',ACC(xmodel.predict(A1),D_tra_y))
print('Accuracy validation: ',ACC(xmodel.predict(A2),D_val_y))
print('Accuracy test: ',ACC(xmodel.predict(A3),D_tar_y))

Accuracy train:  0.37401918047079336
Accuracy validation:  0.2874015748031496
Accuracy test:  0.27350427350427353


In [22]:
# A1a

In [23]:
# internal_model = XGBClassifier(
#                             objective='multi:softmax',
#                                 #  alpha=p_q['alpha'],
#                                 #  gamma=p_q['gamma'],
#                                 #  reg_lambda= p_q['reg_lambda'],
#                                 #  colsample_bytree= p_q['colsample_bytree'],
#                             # max_depth = int(p_g['max_depth']),
#                             max_depth = int(3),

#                             n_estimator = (p_g['n_estimators']),
#                             learning_rate=p_g['learning_rate'],
#                             #  min_child_weight=p_g['min_child_weight'],
#                             seed =p_g['seed'],
#                             )
# evaluation = [(A2, A_val_y)]

# internal_model.fit(A1, A_tra_y,
#                 eval_set = evaluation,
#                 eval_metric = 'mlogloss',
#                 early_stopping_rounds=25,verbose=False)
    
# pred_valid = internal_model.predict(A2)
# score = ACC(A2, A_tra_y)
#     # return pred_valid

# print('Score:', score)
# {'loss':-score, 'status':STATUS_OK}

# def tune():
#     trials = Trials()
#     best_tune = fmin(fn=internal_model, 
#                     space=p_g,
#                     algo= tpe.suggest,
#                     max_evals=200,
#                     trials=trials)
#     return best_tune


# ntune = tune()
# ntune['n_estimators'] =  int(ntune['n_estimators'])
# ntune['max_depth'] =  int(ntune['max_depth'])
# # xmodel = XGBClassifier(**ntune)

In [24]:

# def cvscore():
#     ntune = tune()
#     ntune['n_estimators'] =  int(ntune['n_estimators'])
#     ntune['max_depth'] =  int(ntune['max_depth'])
#     xmodel = XGBClassifier(**ntune, random_state=42)
#     cvs = cross_val_score(xmodel, A1, Y1, cv=25,
#                          random_state=42)
#     cvs.predict
#     return cvs.mean()

<h3> 3.2 RNN on dataset B </h3>
<p> Dataset B contains embeddings(?). This, I need to read myself up on.</p>

In [25]:
from numpy import array
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten,Embedding,Dense

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense , Flatten ,Embedding,Input
from keras.models import Model

In [26]:
B1 = pd.read_csv('data/train_df.csv')
B2= pd.read_csv('data/val_df.csv')
B3 = pd.read_csv('data/test_df.csv')

In [27]:
B1 = common_label_removal(B1).reset_index(drop=True)
B2 = common_label_removal(B2).reset_index(drop=True)
B3 = common_label_removal(B3).reset_index(drop=True)

In [28]:
def quote_list(X):
    quote_list = []
    for quote in range(len(X)):
        splitted_quote =  X['dialog'][quote].split()
        sequence_list = []
        for split in range(len(splitted_quote)):
            splitted_word = splitted_quote[split]

            word_list = str()
            i=0
            while i < (len(splitted_word)):
                # print(splitted_word[i])
                if splitted_word[i].isalpha() == True:
                    word_list += splitted_word[i]
                i+=1
            sequence_list.append(word_list)
        quote_list.append(sequence_list)
    return quote_list

In [29]:
def maxlen(X):
    uni = []
    for i in range(len(quote_list)):
        for j in range(len(quote_list[i])):
            if quote_list[i][j] not in uni:
                uni.append(quote_list[i][j])
    return len(uni)

In [30]:
B1 = quote_list(B1)
B2 = quote_list(B2)
B3 = quote_list(B3)

In [34]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(B1)
B1_seq = tokenizer.texts_to_sequences(B1)
B2_seq = tokenizer.texts_to_sequences(B2)
B3_seq = tokenizer.texts_to_sequences(B3)
maxlen = max([len(seq) for seq in B1_seq])

B1_padseq = pad_sequences(B1_seq, maxlen=maxlen,padding='post')
B2_padseq = pad_sequences(B2_seq, maxlen=maxlen,padding='post')
B3_padseq = pad_sequences(B3_seq, maxlen=maxlen,padding='post')

B1y = np.eye(10)[C_tra_y]
B2y = np.eye(10)[C_val_y]
B3y = np.eye(10)[C_tar_y]

In [35]:
emb_model = Sequential([
    layers.Embedding(input_dim=2500, output_dim=15, input_length=maxlen),
    # layers.Flatten(),
    layers.LSTM(8,activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0,3),
    layers.Dense(32, activation='selu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(64, activation='gelu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(10, activation='softmax')
])

optimizer = keras.optimizers.Adam(learning_rate=0.005)
emb_model.compile(optimizer=optimizer, 
            loss='categorical_crossentropy', 
            metrics=['accuracy'])

early_stopping = callbacks.EarlyStopping(
    min_delta=0.001, # minimium amount of change to count as an improvement
    patience=35, # how many epochs to wait before stopping
    restore_best_weights=True,
)

emb_model.fit(B1_padseq,B1y, epochs=100, batch_size=30, 
        validation_data=(B2_padseq, B2y),
        callbacks=early_stopping,)

Epoch 1/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 35ms/step - accuracy: 0.1211 - loss: 2.3122 - val_accuracy: 0.1299 - val_loss: 2.2466
Epoch 2/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.1169 - loss: 2.3241 - val_accuracy: 0.1299 - val_loss: 2.2378
Epoch 3/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.1275 - loss: 2.2900 - val_accuracy: 0.1693 - val_loss: 2.2303
Epoch 4/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.1106 - loss: 2.3082 - val_accuracy: 0.1496 - val_loss: 2.2368
Epoch 5/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.1352 - loss: 2.2701 - val_accuracy: 0.1496 - val_loss: 2.2372
Epoch 6/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.1356 - loss: 2.2862 - val_accuracy: 0.1299 - val_loss: 2.2430
Epoch 7/100
[1m39/39[0m [

<keras.src.callbacks.history.History at 0x20d078c4090>

In [36]:
emb_model.summary()

In [37]:
# Train accuracy
emb_model.evaluate(B1_padseq, B1y)

[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.1199 - loss: 2.2595


[2.258237361907959, 0.12205754220485687]

In [38]:
# Validation accuracy
emb_model.evaluate(B2_padseq, B2y)

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.1400 - loss: 2.2245 


[2.2302918434143066, 0.16929133236408234]

In [39]:
# Test accuracy
emb_model.evaluate(B3_padseq, B3y)

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.1632 - loss: 2.2011 


[2.2168354988098145, 0.1367521435022354]

<p> sources </p>
<ul>
<li>https://keras.io/api/models/model/</li>
<li>https://towardsdatascience.com/machine-learning-word-embedding-sentiment-classification-using-keras-b83c28087456</li>
<li>https://www.kaggle.com/code/rajmehra03/a-detailed-explanation-of-keras-embedding-layer</li>
<li>https://medium.com/@iqra.bismi/understanding-keras-embedding-for-natural-language-processing-9f65a281b1a7</li>

</ul>

<h3> 3.3 RFC on dataset C </h3>
<p>  Dataset C contains a counter on how many times a specific word have been mentioned in a quote. </p>

In [40]:
param_grid = {
    'n_estimators': [30,35,45,55,65,75,85,95],
    'max_depth': [6,9,12,15,18,21,24,27,30],
}

acc_list = []
for n in range(len(param_grid['n_estimators'])):
    nE = param_grid['n_estimators'][n]
    for d in range(len(param_grid['max_depth'])):
        mD = param_grid['max_depth'][d]
        
        model = RandomForestClassifier(n_estimators=nE, max_depth=mD, random_state=42) 
        model.fit(C_tra_X,C_tra_y)
        X1 = model.predict(C_tra_X)
        x2 = model.predict(C_val_X)
        acc_list.append(ACC(x2, C_val_y))


In [41]:
a = pd.Series(acc_list)
np.where(a==max(a))

(array([59], dtype=int64),)

In [42]:
#ne 85
#md 24
rfc_model = RandomForestClassifier(n_estimators=55, max_depth=15,random_state=42)
rfc_model.fit(C_tra_X,C_tra_y)
predCtrain= rfc_model.predict(C_tra_X)
predCval= rfc_model.predict(C_val_X)
predCtest= rfc_model.predict(C_tar_X)

In [43]:
# Train accuracy 
ACC(predCtrain, C_tra_y)

0.5483870967741935

In [44]:
# Train accuracy 
ACC(predCval, C_val_y)

0.2952755905511811

In [45]:
# Train accuracy 
ACC(predCtest, C_tar_y)

0.3247863247863248

<h2> 4. Ensemble model </h2>
<p> The RFC contains absolutely best results therefore, they will have prioritized votes if there are ties. </p>

In [46]:
# ann_model
# emb_model
# rfc_model

In [47]:
P1 = xmodel.predict(A3)
# P1 = pp.argmax(axis=1)

pp = emb_model.predict(B3_padseq)
P2 = pp.argmax(axis=1)

P3 = rfc_model.predict(C_tar_X)

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 39ms/step


In [48]:
final_preds = []
for i in range(len(P1)):
    preds =  [P1[i],P2[i],P3[i]]
    if preds[0]==preds[1]:
        ans = preds[0]
    elif preds[0]==preds[2]:
        ans= preds[0]
    elif preds[1]==preds[2]:
        ans=preds[1]
    else:
        ans = preds[2]
    final_preds.append(ans)

In [49]:
ACC(final_preds, A_tar_y)

0.32905982905982906

In [50]:
alfa = B1[1]

In [51]:
bravo = str()
for i in range(len(alfa)):
    bravo +=' '
    bravo += str(alfa[i])

In [52]:
bravo

' LookThe gateIts openingI can see a way down'

In [53]:
# Train
APtrain =  pd.DataFrame(xmodel.predict_proba(A1))
BPtrain =  pd.DataFrame(emb_model.predict(B1_padseq))
CPtrain =  pd.DataFrame(rfc_model.predict_proba(C_tra_X))
prob_train = pd.concat([APtrain, BPtrain, CPtrain], axis=1)
prob_train.columns = [i for i in range(30)]

# Validation
APval =  pd.DataFrame(xmodel.predict_proba(A2))
BPval =  pd.DataFrame(emb_model.predict(B2_padseq))
CPval =  pd.DataFrame(rfc_model.predict_proba(C_val_X))
prob_val = pd.concat([APval, BPval, CPval], axis=1)
prob_val.columns = [i for i in range(30)]

# Target
APtarget =  pd.DataFrame(xmodel.predict_proba(A3))
BPtarget =  pd.DataFrame(emb_model.predict(B3_padseq))
CPtarget =  pd.DataFrame(rfc_model.predict_proba(C_tar_X))
prob_target = pd.concat([APtarget, BPtarget, CPtarget], axis=1)
prob_target.columns = [i for i in range(30)]

[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step


In [54]:
# ann_model = keras.Sequential([
#     layers.Dense(32, activation='relu',input_dim=30),
#     layers.BatchNormalization(),
#     layers.Dropout(rate=0.3),
#     layers.Dense(64, activation='selu'),
#     layers.BatchNormalization(),
#     layers.Dropout(0.3),
#     layers.Dense(128, activation='relu'),
#     layers.BatchNormalization(),
#     layers.Dropout(0.3),
#     layers.Dense(254, activation='gelu'),
#     layers.BatchNormalization(),
#     layers.Dropout(0.3),
#     layers.Dense(10, activation='softmax'),
#     # layers.Dense(10)
# ])

# optimizer = keras.optimizers.Adam(learning_rate=0.03)
# ann_model.compile(optimizer=optimizer,
#               loss = 'categorical_crossentropy',
#               metrics=['accuracy']
#               )

# # early_stopping = callbacks.EarlyStopping(
# #     min_delta=0.001, # minimium amount of change to count as an improvement
# #     patience=100, # how many epochs to wait before stopping
# #     restore_best_weights=True,
# # )
# history = ann_model.fit(prob_train, Y1, 
#           validation_data= (prob_val, Y2),
#           epochs=1000, batch_size=40, 
#         #   callbacks=early_stopping,
#           verbose=1
#           )

In [55]:
# hist = pd.DataFrame(history.history)
# hist.head(1)

# plt.figure()
# plt.plot(hist['accuracy'], label='accuracy')
# plt.plot(hist['val_accuracy'], label='val_accuracy')
# plt.legend()
# plt.show()

<h3> SDG, crazy absolute shit </h3>

In [56]:
np.random.dirichlet(np.ones(10),size=1)


array([[0.07808862, 0.07166502, 0.01519242, 0.10844095, 0.13543518,
        0.03289309, 0.16695353, 0.01929295, 0.27605326, 0.09598498]])

In [57]:
APtarget

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.036821,0.014131,0.035704,0.034449,0.023710,0.752507,0.025350,0.026950,0.028595,0.021783
1,0.122892,0.049396,0.124803,0.090795,0.082879,0.077726,0.088613,0.094203,0.192551,0.076143
2,0.134706,0.051699,0.130620,0.126032,0.086741,0.081348,0.105953,0.098594,0.104615,0.079692
3,0.136510,0.052391,0.132369,0.127719,0.087902,0.082437,0.093985,0.099914,0.106015,0.080758
4,0.136510,0.052391,0.132369,0.127719,0.087902,0.082437,0.093985,0.099914,0.106015,0.080758
...,...,...,...,...,...,...,...,...,...,...
229,0.136510,0.052391,0.132369,0.127719,0.087902,0.082437,0.093985,0.099914,0.106015,0.080758
230,0.097274,0.029552,0.069860,0.110637,0.049582,0.422534,0.053013,0.055132,0.059799,0.052616
231,0.101173,0.038829,0.093381,0.122986,0.065148,0.296350,0.069656,0.074050,0.078573,0.059854
232,0.136510,0.052391,0.132369,0.127719,0.087902,0.082437,0.093985,0.099914,0.106015,0.080758


In [58]:
APtrain[0][0]

0.05811881

In [59]:
def part_A(A,B,C):
    obs_tot_list = []
    for i in range(0,len(A),1):
        ptot_list = []
        for j  in range(0,10,1):
            p1 = A[j][i]
            p2  = B[j][i]
            p3 = C[j][i]
            ptot = sum([p1,p2,p3])
            ptot_list.append(ptot)
        obs_tot_list.append(ptot_list)
    return obs_tot_list
    

In [64]:
def part_B(P, target, weigths):

    label_list = []
    for i in range(len(P)):
        prob = np.diag(P[i]) @ weigths
        label = np.argmax(prob)
        label_list.append(label)
    score = ACC(label_list, target)
    return [score, weigths]

In [61]:
# allprob_train = part_A(APtrain, BPtrain, CPtrain)
# # part_B(allprob_train, A_tra_y )
# allprob_val = part_A(APval, BPval, CPval)
# # part_B(allprob_train, A_tra_y )

In [91]:
def part_C():    
    score_df = pd.DataFrame()

    scoresTrain = []
    scoresVal = []
    weigths = []
    for i in range(2000):
        w = np.random.dirichlet(np.ones(10),size=1).T

        first = part_A(APtrain, BPtrain, CPtrain)
        second = part_B(first, A_tra_y, w )

        third = part_A(APval, BPval, CPval)
        fourth = part_B(third, C_val_y, w)
        scoresTrain.append(second[0])
        scoresVal.append(fourth[0])
        weigths.append(second[1])
    score_df['scoresTrain'] = scoresTrain
    score_df['scoresVal'] = scoresVal
    score_df['weigths'] = weigths   
    return score_df

In [92]:
wackatron = part_C()

In [93]:
wackatron

Unnamed: 0,scoresTrain,scoresVal,weigths
0,0.292938,0.212598,"[[0.06043969077104463], [0.08811177583527424],..."
1,0.168265,0.141732,"[[0.0003689754779390481], [0.06717935432971474..."
2,0.248474,0.161417,"[[0.030517391047784146], [0.12405030884601496]..."
3,0.256321,0.216535,"[[0.05001024369543752], [0.017324734218387657]..."
4,0.260680,0.216535,"[[0.20082948732414055], [0.13653597147948449],..."
...,...,...,...
1995,0.170009,0.200787,"[[0.0943959613593153], [0.11954826756939178], ..."
1996,0.217088,0.232283,"[[0.09674354031027756], [0.0074406696083373644..."
1997,0.251962,0.208661,"[[0.24304932510738453], [0.015609636985957229]..."
1998,0.193548,0.185039,"[[0.046371006192603384], [0.3382541338424892],..."


In [94]:
wackatron['scoreCommon'] = (wackatron['scoresTrain']+wackatron['scoresVal'])/2

In [95]:
np.where(wackatron['scoreCommon'] == max(wackatron['scoreCommon']))

(array([1656], dtype=int64),)

In [100]:
wackatron.iloc[1656]

scoresTrain                                             0.452485
scoresVal                                               0.279528
weigths        [[0.1217331983814923], [0.08906155716844878], ...
scoreCommon                                             0.366006
Name: 1656, dtype: object

In [101]:
hotel = part_A(APtarget,BPtarget,CPtarget)

In [102]:
part_B(hotel, A_tar_y, wackatron['weigths'][1])

[0.18803418803418803,
 array([[3.68975478e-04],
        [6.71793543e-02],
        [3.84464539e-01],
        [3.06313844e-03],
        [2.89104542e-01],
        [7.05471019e-02],
        [7.64320377e-02],
        [1.03942642e-01],
        [4.62374962e-03],
        [2.73918634e-04]])]

In [None]:
np.diag(ptot_list).shape

(10, 10)

<h1> 5. Conclusion: </h1>
<p> We have used three different datasets trained on three different models. The best individual model is the random forest classifier, which is trained on dummy coded BoW. </p>
<br>
<p> Furthermore, all the models have been put together in an ensemble model, where the majority class wins. The accuracy of the ensemble model is equal to the accuracy retrieved from the rfc model. This might indicate that there are no documents where the two other models agrees upon another label than the rfc model. In other words; the other models are do not give any type of additional explanatory power other what than the rfc model gives.</p>
<br>
<p> The upside of the modelling phase is that we have been able to create a model that is better than random guessing by 300% and a model that better than guessing Frodo all the time by approximately 100%. </p>
<br>
<h1> Biological hazard have left the building at 01:55.  </h1>