## Author Identification with Fully Connected Neural Network






### Imports and Data Preparation

In [1]:
from numpy.random import seed
seed(1)
from tensorflow.random import set_seed
set_seed(2)

import numpy as np
import pandas as pd
import keras
from keras.layers import Layer
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from nltk.tokenize import sent_tokenize,word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import nltk
from nltk.corpus import stopwords

from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import tensorflow as tf
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.optimizers import Adam

import pickle
from sklearn.preprocessing import StandardScaler

nltk.download('punkt')
nltk.download('stopwords')
#np.random.seed(1)
random_state = 7

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Reading only the train set of the data since the test set doesn't have labels and we ended up not using it at all.

In [3]:
train = pd.read_csv('/content/drive/My Drive/Author_identification/train.csv') 
train.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


Executing the neccessary data preparation steps on each entry of the data (punctuation removal, lowercase letters, redefining the labels as numbers).

In [4]:
author_dict = {'EAP': 0, 'HPL': 1, 'MWS': 2}

In [5]:
new_text = []
new_author = []
for i, row in train[['text','author']].iterrows():
  word_tokens = word_tokenize(row['text'])
  new_text.append((" ".join([w for w in word_tokens if not w in [',','.','?','!',':',';',"'",'"','-',"''",'``']]).lower())) # left the stopwords in, as it gave better results
  new_author.append(author_dict[row['author']])
new_train = pd.DataFrame(data={'text': new_text, 'author': new_author})

In [42]:
X = new_train['text']
y = new_train['author']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=random_state)

In [43]:
X_train_list = list(X_train)
y_train_list = list(y_train)
X_valid_list = list(X_valid)
y_valid_list = list(y_valid)

In [44]:
y_train_list = np.array(y_train_list)
y_valid_list = np.array(y_valid_list)

### Pre-trained fastText vectors

In [9]:
with open("/content/drive/MyDrive/Author_identification/fasttext_vectors.pickle", 'rb') as f:
    fasttext_vectors = pickle.load(f) #we had this downloaded, below is the code to get this file for the first time

In [10]:
#import gensim.downloader
#fasttext_vectors = gensim.downloader.load('fasttext-wiki-news-subwords-300') # roughly 1 GB file

#### Using the mean of the wordvectors in each entry to get a single vector

In [11]:
X_train_mean_vectors = np.empty([len(X_train_list), 300])
for j in range(len(X_train_list)):
  X_train_vectors = np.empty([len(X_train_list[j].split()), 300])
  for i in range(len(X_train_list[j].split())):
    try:
      X_train_vectors[i] = fasttext_vectors.wv[X_train_list[j].split()[i]]
    except KeyError:
      X_train_vectors[i] = np.zeros(300)
  X_train_mean_vectors[j] = np.mean(X_train_vectors, axis=0)

  


In [12]:
scaler = StandardScaler()
X_train_mean_vectors = scaler.fit_transform(X_train_mean_vectors)

In [13]:
X_valid_mean_vectors = np.empty([len(X_valid_list), 300])
for j in range(len(X_valid_list)):
  X_valid_vectors = np.empty([len(X_valid_list[j].split()), 300])
  for i in range(len(X_valid_list[j].split())):
    try:
      X_valid_vectors[i] = fasttext_vectors.wv[X_valid_list[j].split()[i]]
    except KeyError:
      X_valid_vectors[i] = np.zeros(300)
  X_valid_mean_vectors[j] = np.mean(X_valid_vectors, axis=0)

  


In [14]:
scaler = StandardScaler()
X_valid_mean_vectors = scaler.fit_transform(X_valid_mean_vectors)

### The model itself and training

In [45]:
from keras.utils import to_categorical
y_train_list = to_categorical(y_train_list) # one-hot encoding the labels
y_valid_list = to_categorical(y_valid_list)

In [46]:
dense_model = tf.keras.models.Sequential()
dense_model.add(tf.keras.Input(shape=(300,)))
dense_model.add(tf.keras.layers.Dense(300, activation='relu', kernel_initializer='he_normal'))
dense_model.add(Dropout(0.4))
dense_model.add(tf.keras.layers.Dense(300, activation='relu', kernel_initializer='he_normal'))
dense_model.add(tf.keras.layers.Dense(300, activation='relu', kernel_initializer='he_normal'))
dense_model.add(tf.keras.layers.Dense(3, activation='softmax'))
dense_model.output_shape

(None, 3)

In [47]:
from keras.callbacks import EarlyStopping
early_stopping=EarlyStopping(monitor="val_accuracy",patience=10, verbose=1)

In [48]:
dense_model.compile(optimizer=Adam(lr=1e-3),loss='categorical_crossentropy' ,metrics=['accuracy'])

In [49]:
result = dense_model.fit(X_train_mean_vectors, y_train_list,  validation_data = (X_valid_mean_vectors, y_valid_list),
                     callbacks=[early_stopping], epochs=75)

Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 00026: early stopping


In [50]:
best_val_acc = np.amax(result.history['val_accuracy']) 
print('legjobb val_accuracy:', best_val_acc)

legjobb val_accuracy: 0.7563840746879578


In [51]:
score = dense_model.evaluate(X_valid_mean_vectors, y_valid_list, verbose = 0)
print("Test score: ", score[0])
print("Test accuracy: ", score[1])

Test score:  0.8099644184112549
Test accuracy:  0.7512767910957336


The most important codes were already shown at this point, going forward there are a few methods we tried with this model but eventually proved to be less accurate or didn't work to the extent we would have liked it to.

### Creating the word embeddings using minmax vectors

In [22]:
X_train_min_vectors = np.empty([len(X_train_list), 300])
for j in range(len(X_train_list)):
  X_train_vectors = np.empty([len(X_train_list[j].split()), 300])
  for i in range(len(X_train_list[j].split())):
    try:
      X_train_vectors[i] = fasttext_vectors.wv[X_train_list[j].split()[i]]
    except KeyError:
      X_train_vectors[i] = np.zeros(300)
  X_train_min_vectors[j] = X_train_vectors.min(axis=0)

  


In [23]:
X_train_max_vectors = np.empty([len(X_train_list), 300])
for j in range(len(X_train_list)):
  X_train_vectors = np.empty([len(X_train_list[j].split()), 300])
  for i in range(len(X_train_list[j].split())):
    try:
      X_train_vectors[i] = fasttext_vectors.wv[X_train_list[j].split()[i]]
    except KeyError:
      X_train_vectors[i] = np.zeros(300)
  X_train_max_vectors[j] = X_train_vectors.max(axis=0)

  


In [24]:
X_train_minmax_vectors = np.concatenate((X_train_min_vectors, X_train_max_vectors), axis=1)

In [25]:
scaler = StandardScaler()
X_train_minmax_vectors = scaler.fit_transform(X_train_minmax_vectors)

On the validation set

In [27]:
X_valid_min_vectors = np.empty([len(X_valid_list), 300])
for j in range(len(X_valid_list)):
  X_valid_vectors = np.empty([len(X_valid_list[j].split()), 300])
  for i in range(len(X_valid_list[j].split())):
    try:
      X_valid_vectors[i] = fasttext_vectors.wv[X_valid_list[j].split()[i]]
    except KeyError:
      X_valid_vectors[i] = np.zeros(300)
  X_valid_min_vectors[j] = X_valid_vectors.min(axis=0)

  


In [29]:
X_valid_max_vectors = np.empty([len(X_valid_list), 300])
for j in range(len(X_valid_list)):
  X_valid_vectors = np.empty([len(X_valid_list[j].split()), 300])
  for i in range(len(X_valid_list[j].split())):
    try:
      X_valid_vectors[i] = fasttext_vectors.wv[X_valid_list[j].split()[i]]
    except KeyError:
      X_valid_vectors[i] = np.zeros(300)
  X_valid_max_vectors[j] = X_valid_vectors.min(axis=0)

  


In [30]:
X_valid_minmax_vectors = np.concatenate((X_valid_min_vectors, X_valid_max_vectors), axis=1)

In [31]:
scaler = StandardScaler()
X_valid_minmax_vectors = scaler.fit_transform(X_valid_minmax_vectors)

In [32]:
dense_model_mm = tf.keras.models.Sequential()
dense_model_mm.add(tf.keras.Input(shape=(600,)))
dense_model_mm.add(tf.keras.layers.Dense(300, activation='relu', kernel_initializer='he_normal'))
dense_model_mm.add(tf.keras.layers.Dense(300, activation='relu', kernel_initializer='he_normal'))
dense_model_mm.add(tf.keras.layers.Dense(300, activation='relu', kernel_initializer='he_normal'))
dense_model_mm.add(tf.keras.layers.Dense(3, activation='softmax'))
dense_model_mm.output_shape

(None, 3)

In [33]:
dense_model_mm.compile(optimizer=Adam(lr=1e-5),loss='categorical_crossentropy' ,metrics=['accuracy'])

In [34]:
dense_model_mm.fit(X_train_minmax_vectors, y_train_list,  validation_data = (X_valid_minmax_vectors, y_valid_list),
                     callbacks=[early_stopping], epochs=75)

Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 00026: early stopping


<tensorflow.python.keras.callbacks.History at 0x7fce49fbfac8>

In [35]:
score = dense_model_mm.evaluate(X_valid_minmax_vectors, y_valid_list, verbose = 0)
print("Test score: ", score[0])
print("Test accuracy: ", score[1])

Test score:  1.1520824432373047
Test accuracy:  0.48995572328567505


The results are way worse interestingly than the mean vector method, even though these two are often recommended as similar solutions for the same problems.

### Applying the model for prediction

Here the model is supposed to read in any sentence and give a prediction for probabilities of the author.

In [None]:
den_model = tf.keras.models.load_model('/content/drive/MyDrive/Author_identification/author_identification_dense_model.h5')

In [None]:
reverse_author_dict = {0: 'Edgar Allan Poe', 1: 'HP Lovecraft', 2: 'Mary Shelley'}

In the following function we first do the same data manipulation steps as before the actual training. Then predict an author from that input.

In [None]:
def who_wrote(sentence):
  word_tokens = word_tokenize(sentence)
  sentence_tok = ((" ".join([w for w in word_tokens if not w in [',','.','?','!',':',';',"'",'"','-',"''",'``']]).lower()))
  mean_vector = np.empty(300)
  ft_vectors = np.empty([len(sentence_tok.split()), 300])
  for i in range(len(sentence_tok.split())):
    try:
      ft_vectors[i] = fasttext_vectors.wv[sentence_tok.split()[i]]
    except KeyError:
      ft_vectors[i] = np.zeros(300)
  mean_vector = np.mean(ft_vectors, axis=0)
  mean_vector = mean_vector.reshape(1, -1)
  #scaler = StandardScaler()
  #mean_vector = scaler.fit_transform(mean_vector)
  result = den_model.predict(mean_vector)
  print(', '.join([reverse_author_dict[i] + ': ' + str(round(result[0][i] * 100,2)) + '%' for i in range(3)]))
  print('Result: ', reverse_author_dict[np.argmax(result)])

In [None]:
who_wrote('If a fire wanted fanning, it could readily be fanned with a newspaper.') #this is a sentence from Edgar Allen Poe

In [None]:
who_wrote('What though their hireling Greaser bands') #this is from HP Lovecraft

Here there was an error somewhere definitely since the model shouldn't predict pretty much the same percentages for every sentence and we couldn't find a fix for this before the deadline.

In [None]:
who_wrote('Cthulhu')