# Hyperparameter optimalizations

Imports and Data Preparation

In [37]:
from numpy.random import seed
seed(1)

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from nltk.tokenize import sent_tokenize,word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import nltk
from nltk.corpus import stopwords

from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import tensorflow as tf
from tensorflow.keras.layers import LSTM
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam

from sklearn.preprocessing import StandardScaler

import pickle

nltk.download('punkt')
nltk.download('stopwords')
#np.random.seed(1)
random_state = 7

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [38]:
from google.colab import drive
drive.mount('/content/drive')
train = pd.read_csv('/content/drive/My Drive/Author_identification/train.csv')
test = pd.read_csv('/content/drive/My Drive/Author_identification/test.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [39]:
train['sentences'] = train.text.transform(lambda x: len(sent_tokenize(x)))
train['words'] = train.text.transform(lambda x: len(word_tokenize(x)))
train['text_length'] = train.text.transform(lambda x: len(x))

text_info = train.groupby("author")[['sentences','words','text_length']].sum()
text_info

Unnamed: 0_level_0,sentences,words,text_length
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
EAP,8206,232184,1123585
HPL,5876,173979,878178
MWS,6128,188824,916632


In [40]:
stop_words = set(stopwords.words('english')).union(set([',','.','?','!',':',';',"'",'"','-',"''","`","``"]))
author_dict = {'EAP': 0, 'HPL': 1, 'MWS': 2}

In [41]:
new_text = []
new_author = []
for i, row in train[['text','author']].iterrows():
    word_tokens = word_tokenize(row['text'])
    new_text.append(" ".join([w for w in word_tokens if not w in stop_words]))
    new_author.append(author_dict[row['author']])
new_train = pd.DataFrame(data={'text': new_text, 'author': new_author})

In [42]:
new_test_text = []
for i, row in test[['text']].iterrows():
    word_tokens = word_tokenize(row['text'])
    new_test_text.append(" ".join([w for w in word_tokens if not w in stop_words]))
new_test = pd.DataFrame(data={'text': new_text})

In [43]:
X = new_train['text']
y = new_train['author']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=random_state)
X_test = new_test['text']

In [44]:
new_text_ws = []
for i, row in train[['text']].iterrows():
    word_tokens = word_tokenize(row['text'])
    new_text_ws.append(" ".join([w for w in word_tokens if not w in [',','.','?','!',':',';',"'",'"','-',"''","`","``"]]).lower())
new_train_ws = pd.DataFrame(data={'text': new_text_ws, 'author': new_author})

In [45]:
X_ws = new_train_ws['text']
y_ws = new_train_ws['author']
X_ws_train, X_ws_valid, y_ws_train, y_ws_valid = train_test_split(X_ws, y_ws, test_size=0.3, random_state=random_state)

In [48]:
X_train_list = list(X_train)
y_train_list = list(y_train)
X_valid_list = list(X_valid)
y_valid_list = list(y_valid)

In [49]:
X_ws_train_list = list(X_ws_train)
y_ws_train_list = list(y_ws_train)
X_ws_valid_list = list(X_ws_valid)
y_ws_valid_list = list(y_ws_valid)

In [50]:
y_ws_train_list = np.array(y_ws_train_list)
y_ws_valid_list = np.array(y_ws_valid_list)

### Pre-trained fastText vectors

In [None]:
#import gensim.downloader

In [None]:
#fasttext_vectors = gensim.downloader.load('fasttext-wiki-news-subwords-300')

In [None]:
#with open('fasttext_vectors.pickle', 'wb') as f:
#    pickle.dump(fasttext_vectors, f)

In [56]:
with open("/content/drive/MyDrive/Author_identification/fasttext_vectors.pickle", 'rb') as f:
    fasttext_vectors = pickle.load(f)

In each entry we collect the wordvectors of the tokens and take the mean of those (300 dimensional vectors).

In [58]:
X_ws_train_ft_mean_vectors = np.empty([len(X_ws_train_list), 300])
for j in range(len(X_ws_train_list)):
    X_ws_train_ft_vectors = np.empty([len(X_ws_train_list[j].split()), 300])
    for i in range(len(X_ws_train_list[j].split())):
        try:
            X_ws_train_ft_vectors[i] = fasttext_vectors.wv[X_ws_train_list[j].split()[i]]
        except KeyError:
            X_ws_train_ft_vectors[i] = np.zeros(300)
    X_ws_train_ft_mean_vectors[j] = np.mean(X_ws_train_ft_vectors, axis=0)

  


In [60]:
scaler = StandardScaler()
X_ws_train_ft_mean_vectors = scaler.fit_transform(X_ws_train_ft_mean_vectors)

In [61]:
X_ws_valid_ft_mean_vectors = np.empty([len(X_ws_valid_list), 300])
for j in range(len(X_ws_valid_list)):
    X_ws_valid_ft_vectors = np.empty([len(X_ws_valid_list[j].split()), 300])
    for i in range(len(X_ws_valid_list[j].split())):
        try:
            X_ws_valid_ft_vectors[i] = fasttext_vectors.wv[X_ws_valid_list[j].split()[i]]
        except KeyError:
            X_ws_valid_ft_vectors[i] = np.zeros(300)
    X_ws_valid_ft_mean_vectors[j] = np.mean(X_ws_valid_ft_vectors, axis=0)

  


In [62]:
scaler = StandardScaler()
X_ws_valid_ft_mean_vectors = scaler.fit_transform(X_ws_valid_ft_mean_vectors)

In [64]:
from keras.utils import to_categorical
y_ws_train_list = to_categorical(y_ws_train_list)
y_ws_valid_list = to_categorical(y_ws_valid_list)

### Hyperparameter optimalization with fasttext vectors

In [66]:
import keras
from keras.datasets import fashion_mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.callbacks import EarlyStopping
import numpy as np

In [67]:
data = X_ws_train_ft_mean_vectors, y_ws_train_list, X_ws_valid_ft_mean_vectors, y_ws_valid_list

In [69]:
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.layers.recurrent import LSTM
from keras.layers.normalization import BatchNormalization

In [71]:
with open('dataset1.npy', 'wb') as f:
    np.save(f, X_ws_train_ft_mean_vectors)
    np.save(f, y_ws_train_list)
    np.save(f, X_ws_valid_ft_mean_vectors)
    np.save(f, y_ws_valid_list)

with open('dataset1.npy', 'rb') as f:
    X_train = np.load(f)
    y_train = np.load(f)
    X_valid = np.load(f)
    y_valid = np.load(f)

In [72]:
def data():
  with open('dataset1.npy', 'rb') as f:
    x_train = np.load(f)
    y_train = np.load(f)
    x_test = np.load(f)
    y_test = np.load(f)
  
  return x_train, y_train, x_test, y_test

In [73]:
def create_model(x_train, y_train, x_test, y_test):
    from keras.layers import Layer
    from keras import backend as K

    class Swish(Layer):
        def __init__(self, beta, **kwargs):
            super(Swish, self).__init__(**kwargs)
            self.beta = K.cast_to_floatx(beta)

        def call(self, inputs):
            return K.sigmoid(self.beta * inputs) * inputs

        def get_config(self):
            config = {'beta': float(self.beta)}
            base_config = super(Swish, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

        def compute_output_shape(self, input_shape):
            return input_shape
    # swish definiton end
    
    # our hyperarameters
    n_layer1 = {{choice([128, 256, 512])}}
    n_layer2 = {{choice([128, 256, 512])}}
    n_layer3 = {{choice([128, 256, 512])}}
    dropout_1 = {{uniform(0, 0.5)}}
    dropout_2 = {{uniform(0, 0.5)}}
    dropout_3 = {{uniform(0, 0.5)}}
    act = {{choice(['relu', 'leakyrelu', 'swish'])}}
    optim = {{choice(['rmsprop', 'adam', 'sgd'])}}
    n_batch = {{choice([64, 128, 256])}}
    print('a modell hiperparaméterei: ', n_layer1, n_layer2, n_layer3, dropout_1, dropout_2, dropout_3, act, optim, n_batch)
    
    if act == 'relu':
        activation = keras.layers.ReLU()
    elif act == 'leakyrelu':
        activation = keras.layers.LeakyReLU()
    elif act == 'swish':
        activation = Swish(beta=0.3)
    
    model = Sequential()
    model.add(Dense(n_layer1, input_dim=300))
    model.add(activation)
    model.add(Dropout(dropout_1))
    model.add(BatchNormalization())

    model.add(Dense(n_layer2))
    model.add(activation)
    model.add(Dropout(dropout_2))
    model.add(BatchNormalization())

    model.add(Dense(n_layer3))
    model.add(activation)
    model.add(Dropout(dropout_3))
    model.add(BatchNormalization())

    model.add(Dense(3, activation='softmax'))
    
    model.compile(optimizer=optim,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    callbacks = [EarlyStopping(monitor='val_accuracy', patience=3, verbose=0)]
    
    result = model.fit(x_train, y_train,
              batch_size=n_batch,
              epochs=100,
              verbose=2,
              validation_data=(x_test, y_test),
              callbacks=callbacks,
              shuffle=True)

    best_val_acc = np.amax(result.history['val_accuracy']) 
    print('legjobb val_acc:', best_val_acc)
    
    with open('hyperopt-log.csv', 'a') as csv_file:
      csv_file.write(str(n_layer1) + ';')
      csv_file.write(str(n_layer2) + ';')
      csv_file.write(str(n_layer3) + ';')
      csv_file.write(str(dropout_1) + ';')
      csv_file.write(str(dropout_2) + ';')
      csv_file.write(str(dropout_3) + ';')
      csv_file.write(str(act) + ';')
      csv_file.write(str(optim) + ';')
      csv_file.write(str(n_batch) + ';')
      csv_file.write(str(best_val_acc) + '\n')

    return {'loss': -best_val_acc, 'status': STATUS_OK, 'model': model}

In [74]:
with open('hyperopt-log.csv', 'w') as csv_file:
  csv_file.write('n_layer1' + ';')
  csv_file.write('n_layer2' + ';')
  csv_file.write('n_layer3' + ';')
  csv_file.write('dropout_1' + ';')
  csv_file.write('dropout_2' + ';')
  csv_file.write('dropout_3' + ';')
  csv_file.write('act' + ';')
  csv_file.write('optim' + ';')
  csv_file.write('n_batch' + ';')
  csv_file.write('best_val_acc' + '\n')

#Notebook download, and upload to colab

In [None]:
!pip3 install hyperas
!pip3 install hyperopt

In [76]:
import hyperas
from hyperopt import Trials, STATUS_OK, tpe
from hyperas import optim
from hyperas.distributions import choice, uniform

In [77]:
best_run, best_model = optim.minimize(model=create_model,
                                          data=data,
                                          algo=tpe.suggest,
                                          max_evals=100,
                                          notebook_name='Hyperparameter_optimalization',
                                          trials=Trials())

[1;30;43mA streamkimeneten csak az utolsó 5000 sor látható.[0m
Epoch 9/100
108/108 - 1s - loss: 0.5331 - accuracy: 0.7785 - val_loss: 0.6190 - val_accuracy: 0.7497

Epoch 10/100
108/108 - 1s - loss: 0.5159 - accuracy: 0.7888 - val_loss: 0.6218 - val_accuracy: 0.7450

Epoch 11/100
108/108 - 1s - loss: 0.4983 - accuracy: 0.7912 - val_loss: 0.6230 - val_accuracy: 0.7566

Epoch 12/100
108/108 - 1s - loss: 0.4833 - accuracy: 0.8028 - val_loss: 0.6177 - val_accuracy: 0.7530

Epoch 13/100
108/108 - 1s - loss: 0.4606 - accuracy: 0.8104 - val_loss: 0.6314 - val_accuracy: 0.7494

Epoch 14/100
108/108 - 1s - loss: 0.4492 - accuracy: 0.8176 - val_loss: 0.6199 - val_accuracy: 0.7630

Epoch 15/100
108/108 - 1s - loss: 0.4317 - accuracy: 0.8268 - val_loss: 0.6364 - val_accuracy: 0.7567

Epoch 16/100
108/108 - 1s - loss: 0.4141 - accuracy: 0.8323 - val_loss: 0.6502 - val_accuracy: 0.7549

Epoch 17/100
108/108 - 1s - loss: 0.3950 - accuracy: 0.8385 - val_loss: 0.6642 - val_accuracy: 0.7584

legjobb v

In [78]:
x_train, y_train, x_test, y_test = data()
print("best model evaluation:")
print(best_model.evaluate(x_test, y_test))
print("best hyperparameters:")
print(best_run)

legjobb modell kiértékelése:
[0.6009337902069092, 0.7640449404716492]
legjobb hiperparaméterek:
{'act': 0, 'dropout_1': 0.3593433530010136, 'dropout_1_1': 0.21880463535340797, 'dropout_1_2': 0.4911238980854753, 'n_batch': 1, 'n_layer1': 2, 'n_layer1_1': 0, 'n_layer1_2': 0, 'optim': 1}


In [79]:
best_model.save('hyperparam1_best_model.h5')

The best model of this hyperparamer optimalization giving an accuracy of 76.40%, which is our best result.

### Hyperparameter optimalization with GloVe
We can download the GloVe vectors:http://www-nlp.stanford.edu/data/glove.840B.300d.zip

In [80]:
from tqdm import tqdm

In [81]:
# load the GloVe vectors in a dictionary:

embeddings_index = {}
f = open('/content/drive/MyDrive/Author_identification/glove.840B.300d.txt')
for line in tqdm(f):
  values = line.split()
  word = values[0]
  # Catch the exception where there are strings in the Glove text file.
  try:
      coefs = np.asarray(values[1:], dtype='float32')
      embeddings_index[word] = coefs
  except ValueError:
      pass
f.close()

print('Found %s word vectors.' % len(embeddings_index))

2196017it [03:08, 11665.90it/s]


Found 2195884 word vectors.


In [82]:
# this function creates a normalized vector for the whole sentence
def sent2vec(s):
    words = str(s).lower()#.decode('utf-8')
    words = word_tokenize(words)
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    return v / np.sqrt((v ** 2).sum())

In [83]:
# create sentence vectors using the above function for training and validation set
xtrain_glove = [sent2vec(x) for x in tqdm(X_ws_train)]
xvalid_glove = [sent2vec(x) for x in tqdm(X_ws_valid)]

100%|██████████| 13705/13705 [00:02<00:00, 4590.93it/s]
100%|██████████| 5874/5874 [00:01<00:00, 4590.93it/s]


In [84]:
xtrain_glove = np.array(xtrain_glove)
xvalid_glove = np.array(xvalid_glove)

In [85]:
from sklearn.preprocessing import StandardScaler

In [86]:
# scale the data before any neural net:
scl = StandardScaler()
xtrain_glove_scl = scl.fit_transform(xtrain_glove)
xvalid_glove_scl = scl.transform(xvalid_glove)

In [87]:
with open('dataset2.npy', 'wb') as f:
    np.save(f, xtrain_glove_scl)
    np.save(f, y_ws_train_list)
    np.save(f, xvalid_glove_scl)
    np.save(f, y_ws_valid_list)

with open('dataset2.npy', 'rb') as f:
    X_train = np.load(f)
    y_train = np.load(f)
    X_valid = np.load(f)
    y_valid = np.load(f)

In [88]:
def data2():
  with open('dataset2.npy', 'rb') as f:
    x_train = np.load(f)
    y_train = np.load(f)
    x_test = np.load(f)
    y_test = np.load(f)
  
  return x_train, y_train, x_test, y_test

In [91]:
def create_model2(x_train, y_train, x_test, y_test):
    from keras.layers import Layer
    from keras import backend as K

    class Swish(Layer):
        def __init__(self, beta, **kwargs):
            super(Swish, self).__init__(**kwargs)
            self.beta = K.cast_to_floatx(beta)

        def call(self, inputs):
            return K.sigmoid(self.beta * inputs) * inputs

        def get_config(self):
            config = {'beta': float(self.beta)}
            base_config = super(Swish, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

        def compute_output_shape(self, input_shape):
            return input_shape
    # swish end
    
    # our hyperparameters
    n_layer1 = {{choice([128, 256, 512])}}
    n_layer2 = {{choice([128, 256, 512])}}
    n_layer3 = {{choice([128, 256, 512])}}
    dropout_1 = {{uniform(0, 0.5)}}
    dropout_2 = {{uniform(0, 0.5)}}
    dropout_3 = {{uniform(0, 0.5)}}
    act = {{choice(['relu', 'leakyrelu', 'swish'])}}
    optim = {{choice(['rmsprop', 'adam', 'sgd'])}}
    n_batch = {{choice([64, 128, 256])}}
    print('a modell hiperparaméterei: ', n_layer1, n_layer2, n_layer3, dropout_1, dropout_2, dropout_3, act, optim, n_batch)

    if act == 'relu':
        activation = keras.layers.ReLU()
    elif act == 'leakyrelu':
        activation = keras.layers.LeakyReLU()
    elif act == 'swish':
        activation = Swish(beta=0.3)
    
    model = Sequential()
    model.add(Dense(n_layer1, input_dim=300))
    model.add(activation)
    model.add(Dropout(dropout_1))
    model.add(BatchNormalization())

    model.add(Dense(n_layer2))
    model.add(activation)
    model.add(Dropout(dropout_2))
    model.add(BatchNormalization())

    model.add(Dense(n_layer3))
    model.add(activation)
    model.add(Dropout(dropout_3))
    model.add(BatchNormalization())

    model.add(Dense(3, activation='softmax'))
    
    model.compile(optimizer=optim,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    callbacks = [EarlyStopping(monitor='val_accuracy', patience=3, verbose=0)]
    
    result = model.fit(x_train, y_train,
              batch_size=n_batch,
              epochs=100,
              verbose=2,
              validation_data=(x_test, y_test),
              callbacks=callbacks,
              shuffle=True)

    best_val_acc = np.amax(result.history['val_accuracy']) 
    print('legjobb val_acc:', best_val_acc)

    with open('hyperopt-log.csv', 'a') as csv_file:
      csv_file.write(str(n_layer1) + ';')
      csv_file.write(str(n_layer2) + ';')
      csv_file.write(str(n_layer3) + ';')
      csv_file.write(str(dropout_1) + ';')
      csv_file.write(str(dropout_2) + ';')
      csv_file.write(str(dropout_3) + ';')
      csv_file.write(str(act) + ';')
      csv_file.write(str(optim) + ';')
      csv_file.write(str(n_batch) + ';')
      csv_file.write(str(best_val_acc) + '\n')

    return {'loss': -best_val_acc, 'status': STATUS_OK, 'model': model}

In [92]:
with open('hyperopt-log.csv', 'w') as csv_file:
  csv_file.write('n_layer1' + ';')
  csv_file.write('n_layer2' + ';')
  csv_file.write('n_layer3' + ';')
  csv_file.write('dropout_1' + ';')
  csv_file.write('dropout_2' + ';')
  csv_file.write('dropout_3' + ';')
  csv_file.write('act' + ';')
  csv_file.write('optim' + ';')
  csv_file.write('n_batch' + ';')
  csv_file.write('best_val_acc' + '\n')

# Notebook download and upload to colab

In [None]:
!pip3 install hyperas
!pip3 install hyperopt

In [94]:
import hyperas
from hyperopt import Trials, STATUS_OK, tpe
from hyperas import optim
from hyperas.distributions import choice, uniform

In [96]:
best_run, best_model = optim.minimize(model=create_model2,
                                          data=data2,
                                          algo=tpe.suggest,
                                          max_evals=100,
                                          notebook_name='Hyperparameter_optimalization',
                                          trials=Trials())

[1;30;43mA streamkimeneten csak az utolsó 5000 sor látható.[0m
128
0.4892838288723406
0.22720001177701898
0.032150814859929766
relu
adam
64
Epoch 1/100
215/215 - 1s - loss: 0.9904 - accuracy: 0.5668 - val_loss: 0.7423 - val_accuracy: 0.6869

Epoch 2/100
215/215 - 1s - loss: 0.7981 - accuracy: 0.6544 - val_loss: 0.6964 - val_accuracy: 0.7036

Epoch 3/100
215/215 - 1s - loss: 0.7247 - accuracy: 0.6920 - val_loss: 0.6693 - val_accuracy: 0.7193

Epoch 4/100
215/215 - 1s - loss: 0.6906 - accuracy: 0.7089 - val_loss: 0.6641 - val_accuracy: 0.7206

Epoch 5/100
215/215 - 1s - loss: 0.6692 - accuracy: 0.7184 - val_loss: 0.6454 - val_accuracy: 0.7276

Epoch 6/100
215/215 - 1s - loss: 0.6494 - accuracy: 0.7316 - val_loss: 0.6366 - val_accuracy: 0.7329

Epoch 7/100
215/215 - 1s - loss: 0.6249 - accuracy: 0.7380 - val_loss: 0.6287 - val_accuracy: 0.7377

Epoch 8/100
215/215 - 1s - loss: 0.6137 - accuracy: 0.7480 - val_loss: 0.6192 - val_accuracy: 0.7450

Epoch 9/100
215/215 - 1s - loss: 0.6011 - 

In [97]:
x_train, y_train, x_test, y_test = data2()
print("best model evaluation:")
print(best_model.evaluate(x_test, y_test))
print("best hyperparameters:")
print(best_run)

legjobb modell kiértékelése:
[0.6119480133056641, 0.7602996230125427]
legjobb hiperparaméterek:
{'act': 0, 'dropout_1': 0.4125126321857873, 'dropout_1_1': 0.2548977255951299, 'dropout_1_2': 0.47420624738623013, 'n_batch': 0, 'n_layer1': 2, 'n_layer1_1': 1, 'n_layer1_2': 0, 'optim': 1}


In [None]:
best_model.save('hyperparam2_best_model.h5')

The best model of this hyperparamer optimalization giving an accuracy of 76.03%. This model used the GloVe data.