In [1]:
#Copied from fastai notebook
%reload_ext autoreload
%autoreload 2
%matplotlib inline

#Importing data packages
import pandas as pd
import numpy as np
import mlflow
from mlflow import keras
import csv

import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, sent_tokenize 
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('stopwords')
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')

import numpy as np
import pandas as pd
import pickle
import tensorflow
from collections import defaultdict
import re
from bs4 import BeautifulSoup
import sys
import os

os.environ['KERAS_BACKEND']='tensorflow' 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, AveragePooling1D, SpatialDropout1D
from keras.models import Model
from keras.models import Sequential
from keras import layers
from keras.callbacks import CSVLogger, EarlyStopping
from keras.callbacks import ModelCheckpoint

In [2]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.utils.vis_utils import plot_model

## Training Models

In [4]:
def open_glove():
  """Load the GloVe text file through Spark and tokenise each row.

  Returns:
    A list with one entry per row of the first column of the loaded table;
    each entry is the list of whitespace-separated tokens of that row.
  """
  # `spark` is not created here; it has to exist as a global in the session.
  glove_frame = spark.read.text("/mnt/glove.6B.100d.txt").toPandas()
  first_column = glove_frame.iloc[:, 0]
  return [str(row).split() for row in first_column]

In [5]:
def create_embedding(text_values):
  """Build a word -> coefficient-vector lookup from tokenised embedding rows.

  Args:
    text_values: Iterable of token lists. In each row the first token is the
      word and the remaining tokens are its coefficients.

  Returns:
    dict mapping each word to a float32 numpy array of its coefficients.
    If a word occurs more than once, the last row wins.
  """
  embeddings_index = {}
  for row in text_values:
    if not row:
      # A blank line tokenises to an empty list; there is no word to index,
      # so skip it instead of failing on row[0].
      continue
    embeddings_index[row[0]] = np.asarray(row[1:], dtype='float32')

  print('Total %s word vectors in Glove 6B 100d.' % len(embeddings_index))
  return embeddings_index

In [6]:
def padsequence(vocab, sentences, maxwords, sequencelength):
  """Fit a Tokenizer on `vocab` and convert `sentences` to padded id sequences.

  Args:
    vocab: Texts the tokenizer is fitted on.
    sentences: Texts that are converted to integer sequences.
    maxwords: Passed to the Tokenizer as `num_words`.
    sequencelength: Passed to `pad_sequences` as `maxlen`.

  Returns:
    Tuple (tokenizer, sequences, word_index): the fitted tokenizer, the
    sequences padded with padding='post', and the tokenizer's word index.
  """
  text_tokenizer = Tokenizer(num_words=maxwords)
  text_tokenizer.fit_on_texts(vocab)
  padded = pad_sequences(
      text_tokenizer.texts_to_sequences(sentences),
      padding='post',
      maxlen=sequencelength,
  )
  return text_tokenizer, padded, text_tokenizer.word_index

In [7]:
def create_embedmatrix(word_index, embedding_index, embedding_dim):
  """Assemble the embedding weight matrix for a vocabulary.

  Args:
    word_index: Mapping word -> integer row index.
    embedding_index: Mapping word -> embedding vector.
    embedding_dim: Number of columns; each vector is cut to this length.

  Returns:
    Array of shape (len(word_index) + 1, embedding_dim). Rows of words
    without an entry in `embedding_index` remain all zeros.
  """
  n_rows = len(word_index) + 1
  matrix = np.zeros((n_rows, embedding_dim))
  for token, row in word_index.items():
    vector = embedding_index.get(token)
    if vector is None:
      # No pretrained vector for this word: keep the zero row.
      continue
    matrix[row] = vector[:embedding_dim]
  return matrix

In [8]:
def create_embedlayer(word_index, embedding_index, embedding_dim, max_sequence):
  """Create a trainable Embedding layer initialised from the embedding index.

  Also prints the share of matrix rows that contain at least one non-zero
  weight.

  Args:
    word_index: Mapping word -> integer row index.
    embedding_index: Mapping word -> embedding vector.
    embedding_dim: Width of the embedding vectors.
    max_sequence: Passed to the layer as `input_length`.

  Returns:
    The Embedding layer.
  """
  vocab_rows = len(word_index) + 1
  weights_matrix = create_embedmatrix(word_index, embedding_index, embedding_dim)

  layer = Embedding(
      vocab_rows,
      embedding_dim,
      weights=[weights_matrix],
      input_length=max_sequence,
      trainable=True,
  )

  # Inner count: non-zero entries per row; outer count: rows with any.
  rows_with_weights = np.count_nonzero(np.count_nonzero(weights_matrix, axis=1))
  coverage = round(rows_with_weights / vocab_rows * 100)
  print("Percentage of embedding matrix with weights: " + str(coverage) + "%")

  return layer

In [9]:
def splitdata(sequences, sequencelength, metadata, labels, val_split):
  """Shuffle the samples and split them into training and validation parts.

  Text sequences, one-hot labels and metadata are all reordered with the
  same random permutation, so row i of every returned training (or
  validation) piece refers to the same sample.

  Args:
    sequences: Array of padded sequences, one row per sample.
    sequencelength: Unused; kept so existing callers keep working.
    metadata: Per-sample metadata with one row per sample (a pandas object
      supporting `.iloc`, or anything `np.asarray` can turn into an array).
    labels: Class labels; converted to one-hot with `to_categorical`.
    val_split: Fraction of the samples placed in the validation part.

  Returns:
    Tuple (x_train, y_train, x_val, y_val, meta_train, meta_val).
  """
  data = sequences
  labels = to_categorical(np.asarray(labels))

  indices = np.arange(data.shape[0])
  np.random.shuffle(indices)
  data = data[indices]
  labels = labels[indices]
  # The metadata has to follow the same permutation; otherwise its rows no
  # longer belong to the text/label rows they are paired with.
  if hasattr(metadata, 'iloc'):
    metadata = metadata.iloc[indices]
  else:
    metadata = np.asarray(metadata)[indices]

  nb_validation_samples = int(val_split * data.shape[0])
  # Split at an explicit position: slicing with [:-0] would yield an empty
  # training set when the validation size rounds down to zero.
  split_at = data.shape[0] - nb_validation_samples

  x_train = data[:split_at]
  y_train = labels[:split_at]

  x_val = data[split_at:]
  y_val = labels[split_at:]

  meta_train = metadata[:split_at]
  meta_val = metadata[split_at:]

  print('Shape of Text Data Tensor:', data.shape)
  print('Shape of Metadata Tensor:', metadata.shape)
  print('Shape of Label Tensor:', labels.shape)

  return x_train, y_train, x_val, y_val, meta_train, meta_val

In [10]:
def choose_model(modeltype, embedding_layer, max_sequence):
  """Build the model and callbacks for the requested architecture.

  Args:
    modeltype: One of "ANN", "ANN_meta", "CNN", "CNN_meta", "LSTM",
      "LSTM_meta".
    embedding_layer: Embedding layer handed to the builder.
    max_sequence: Input sequence length handed to the builder.

  Returns:
    Tuple (model, cp, csv_logger, earlystop) as returned by the builder.

  Raises:
    ValueError: If `modeltype` is not one of the supported names.
  """
  # Only the selected builder name is looked up, so builders defined in
  # cells that have not been run yet do not matter for other model types.
  if modeltype == "ANN":
    builder = create_ANN
  elif modeltype == "ANN_meta":
    builder = create_ANN_meta
  elif modeltype == "CNN":
    builder = create_CNN
  elif modeltype == "CNN_meta":
    builder = create_CNN_meta
  elif modeltype == "LSTM":
    # The functional LSTM builder in this notebook is named create_LSTM.
    builder = create_LSTM
  elif modeltype == "LSTM_meta":
    builder = create_LSTM_meta
  else:
    # Previously this branch only printed and then failed on the return
    # statement with an unrelated UnboundLocalError.
    raise ValueError("No model found. Please change model type.")

  return builder(embedding_layer, max_sequence)

In [11]:
## ANN
def create_ANN(embedding_layer, max_sequence):
  """Assemble and compile a dense two-class classifier on top of an embedding.

  Args:
    embedding_layer: Layer applied to the integer token input.
    max_sequence: Length of each input sequence.

  Returns:
    Tuple (model, cp, csv_logger, earlystop): the compiled Model plus the
    ModelCheckpoint, CSVLogger and EarlyStopping callbacks.
  """
  tokens_in = Input(shape=(max_sequence,), dtype='int32')
  hidden = embedding_layer(tokens_in)
  hidden = Dense(128, activation='relu')(hidden)
  hidden = Flatten()(hidden)
  hidden = Dense(64, activation='relu')(hidden)
  class_probs = Dense(2, activation='softmax')(hidden)

  model = Model(inputs=tokens_in, outputs=class_probs)
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

  print("Simplified MLP")
  model.summary()

  # Callbacks: best-only checkpoint on val_acc, CSV log, early stop on val_loss.
  cp = ModelCheckpoint(
      'model_ann.hdf5',
      monitor='val_acc',
      verbose=1,
      save_best_only=True,
  )
  csv_logger = CSVLogger('training.log')
  earlystop = EarlyStopping(
      monitor='val_loss',
      min_delta=0,
      patience=0,
      verbose=0,
      mode='auto',
      baseline=None,
      restore_best_weights=False,
  )

  return model, cp, csv_logger, earlystop

In [12]:
def create_CNN(embedding_layer, max_sequence):
  """Assemble and compile a two-stage 1D convolutional two-class classifier.

  Args:
    embedding_layer: Layer applied to the integer token input.
    max_sequence: Length of each input sequence.

  Returns:
    Tuple (model, cp, csv_logger, earlystop): the compiled Model plus the
    ModelCheckpoint, CSVLogger and EarlyStopping callbacks.
  """
  tokens_in = Input(shape=(max_sequence,), dtype='int32')
  features = embedding_layer(tokens_in)
  features = Conv1D(128, 10, activation='relu')(features)
  features = MaxPooling1D(5)(features)
  features = Conv1D(64, 5, activation='relu')(features)
  features = MaxPooling1D(5)(features)

  features = Flatten()(features)
  features = Dense(128, activation='relu')(features)
  class_probs = Dense(2, activation='softmax')(features)

  model = Model(inputs=tokens_in, outputs=class_probs)
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

  print("Simplified convolutional neural network")
  model.summary()

  # NOTE(review): the checkpoint file is named 'model_lstm.hdf5' although this
  # builds the CNN - looks like a copy-paste; confirm nothing loads this path
  # before renaming it.
  cp = ModelCheckpoint(
      'model_lstm.hdf5',
      monitor='val_acc',
      verbose=1,
      save_best_only=True,
  )
  csv_logger = CSVLogger('training.log')
  earlystop = EarlyStopping(
      monitor='val_loss',
      min_delta=0,
      patience=0,
      verbose=0,
      mode='auto',
      baseline=None,
      restore_best_weights=False,
  )

  return model, cp, csv_logger, earlystop

In [13]:
#LSTM Functional
def create_LSTM(embedding_layer, max_sequence):
  """Assemble and compile a bidirectional-LSTM two-class classifier.

  Args:
    embedding_layer: Layer applied to the integer token input.
    max_sequence: Length of each input sequence.

  Returns:
    Tuple (model, cp, csv_logger, earlystop): the compiled Model plus the
    ModelCheckpoint, CSVLogger and EarlyStopping callbacks.
  """
  sequence_input = Input(shape=(max_sequence,), dtype='int32')
  hidden = embedding_layer(sequence_input)
  hidden = Bidirectional(LSTM(128, return_sequences=True, dropout=0.25, recurrent_dropout=0.1))(hidden)
  hidden = Flatten()(hidden)
  hidden = Dense(128, activation="relu")(hidden)
  hidden = Dropout(0.25)(hidden)
  preds = Dense(2, activation="softmax")(hidden)

  model = Model(inputs=sequence_input, outputs=preds)
  # The metric is named 'acc' (as in the other builders) so that the logged
  # validation key is 'val_acc', which is what the checkpoint below monitors.
  # With 'accuracy', Keras versions that log the metric under the given name
  # would emit 'val_accuracy' and the checkpoint would never find its monitor.
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
  model.summary()

  # Callbacks: best-only checkpoint on val_acc, CSV log, early stop on val_loss.
  cp = ModelCheckpoint('model_lstm.hdf5', monitor='val_acc', verbose=1, save_best_only=True)
  csv_logger = CSVLogger('training.log')
  earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=0, verbose=0, mode='auto', baseline=None, restore_best_weights=True)

  return model, cp, csv_logger, earlystop

In [14]:
from keras.callbacks import CSVLogger
from keras import losses
from keras.layers import concatenate

def create_LSTM_meta(embedding_layer, max_sequence):
  """Assemble and compile a bidirectional-LSTM classifier with a metadata input.

  The model takes two inputs, in this order: a 2-feature metadata vector
  (named 'meta_input') and the integer token sequence. Both branches are
  concatenated before the final dense layers.

  Args:
    embedding_layer: Layer applied to the integer token input.
    max_sequence: Length of each input sequence.

  Returns:
    Tuple (model, cp, csv_logger, earlystop): the compiled Model plus the
    ModelCheckpoint, CSVLogger and EarlyStopping callbacks.
  """
  meta_input = Input(shape=(2,), name='meta_input')
  meta_branch = Dense(32, activation='softmax')(meta_input)

  sequence_input = Input(shape=(max_sequence,), dtype='int32')
  text_branch = embedding_layer(sequence_input)
  text_branch = Bidirectional(LSTM(100, return_sequences=True, dropout=0.25, recurrent_dropout=0.1))(text_branch)
  text_branch = Flatten()(text_branch)
  text_branch = Dense(100, activation="relu")(text_branch)
  text_branch = Dropout(0.25)(text_branch)

  x = concatenate([text_branch, meta_branch])
  print(x)
  l_dense = Dense(50, activation='relu')(x)
  preds = Dense(2, activation='softmax')(l_dense)

  model = Model(inputs=[meta_input, sequence_input], outputs=preds)
  # The metric is named 'acc' (as in the other builders) so that the logged
  # validation key is 'val_acc', which is what the checkpoint below monitors.
  # With 'accuracy', Keras versions that log the metric under the given name
  # would emit 'val_accuracy' and the checkpoint would never find its monitor.
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
  model.summary()

  # Callbacks: best-only checkpoint on val_acc, CSV log, early stop on val_loss.
  cp = ModelCheckpoint('model_lstm.hdf5', monitor='val_acc', verbose=1, save_best_only=True)
  csv_logger = CSVLogger('training.log')
  earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=0, verbose=0, mode='auto', baseline=None, restore_best_weights=True)

  return model, cp, csv_logger, earlystop

In [15]:
def load_data(complete_table, train_table):
  """Load both tables as prepared dataframes and build the GloVe lookup.

  Args:
    complete_table: Table handed to `table_to_df` for the complete dataframe.
    train_table: Table handed to `table_to_df` for the training dataframe.

  Returns:
    Tuple (complete_df, df_sentences_pos, embedding_index).
  """
  # Each table goes through table_to_df and then prep_df.
  complete_df = prep_df(table_to_df(complete_table))
  df_sentences_pos = prep_df(table_to_df(train_table))

  # Tokenised GloVe rows -> word/vector dictionary.
  embedding_index = create_embedding(open_glove())

  return complete_df, df_sentences_pos, embedding_index

In [16]:
def make_model(modeltype, complete_df, df_sentences_pos, textcolumn_complete, textcolumn_train, labelcolumn_train, meta_train_start, meta_train_stop, max_sequence, max_words, embedding_dim, val_split, embedding_index, classweight_neg, classweight_pos, epoch, glove_emblayer):
  """Tokenise the training text, build the chosen model and train it.

  Args:
    modeltype: Model name understood by `choose_model`.
    complete_df: Dataframe whose text column supplies the corpus vocabulary.
    df_sentences_pos: Training dataframe (text, labels, metadata columns).
    textcolumn_complete: Positional index of the text column in `complete_df`.
    textcolumn_train: Positional index of the text column in `df_sentences_pos`.
    labelcolumn_train: Positional index of the label column in `df_sentences_pos`.
    meta_train_start: First positional metadata column (inclusive).
    meta_train_stop: Last positional metadata column (exclusive).
    max_sequence: Padded sequence length.
    max_words: `num_words` limit for the tokenizer.
    embedding_dim: Width of the embedding vectors.
    val_split: Fraction of samples used for validation.
    embedding_index: Mapping word -> embedding vector.
    classweight_neg: Class weight for class 0.
    classweight_pos: Class weight for class 1.
    epoch: Number of training epochs.
    glove_emblayer: If truthy, fit the tokenizer on the embedding-index words;
      otherwise fit it on the text of `complete_df`.

  Returns:
    Tuple (model, history, cp, 'training.log').
  """
  # The training text always comes from `textcolumn_train`; the GloVe branch
  # used to read a hard-coded column 2 and ignored that parameter.
  train_text = df_sentences_pos.iloc[:, textcolumn_train]
  if glove_emblayer:
    # Vocabulary based on glove embedding
    vocab = embedding_index
  else:
    # Vocabulary based on corpus vocabulary
    vocab = complete_df.iloc[:, textcolumn_complete]
  tokenizer, sequences, word_index = padsequence(vocab, train_text, max_words, max_sequence)

  # Make embedding layer
  embedding_layer = create_embedlayer(word_index, embedding_index, embedding_dim, max_sequence)

  # Split data in train and validation set (metadata is not fed to this model)
  x_train, y_train, x_val, y_val, meta_train, meta_val = splitdata(
      sequences,
      max_sequence,
      df_sentences_pos.iloc[:, meta_train_start:meta_train_stop],
      df_sentences_pos.iloc[:, labelcolumn_train],
      val_split,
  )

  # Create model and callbacks
  model, cp, csv_logger, earlystop = choose_model(modeltype, embedding_layer, max_sequence)

  # Define classweights
  class_weight = {0: classweight_neg,
                  1: classweight_pos}

  # Train model
  history = model.fit(
      x_train,
      y_train,
      validation_data=(x_val, y_val),
      epochs=epoch,
      batch_size=8,
      callbacks=[cp, csv_logger, earlystop],
      class_weight=class_weight,
  )

  return model, history, cp, 'training.log'

In [17]:
def make_model_meta(modeltype, complete_df, df_sentences_pos, textcolumn_complete, textcolumn_train, labelcolumn_train, meta_train_start, meta_train_stop, max_sequence, max_words, embedding_dim, val_split, embedding_index, classweight_neg, classweight_pos, epoch, glove_emblayer):
  """Tokenise the training text, build the chosen metadata model and train it.

  The model is fitted on two inputs in the order [metadata, text sequences].

  Args:
    modeltype: Model name understood by `choose_model`.
    complete_df: Dataframe whose text column supplies the corpus vocabulary.
    df_sentences_pos: Training dataframe (text, labels, metadata columns).
    textcolumn_complete: Positional index of the text column in `complete_df`.
    textcolumn_train: Positional index of the text column in `df_sentences_pos`.
    labelcolumn_train: Positional index of the label column in `df_sentences_pos`.
    meta_train_start: First positional metadata column (inclusive).
    meta_train_stop: Last positional metadata column (exclusive).
    max_sequence: Padded sequence length.
    max_words: `num_words` limit for the tokenizer.
    embedding_dim: Width of the embedding vectors.
    val_split: Fraction of samples used for validation.
    embedding_index: Mapping word -> embedding vector.
    classweight_neg: Class weight for class 0.
    classweight_pos: Class weight for class 1.
    epoch: Number of training epochs.
    glove_emblayer: If truthy, fit the tokenizer on the embedding-index words;
      otherwise fit it on the text of `complete_df`.

  Returns:
    Tuple (model, history, cp, 'training.log').
  """
  # The training text always comes from `textcolumn_train`; the GloVe branch
  # used to read a hard-coded column 2 and ignored that parameter.
  train_text = df_sentences_pos.iloc[:, textcolumn_train]
  if glove_emblayer:
    # Vocabulary based on glove embedding
    vocab = embedding_index
  else:
    # Vocabulary based on corpus vocabulary
    vocab = complete_df.iloc[:, textcolumn_complete]
  tokenizer, sequences, word_index = padsequence(vocab, train_text, max_words, max_sequence)

  # Make embedding layer
  embedding_layer = create_embedlayer(word_index, embedding_index, embedding_dim, max_sequence)

  # Split data in train and validation set
  x_train, y_train, x_val, y_val, meta_train, meta_val = splitdata(
      sequences,
      max_sequence,
      df_sentences_pos.iloc[:, meta_train_start:meta_train_stop],
      df_sentences_pos.iloc[:, labelcolumn_train],
      val_split,
  )
  # The metadata pieces are converted to plain arrays before being fed to fit().
  meta_train = np.array(meta_train)
  meta_val = np.array(meta_val)

  # Create model and callbacks
  model, cp, csv_logger, earlystop = choose_model(modeltype, embedding_layer, max_sequence)

  # Define classweights
  class_weight = {0: classweight_neg,
                  1: classweight_pos}

  # Train model
  history = model.fit(
      x=[meta_train, x_train],
      y=y_train,
      validation_data=([meta_val, x_val], y_val),
      epochs=epoch,
      batch_size=8,
      callbacks=[cp, csv_logger, earlystop],
      class_weight=class_weight,
  )

  return model, history, cp, 'training.log'

In [18]:
from keras.callbacks import CSVLogger
from keras import losses
from keras.layers import concatenate

def create_CNN_meta(embedding_layer, max_sequence):
  """Assemble and compile a 1D convolutional classifier with a metadata input.

  The model takes two inputs, in this order: a 2-feature metadata vector
  (named 'meta_input') and the integer token sequence. Both branches are
  concatenated before the final dense layers.

  Args:
    embedding_layer: Layer applied to the integer token input.
    max_sequence: Length of each input sequence.

  Returns:
    Tuple (model, cp, csv_logger, earlystop): the compiled Model plus the
    ModelCheckpoint, CSVLogger and EarlyStopping callbacks.
  """
  # Metadata branch
  meta_input = Input(shape=(2,), name='meta_input')
  meta_branch = Dense(32, activation='softmax')(meta_input)

  # Text branch
  tokens_in = Input(shape=(max_sequence,), dtype='int32')
  text_branch = embedding_layer(tokens_in)
  text_branch = Conv1D(128, 10, activation='relu')(text_branch)
  text_branch = MaxPooling1D(5)(text_branch)
  text_branch = Conv1D(64, 5, activation='relu')(text_branch)
  text_branch = MaxPooling1D(5)(text_branch)
  text_branch = Flatten()(text_branch)

  merged = concatenate([text_branch, meta_branch])
  print(merged)
  merged_dense = Dense(128, activation='relu')(merged)
  class_probs = Dense(2, activation='softmax')(merged_dense)

  model = Model(inputs=[meta_input, tokens_in], outputs=class_probs)
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

  print("Simplified convolutional neural network")
  model.summary()

  # NOTE(review): the checkpoint file is named 'model_lstm.hdf5' although this
  # builds the CNN - looks like a copy-paste; confirm nothing loads this path
  # before renaming it.
  cp = ModelCheckpoint(
      'model_lstm.hdf5',
      monitor='val_acc',
      verbose=1,
      save_best_only=True,
  )
  csv_logger = CSVLogger('training.log')
  earlystop = EarlyStopping(
      monitor='val_loss',
      min_delta=0,
      patience=0,
      verbose=0,
      mode='auto',
      baseline=None,
      restore_best_weights=True,
  )

  return model, cp, csv_logger, earlystop

In [19]:
## ANN
def create_ANN_meta(embedding_layer, max_sequence):
  """Assemble and compile a dense classifier with an extra metadata input.

  The model takes two inputs, in this order: a 2-feature metadata vector
  (named 'meta_input') and the integer token sequence. Both branches are
  concatenated before the final dense layers.

  Args:
    embedding_layer: Layer applied to the integer token input.
    max_sequence: Length of each input sequence.

  Returns:
    Tuple (model, cp, csv_logger, earlystop): the compiled Model plus the
    ModelCheckpoint, CSVLogger and EarlyStopping callbacks.
  """
  from keras.callbacks import CSVLogger
  from keras import losses
  from keras.layers import concatenate

  # Metadata branch
  meta_input = Input(shape=(2,), name='meta_input')
  meta_branch = Dense(32, activation='softmax')(meta_input)

  # Text branch
  tokens_in = Input(shape=(max_sequence,), dtype='int32')
  text_branch = embedding_layer(tokens_in)
  text_branch = Dense(128, activation='relu')(text_branch)
  text_branch = Flatten()(text_branch)
  text_branch = Dense(64, activation='relu')(text_branch)

  merged = concatenate([text_branch, meta_branch])
  print(merged)
  merged_dense = Dense(128, activation='relu')(merged)
  class_probs = Dense(2, activation='softmax')(merged_dense)

  model = Model(inputs=[meta_input, tokens_in], outputs=class_probs)
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

  print("Simplified MLP")
  model.summary()

  # Callbacks: best-only checkpoint on val_acc, CSV log, early stop on val_loss.
  cp = ModelCheckpoint(
      'model_ann.hdf5',
      monitor='val_acc',
      verbose=1,
      save_best_only=True,
  )
  csv_logger = CSVLogger('training.log')
  earlystop = EarlyStopping(
      monitor='val_loss',
      min_delta=0,
      patience=0,
      verbose=0,
      mode='auto',
      baseline=None,
      restore_best_weights=False,
  )

  return model, cp, csv_logger, earlystop