**LSTM**

Set-Up

In [None]:
#Only do once
!git clone https://github.com/facebookresearch/fastText.git
!cd fastText
!sudo pip install fastText

In [None]:
#Importing modules
import warnings
# NOTE(review): this silences every warning category for the whole notebook,
# including deprecation warnings from pandas / TensorFlow.
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import nltk
from sklearn import metrics
nltk.download('stopwords') #Downloading stopwords
import os
import random
import re
import pickle
import tensorflow as tf
from datetime import datetime
# Keras building blocks, all taken from the TensorFlow-bundled Keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, LSTM, Bidirectional, Dense, TimeDistributed
from tensorflow.keras.layers import Embedding, Flatten
from tensorflow.keras.layers import MaxPooling1D, Dropout, Activation, Conv1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
# NOTE(review): this is the only import from the standalone `keras` package; every
# other Keras name above comes from `tensorflow.keras` - confirm both resolve to
# the same Keras installation.
from keras.utils import to_categorical

In [None]:
#Loading dataset
# Read the grouped empathy-level CSV from the mounted Drive folder
df = pd.read_csv("/content/drive/My Drive/Data/dataset_groupedempathylevel.csv")

# Index labels of all rows whose 'length' value is 3 or less; those rows are removed
short_row_labels = df.index[df['length'] <= 3]
df = df.drop(short_row_labels)

# Only the text, the class id and the two label columns are kept
columns_to_keep = ['text', 'classID', 'f_4', 'f_5']
df = df[columns_to_keep]
df

Data Preprocessing

In [None]:
# Text cleaning. Both patterns below are regular expressions, so regex=True is passed
# explicitly: since pandas 2.0 `Series.str.replace` treats the pattern as a literal
# string by default, which would leave digits and punctuation untouched.
df['text'] = df['text'].str.replace(r"[\d\.]+", "", regex=True).str.strip() #Removing digits and dots
df['text'] = df['text'].str.replace(r"[^\w\s]", "", regex=True).str.lower() #Removing punctuation, converting to lower case
german_stop_words = nltk.corpus.stopwords.words('german') #List of german stopwords
german_stop_set = set(german_stop_words) #Set gives constant-time membership checks per token
df['text'] = df['text'].apply(lambda x: ' '.join([item for item in x.split() if item not in german_stop_set])) #Removing stop words

# Converting categorical labels to numerical values
df["fn_4"] = df["f_4"].astype('category').cat.codes
df["fn_5"] = df["f_5"].astype('category').cat.codes

df

In [None]:
#Initializing parameters
# Working directory of the kernel; kept list-shaped so CURR_PATH[0] keeps working.
CURR_PATH = [os.getcwd()]
PATH_DATA = CURR_PATH[0]
# The Drive folder is an absolute path (the same one the save/load cells use), so it
# must not be prefixed with the working directory.
PATH_MODELS = "/content/drive/My Drive/Data/LSTM/saved models"
# Explicit separator: plain concatenation with "checkpoints/" would produce
# ".../saved modelscheckpoints/".
PATH_CHECKPOINTS = PATH_MODELS + "/checkpoints/"

MAX_FEATURES = 9358  # vocabulary size passed to the tokenizer
EMBED_DIM = 300      # dimensionality of the embedding vectors
MAXLEN = 302         # padded sequence length (recomputed in the tokenizing cell)

#Training
BATCH_SIZE = 8
EPOCHS = 3


Splitting the dataset

In [None]:
def _label_column(frame, column):
    """Return the given label column of `frame` as an (n, 1) numpy array."""
    return np.array(frame[column]).reshape((-1, 1))


# 90/10 shuffled split with a fixed seed
train, test = train_test_split(df, test_size=0.10, shuffle=True, random_state=1)

# Training part: texts plus both label columns
X_train = np.array(train["text"])
Y_train_f4 = _label_column(train, "fn_4")
Y_train_f5 = _label_column(train, "fn_5")

# Test part, same layout
X_test = np.array(test["text"])
Y_test_f4 = _label_column(test, "fn_4")
Y_test_f5 = _label_column(test, "fn_5")

for texts in (X_train, X_test):
    print(texts.shape)

Word Embeddings

In [None]:
#OneHotEncoding
# The labels are encoded from the split frames instead of from the Y_* arrays, so
# re-running this cell gives the same result (one-hot encoding an already one-hot
# array would add an extra axis).
# The class count comes from the full label column, so the train and test matrices
# always have the same number of columns, even when a class does not occur in the
# small test split.
NUM_CLASSES_F4 = int(df["fn_4"].max()) + 1
NUM_CLASSES_F5 = int(df["fn_5"].max()) + 1

Y_train_f4 = to_categorical(train["fn_4"].to_numpy(), num_classes=NUM_CLASSES_F4)
Y_test_f4 = to_categorical(test["fn_4"].to_numpy(), num_classes=NUM_CLASSES_F4)
Y_train_f5 = to_categorical(train["fn_5"].to_numpy(), num_classes=NUM_CLASSES_F5)
Y_test_f5 = to_categorical(test["fn_5"].to_numpy(), num_classes=NUM_CLASSES_F5)

In [None]:
#Text to list of indices representing words in dict
# The vocabulary is fitted on the training texts only
tokenizer = Tokenizer(num_words=MAX_FEATURES, lower=True, split=" ")
tokenizer.fit_on_texts(X_train)

# Map both text collections to lists of word indices
X_train_vec, X_test_vec = [
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
]

# Longest training sequence defines the common vector length
MAXLEN = max(map(len, X_train_vec))
print(f"Max vector length: {MAXLEN}")

# pad with zeros (appended at the end) for same vector length
X_train_vec, X_test_vec = [
    sequence.pad_sequences(seqs, maxlen=MAXLEN, padding="post")
    for seqs in (X_train_vec, X_test_vec)
]

Max vector length: 302


FastText

In [None]:
#Do only once
# KeyedVectors is used further down to read the fastText .vec file
from gensim.models import KeyedVectors


In [None]:
#Do only once
!wget "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.vec.gz"
!gzip -d cc.de.300.vec.gz


In [None]:
#Do only once
# Load Fasttext vector embeddings 
de_model = KeyedVectors.load_word2vec_format("cc.de.300.vec")
# use pickle to dump loaded model; the `with` block closes the file handle and
# flushes the data to disk
with open("/de_model.pkl", "wb") as pkl_file:
    pickle.dump(de_model, pkl_file)
# `de_model` is already in memory here, so it is not read back from the pickle


In [None]:
#Loading pickle model
# NOTE: unpickling can execute arbitrary code - only load a file that this notebook
# wrote itself.
with open("/de_model.pkl", "rb") as pkl_file:
    de_model = pickle.load(pkl_file)


Embedding Matrix


In [None]:
# Build the embedding matrix: row i holds the pretrained vector of the word with
# tokenizer index i; rows without a pretrained vector stay all-zero.
word_index = tokenizer.word_index
nb_words = 1 + min(MAX_FEATURES, len(word_index))
embedding_matrix = np.zeros((nb_words, EMBED_DIM))
words_not_found = []

for token, row in word_index.items():
    # Indices outside the matrix are ignored
    if row < nb_words:
        try:
            vector = de_model.get_vector(token)
        except KeyError:
            # Word is unknown to the pretrained model
            vector = None
        if vector is None or len(vector) == 0:
            words_not_found.append(token)
        else:
            embedding_matrix[row] = vector


# Model f_4 (emotional empathy)

In [None]:
# Define model architecture
from tensorflow.keras.layers import BatchNormalization

# Embedding (initialised from the pretrained matrix, trainable) -> LSTM over the
# whole sequence -> per-timestep Dense -> Flatten -> Dense -> 4-way softmax
model_f4 = Sequential(
    [
        Embedding(
            input_dim=nb_words,
            output_dim=EMBED_DIM,
            input_length=MAXLEN,
            weights=[embedding_matrix],
            trainable=True,
        ),
        LSTM(300, return_sequences=True, dropout=0.80),
        Dense(30, activation='tanh'),
        Flatten(),
        Dense(20, activation='relu'),
        Dense(4, activation='softmax'),
    ]
)

f4_optimizer = tf.keras.optimizers.Adam()
model_f4.compile(
    optimizer=f4_optimizer,
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model_f4.summary()

In [None]:
#Training f_4 Model

%%time
#Stop training when validation acc starts dropping and save checkpoint of model each period
now = datetime.now().strftime("%Y-%m-%d_%H%M")
#create callbacks
callbacks = [
             EarlyStopping(monitor="val_loss", verbose=1, patience=2),
             ModelCheckpoint(
                             PATH_CHECKPOINTS + now + "_Model_FT-Embed_{epoch:02d}_{val_loss:.4f}.h5",
                             monitor="val_loss",
                             save_best_only=True,
                             verbose=1,
                             ),
             ]

#Fitting the model
steps_per_epoch = int(np.floor((len(X_train_vec) / BATCH_SIZE)))
print(
      f"Model Params.\nbatch_size: {BATCH_SIZE}\nEpochs: {EPOCHS}\n"
      f"Step p. Epoch: {steps_per_epoch}\n"
      )

hist = model_f4.fit(
                    X_train_vec,
                    Y_train_f4,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    steps_per_epoch=steps_per_epoch,
                    callbacks=callbacks,
                    validation_data=(X_test_vec, Y_test_f4),
                    )


In [None]:
#Evaluation f_4
# Ground-truth class indices, recovered from the one-hot label matrices
f4_train_true = Y_train_f4.argmax(axis=1)
f4_test_true = Y_test_f4.argmax(axis=1)

# Training-set accuracy
pred = model_f4.predict(X_train_vec)
print('Accuracy of f_4 model on Training set')
print(accuracy_score(f4_train_true, pred.argmax(axis=1)))
print()

# Test-set predictions; predicted class = position of the highest softmax output
pred = model_f4.predict(X_test_vec)
f4_test_pred = pred.argmax(axis=1)

# Test-set metrics
print('Accuracy of f_4 model on Test set')
print(accuracy_score(f4_test_true, f4_test_pred))
print()
print('Confusion Matrix')
print(confusion_matrix(f4_test_true, f4_test_pred))
print()
print('Classification Report')
report = metrics.classification_report(f4_test_true, f4_test_pred)
print(report)


In [None]:
#Saving the model
# Writes model_f4 (the emotional-empathy model) to the Drive folder.
# NOTE(review): the target path has no file extension; which on-disk format this
# produces depends on the installed Keras version - confirm.
model_f4.save('/content/drive/My Drive/Data/LSTM/saved models/emotionalempathy')

# Model f_5 (cognitive empathy)

In [None]:
# Define model architecture

# Embedding (initialised from the pretrained matrix, trainable) -> LSTM over the
# whole sequence -> per-timestep Dense -> Flatten -> Dense -> 4-way softmax
model_f5 = Sequential(
    [
        Embedding(
            input_dim=nb_words,
            output_dim=EMBED_DIM,
            input_length=MAXLEN,
            weights=[embedding_matrix],
            trainable=True,
        ),
        LSTM(300, return_sequences=True, dropout=0.80),
        Dense(30, activation='tanh'),
        Flatten(),
        Dense(20, activation='relu'),
        Dense(4, activation='softmax'),
    ]
)

f5_optimizer = tf.keras.optimizers.Adam()
model_f5.compile(
    optimizer=f5_optimizer,
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model_f5.summary()

In [None]:
#Training f_5 Model

%%time
now = datetime.now().strftime("%Y-%m-%d_%H%M")
callbacks = [
    EarlyStopping(monitor="val_loss", verbose=1, patience=2),
    ModelCheckpoint(
        PATH_CHECKPOINTS + now + "_Model_FT-Embed_{epoch:02d}_{val_loss:.4f}.h5",
        monitor="val_loss",
        save_best_only=True,
        verbose=1,
    ),
]

#Fitting the model
steps_per_epoch = int(np.floor((len(X_train_vec) / BATCH_SIZE)))
print(
    f"Model Params.\nbatch_size: {BATCH_SIZE}\nEpochs: {EPOCHS}\n"
    f"Step p. Epoch: {steps_per_epoch}\n"
)

hist = model_f5.fit(
    X_train_vec,
    Y_train_f5,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    steps_per_epoch=steps_per_epoch,
    callbacks=callbacks,
    validation_data=(X_test_vec, Y_test_f5),
)


In [None]:
#Evaluation f_5
# Ground-truth class indices, recovered from the one-hot label matrices
f5_train_true = Y_train_f5.argmax(axis=1)
f5_test_true = Y_test_f5.argmax(axis=1)

# Training-set accuracy
pred = model_f5.predict(X_train_vec)
print('Accuracy of f_5 model on Training set')
print(accuracy_score(f5_train_true, pred.argmax(axis=1)))
print()

# Test-set predictions; predicted class = position of the highest softmax output
pred = model_f5.predict(X_test_vec)
f5_test_pred = pred.argmax(axis=1)

# Test-set metrics
print('Accuracy of f_5 model on Test set')
print(accuracy_score(f5_test_true, f5_test_pred))
print()
print('Confusion Matrix')
print(confusion_matrix(f5_test_true, f5_test_pred))
print()
print('Classification Report')
report = metrics.classification_report(f5_test_true, f5_test_pred)
print(report)


In [None]:
#Saving the model
# Writes model_f5 (the cognitive-empathy model) to the Drive folder.
# NOTE(review): the target path has no file extension; which on-disk format this
# produces depends on the installed Keras version - confirm.
model_f5.save('/content/drive/My Drive/Data/LSTM/saved models/cognitiveempathy')

# Loading models

In [None]:
#Loading f_4
# Restore the saved emotional-empathy model from Drive and show its layers
model_f4 = load_model('/content/drive/My Drive/Data/LSTM/saved models/emotionalempathy')
model_f4.summary()

#Predicting
pred = model_f4.predict(X_test_vec)

# Compare predicted class indices against the true ones on the test split
f4_expected = Y_test_f4.argmax(axis=1)
f4_predicted = pred.argmax(axis=1)
print('Accuracy of f_4 model on Test set')
print(accuracy_score(f4_expected, f4_predicted))

In [None]:
#Loading f_5
# Restore the saved cognitive-empathy model from Drive and show its layers
model_f5 = load_model('/content/drive/My Drive/Data/LSTM/saved models/cognitiveempathy')
model_f5.summary()

#Predicting
pred = model_f5.predict(X_test_vec)

# Compare predicted class indices against the true ones on the test split
f5_expected = Y_test_f5.argmax(axis=1)
f5_predicted = pred.argmax(axis=1)
print('Accuracy of f_5 model on Test set')
print(accuracy_score(f5_expected, f5_predicted))
