# LSTM classifier

In [1]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
  
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
  
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Concatenate, Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Flatten
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Convolution1D
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/congningni/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/congningni/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
import pandas as pd
import re
import gensim
import random
import string
import swifter

In [3]:
from sklearn.ensemble import StackingClassifier
import sklearn.metrics as mtc
from sklearn.model_selection import StratifiedShuffleSplit
from nltk.corpus import stopwords
import numpy as np

## Load Dataset

In [4]:
# Load the pre-split training and testing sets. The paths are relative, so the
# notebook must be run from the directory that contains ./Dataset.
# Later cells read the 'Label' and 'Content' columns from both frames and
# 'user_id' from the rows of df_test.
df_train = pd.read_csv('./Dataset/training_sep09.csv')
df_test = pd.read_csv('./Dataset/testing_sep09.csv')

In [5]:
# Normalise each post and turn it into a token list, identically for both splits:
#   1. collapse every run of non-alphanumeric characters into one space,
#   2. split on single spaces.
# Splitting on ' ' (rather than on arbitrary whitespace) keeps an empty-string
# token wherever the cleaned text starts or ends with a space.
for frame in (df_train, df_test):
    cleaned = frame.Content.swifter.apply(lambda x: re.sub('[^A-Za-z0-9]+', ' ', x))
    frame['Content_list'] = cleaned.str.split(' ')

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=53924.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=13631.0, style=ProgressStyle(descripti…




In [6]:
# Sanity check: show the training frame's columns after Content_list was added.
df_train.columns

Index(['Label', 'Content', 'Content_list'], dtype='object')

In [7]:
# Stack the training rows and the matching columns of the test rows into one
# corpus; the tokenizer vocabulary is fitted on this combined frame.
# DataFrame.append was removed in pandas 2.0 -- pd.concat is the supported
# equivalent and produces the same frame (original row indices are kept).
df_content_all = pd.concat([df_train, df_test[['Label', 'Content', 'Content_list']]])

In [8]:
# Hyper-parameters shared by the tokenizer and the model definition:
#   MAX_FEATURES  - num_words for the Tokenizer and input size of the Embedding
#   EMBED_SIZE    - output size of the Embedding layer
#   RNN_CELL_SIZE - units of each LSTM
#   MAX_LEN       - length every token sequence is padded/truncated to
MAX_FEATURES = 50000
EMBED_SIZE = 128
RNN_CELL_SIZE = 128
MAX_LEN = 200  

# The vocabulary is fitted on the combined train + test corpus (df_content_all),
# so words that only occur in the test split still receive an index.
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(df_content_all.Content_list)
list_tokenized_train = tokenizer.texts_to_sequences(df_train.Content_list)
list_tokenized_test = tokenizer.texts_to_sequences(df_test.Content_list)

# One-hot targets built independently for each split.
# NOTE(review): the one-hot column order presumably follows the sorted label
# strings and is assumed to match the 0/1/2 codes in label_posts -- confirm,
# and confirm both splits contain all three labels.
X_train = pad_sequences(list_tokenized_train, maxlen=MAX_LEN)
y_train = pd.get_dummies(df_train.Label).values

X_test = pad_sequences(list_tokenized_test, maxlen=MAX_LEN)
y_test = pd.get_dummies(df_test.Label).values

## Functions

In [10]:
class Attention(tf.keras.Model):
    """Additive attention that pools a sequence into a single context vector.

    Args:
        units: width of the two projection layers (W1, W2) that feed the
            scalar scoring layer V.

    call(features, hidden) returns ``(context_vector, attention_weights)``.
    Expected shapes: ``features`` is (batch_size, max_length, hidden_size)
    and ``hidden`` is (batch_size, hidden_size).
    """

    def __init__(self, units):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        # Give the state a time axis -> (batch_size, 1, hidden_size) so it
        # broadcasts against every time step of the projected features.
        query = tf.expand_dims(hidden, 1)

        # Pre-score tensor is (batch_size, max_length, units); V reduces the
        # last axis to 1, giving one score per time step.
        energy = tf.nn.tanh(self.W1(features) + self.W2(query))

        # Normalise over the time axis -> (batch_size, max_length, 1).
        attention_weights = tf.nn.softmax(self.V(energy), axis=1)

        # Weighted sum over time -> (batch_size, hidden_size).
        context_vector = tf.reduce_sum(attention_weights * features, axis=1)
        return context_vector, attention_weights

In [16]:
# Build the attention BiLSTM classifier. Re-running this cell creates new
# layers, i.e. it resets the model to a fresh, untrained state.

BATCH_SIZE = 128
EPOCHS = 5

sequence_input = Input(shape=(MAX_LEN,), dtype="int32")
embedded_sequences = Embedding(MAX_FEATURES, EMBED_SIZE)(sequence_input)

# First BiLSTM: emits the full output sequence for the second layer.
lstm = Bidirectional(LSTM(RNN_CELL_SIZE, return_sequences = True), name="bi_lstm_0")(embedded_sequences)

# Second BiLSTM: returns the output sequence plus the final hidden/cell state
# of the forward and backward directions.
(lstm, forward_h, forward_c, backward_h, backward_c) = Bidirectional(LSTM(RNN_CELL_SIZE, return_sequences=True, return_state=True), 
                                                                     name="bi_lstm_1")(lstm)

# The concatenated final hidden state is the query for attention over the
# BiLSTM output sequence. state_c is built but not connected to the output.
state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])
context_vector, attention_weights = Attention(10)(lstm, state_h)
dropout1 = Dropout(0.2)(context_vector)
dense1 = Dense(20, activation="relu")(dropout1)
dropout2 = Dropout(0.2)(dense1)
output = Dense(3, activation="softmax")(dropout2)

# Metrics reported during training.
# 'accuracy' uses CategoricalAccuracy (arg-max match against the one-hot
# target). BinaryAccuracy thresholds each of the three outputs separately and
# therefore overstates accuracy for a 3-class softmax.
# NOTE(review): tp/fp/tn/fn/precision/recall still threshold each output
# element-wise (Keras default threshold), so they are not per-class figures.
METRICS = [
    keras.metrics.TruePositives(name='tp'),
    keras.metrics.FalsePositives(name='fp'),
    keras.metrics.TrueNegatives(name='tn'),
    keras.metrics.FalseNegatives(name='fn'),
    keras.metrics.CategoricalAccuracy(name='accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    keras.metrics.AUC(name='auc'),
]

model = keras.Model(inputs=sequence_input, outputs=output)


In [17]:
# Print the layer-by-layer architecture and parameter counts of the model.
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 200, 128)     6400000     input_4[0][0]                    
__________________________________________________________________________________________________
bi_lstm_0 (Bidirectional)       (None, 200, 256)     263168      embedding_3[0][0]                
__________________________________________________________________________________________________
bi_lstm_1 (Bidirectional)       [(None, 200, 256), ( 394240      bi_lstm_0[0][0]                  
____________________________________________________________________________________________

In [18]:
# Compile with categorical cross-entropy (targets are one-hot) and Adam, then
# train for EPOCHS epochs in batches of BATCH_SIZE. validation_split=0.2 asks
# Keras to hold out 20% of the training arrays for validation.
# NOTE(review): no random seed is set in the visible cells, so repeated runs
# presumably give slightly different weights and scores -- confirm.
model.compile(loss='categorical_crossentropy',
      optimizer='adam',
      metrics=METRICS)
history = model.fit(X_train,y_train,
            batch_size=BATCH_SIZE,
            epochs=EPOCHS,
            validation_split=0.2)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fdee81d4c10>

In [20]:
# Score every test post. Despite the name, `logits` holds the outputs of the
# model's final softmax layer (one column per class), not raw logits.
logits = model.predict(X_test)
#print ('logits:',logits[:10])
# Score of class index 1 only; not referenced again in the visible cells.
probi = logits[:,1]

In [25]:
# Sanity check: one predicted class index per test row.
logits.argmax(axis=1).shape

(13631,)

In [26]:
# Attach the post-level predictions to the test frame:
#   lstm_pred         - arg-max class index
#   lstm_0 .. lstm_2  - per-class scores from the softmax output
# pd.concat(axis=1) aligns rows by index, so both prediction frames are given
# df_test's index explicitly; the positional row order of model.predict(X_test)
# is then kept even if df_test does not carry a default 0..n-1 index.
lstm_df1 = pd.DataFrame(logits.argmax(axis=1), columns=['lstm_pred'], index=df_test.index)
lstm_df2 = pd.DataFrame(logits, columns=['lstm_0','lstm_1','lstm_2'], index=df_test.index)
tradition_mp = pd.concat([df_test, lstm_df1, lstm_df2], axis=1)


In [31]:
# Persist the test rows together with the LSTM predictions (row index omitted).
tradition_mp.to_csv('./Dataset/lstm_mp.csv',index=False)

In [34]:
def performance_evalute(frame, columns, n_splits=10, test_size=0.9, random_state=11):
    """Score predictions on repeated stratified subsamples of ``frame``.

    Each split keeps ``test_size`` of the rows (stratified on ``frame['Tag']``)
    and computes metrics on that subsample, so the spread over splits gives a
    variability estimate for each metric.

    Args:
        frame: DataFrame with a ``'Tag'`` column holding the true class and
            the prediction columns named in ``columns``.
        columns: sequence whose first entry is the predicted-class column and
            whose remaining entries are the per-class score columns, in class
            order.
        n_splits: number of stratified shuffle splits (default 10).
        test_size: fraction of rows evaluated in each split (default 0.9).
        random_state: seed for the splitter (default 11).

    Returns:
        Five lists with one value per split: accuracy, weighted precision,
        weighted recall, weighted F1 and one-vs-one ROC AUC.
    """
    splitter = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size,
                                      random_state=random_state)
    pred_column, prob_columns = columns[0], list(columns[1:])
    accuracy, precision, recall, f1, auc = [], [], [], [], []

    # split() yields positional indices, so every selection goes through .iloc;
    # label-based lookup would only be correct for a default 0..n-1 index.
    for _, valid_index in splitter.split(frame, frame['Tag']):
        y_valid = frame['Tag'].iloc[valid_index]
        y_pred = frame[pred_column].iloc[valid_index]
        y_pred_prob = frame[prob_columns].iloc[valid_index].to_numpy()

        accuracy.append(mtc.accuracy_score(y_valid, y_pred))
        precision.append(mtc.precision_score(y_valid, y_pred, average='weighted'))
        recall.append(mtc.recall_score(y_valid, y_pred, average='weighted'))
        f1.append(mtc.f1_score(y_valid, y_pred, average='weighted'))
        auc.append(mtc.roc_auc_score(y_valid, y_pred_prob, multi_class='ovo'))

    return accuracy, precision, recall, f1, auc


def label_posts(character):
    """Map a relationship label string to its integer class code.

    'I am their child' -> 0, 'I am their partner or spouse' -> 1, 'Other' -> 2.
    Any other value prints 'error' and yields None.
    """
    known_labels = (
        'I am their child',
        'I am their partner or spouse',
        'Other',
    )
    # The position in the tuple is the class code.
    for code, label in enumerate(known_labels):
        if character == label:
            return code
    print ('error')

In [36]:
# Integer class code of every post's label; labels that label_posts does not
# recognise come back as None.
tradition_mp['Tag'] = tradition_mp['Label'].swifter.apply(label_posts)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=13631.0, style=ProgressStyle(descripti…




In [40]:
# Aggregate post-level predictions to one row per (user_id, Tag) pair.
lstm_prob_cols = ['lstm_0','lstm_1','lstm_2']

author_list = tradition_mp[['user_id','Tag']].drop_duplicates().reset_index(drop=True)

# Mean class score over all of an author's posts. groupby(...)[cols].mean() is
# used instead of groupby.apply(np.mean(...)): np.mean on a DataFrame defers to
# DataFrame.mean(axis=None), whose result can differ between pandas versions.
author_mean_probs = tradition_mp.groupby('user_id')[lstm_prob_cols].mean()
author_list = author_list.join(author_mean_probs, how='left', on='user_id')

# Author-level prediction = class with the highest mean score (vectorized,
# same arg-max rule as the post-level lstm_pred).
author_list['lstm_pred'] = author_list[lstm_prob_cols].to_numpy().argmax(axis=1)

HBox(children=(FloatProgress(value=0.0, description='Dask Apply', max=32.0, style=ProgressStyle(description_wi…




In [42]:
def performance_evalute_seperate(frame, columns, n_splits=10, test_size=0.9, random_state=11):
    """Score predictions on repeated stratified subsamples, overall and per class.

    Each split keeps ``test_size`` of the rows (stratified on ``frame['Tag']``)
    and computes metrics on that subsample.

    Args:
        frame: DataFrame with a ``'Tag'`` column holding the true class and
            the prediction columns named in ``columns``.
        columns: sequence whose first entry is the predicted-class column and
            whose remaining entries are the per-class score columns, in class
            order.
        n_splits: number of stratified shuffle splits (default 10).
        test_size: fraction of rows evaluated in each split (default 0.9).
        random_state: seed for the splitter (default 11).

    Returns:
        Eight lists with one entry per split: accuracy, weighted precision,
        weighted recall, weighted F1, one-vs-one ROC AUC, then the per-class
        precision, recall and F1 arrays (``average=None``).
    """
    splitter = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size,
                                      random_state=random_state)
    pred_column, prob_columns = columns[0], list(columns[1:])
    accuracy, precision, recall, f1, auc = [], [], [], [], []
    pre_all, rec_all, f1_all = [], [], []

    # split() yields positional indices, so every selection goes through .iloc;
    # label-based lookup would only be correct for a default 0..n-1 index.
    for _, valid_index in splitter.split(frame, frame['Tag']):
        y_valid = frame['Tag'].iloc[valid_index]
        y_pred = frame[pred_column].iloc[valid_index]
        y_pred_prob = frame[prob_columns].iloc[valid_index].to_numpy()

        # Aggregate (support-weighted) metrics.
        accuracy.append(mtc.accuracy_score(y_valid, y_pred))
        precision.append(mtc.precision_score(y_valid, y_pred, average='weighted'))
        recall.append(mtc.recall_score(y_valid, y_pred, average='weighted'))
        f1.append(mtc.f1_score(y_valid, y_pred, average='weighted'))
        auc.append(mtc.roc_auc_score(y_valid, y_pred_prob, multi_class='ovo'))

        # Per-class metrics: one array per split, one element per class.
        pre_all.append(mtc.precision_score(y_valid, y_pred, average=None))
        rec_all.append(mtc.recall_score(y_valid, y_pred, average=None))
        f1_all.append(mtc.f1_score(y_valid, y_pred, average=None))

    return accuracy, precision, recall, f1, auc, pre_all, rec_all, f1_all

In [37]:
# Post-level evaluation: mean (std) of each metric over the stratified splits.
accuracy, precision, recall, f1, auc = performance_evalute(tradition_mp,['lstm_pred','lstm_0','lstm_1','lstm_2'])
for line_format, split_scores in (
    ('Accuracy:\t%.3f (%.3f)', accuracy),
    ('Precision:\t%.3f (%.3f)', precision),
    ('Recall:\t\t%.3f (%.3f)', recall),
    ('F1:\t\t%.3f (%.3f)', f1),
    ('AUC:\t\t%.3f (%.3f)', auc),
):
    print(line_format % (np.mean(split_scores), np.std(split_scores)))

Accuracy:	0.738 (0.001)
Precision:	0.725 (0.001)
Recall:		0.738 (0.001)
F1:		0.728 (0.001)
AUC:		0.821 (0.001)


In [41]:
# Author-level evaluation: mean (std) of each metric over the stratified splits.
accuracy, precision, recall, f1, auc = performance_evalute(author_list,['lstm_pred','lstm_0','lstm_1','lstm_2'])
for line_format, split_scores in (
    ('Accuracy:\t%.3f (%.3f)', accuracy),
    ('Precision:\t%.3f (%.3f)', precision),
    ('Recall:\t\t%.3f (%.3f)', recall),
    ('F1:\t\t%.3f (%.3f)', f1),
    ('AUC:\t\t%.3f (%.3f)', auc),
):
    print(line_format % (np.mean(split_scores), np.std(split_scores)))

Accuracy:	0.811 (0.002)
Precision:	0.800 (0.002)
Recall:		0.811 (0.002)
F1:		0.795 (0.002)
AUC:		0.871 (0.002)


In [43]:
# Post-level, per-class precision / recall / F1: mean and std over the splits.
accuracy, precision, recall, f1, auc, pre_all, recal_all, f1_all = performance_evalute_seperate(tradition_mp,['lstm_pred','lstm_0','lstm_1','lstm_2'])
for heading, per_class_scores in (('Precision:', pre_all), ('Recall:', recal_all), ('f1:', f1_all)):
    stacked = np.stack(per_class_scores)  # rows = splits, columns = classes
    print(heading, stacked.mean(axis=0), stacked.std(axis=0))

Precision: [0.77324855 0.76052778 0.4664147 ] [0.00137166 0.00259033 0.00338791]
Recall: [0.86780909 0.67468023 0.3409734 ] [0.00142498 0.00271285 0.00219549]
f1: [0.817804   0.71503528 0.39394579] [0.00126341 0.00250939 0.00240061]


In [44]:
# Author-level, per-class precision / recall / F1: mean and std over the splits.
accuracy, precision, recall, f1, auc, pre_all, recal_all, f1_all = performance_evalute_seperate(author_list,['lstm_pred','lstm_0','lstm_1','lstm_2'])
for heading, per_class_scores in (('Precision:', pre_all), ('Recall:', recal_all), ('f1:', f1_all)):
    stacked = np.stack(per_class_scores)  # rows = splits, columns = classes
    print(heading, stacked.mean(axis=0), stacked.std(axis=0))

Precision: [0.82076829 0.83939715 0.67798855] [0.00134482 0.00366352 0.00882009]
Recall: [0.94038756 0.81563307 0.36948276] [0.00172855 0.00258721 0.00481834]
f1: [0.87651493 0.82733943 0.47829164] [0.00131222 0.00239613 0.0057238 ]
