# LSTM Text classification using Tensorflow 2.0 Alpha

# Importing Libraries

In [1]:
import os.path
import numpy as np
import pickle
import random

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model, Sequential
from tensorflow.compat.v1.keras.layers import CuDNNLSTM
from tensorflow.keras import layers

%matplotlib inline
import matplotlib.pyplot as plt

POS_TRAIN_PATH = 'data/twitter-datasets/train_pos_full.txt' 
NEG_TRAIN_PATH = 'data/twitter-datasets/train_neg_full.txt' 
DATA_TEST_PATH = 'data/twitter-datasets/test_data.txt'
OUTPUT_PATH = 'predictions_out.csv'
TRAINING_DATA_PATH_X = 'data/training_data.npy'
TRAINING_DATA_PATH_Y = 'data/data_y.npy'

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Environment checks

#### Check Tensorflow Version

In [2]:
print(tf.__version__)

2.0.0


### Import data

In [79]:
if os.path.isfile(TRAINING_DATA_PATH_X):
    train_data = np.load(TRAINING_DATA_PATH_X)
    train_y = np.load(TRAINING_DATA_PATH_Y)
else:
    embeddings = np.load('saved_gen_files/embeddings.npy')

    train_text_neg = open(NEG_TRAIN_PATH, 'r').readlines()
    train_text_pos = open(POS_TRAIN_PATH, 'r').readlines()
    # Construct the two arrays 
    train_text = np.array(train_text_neg + train_text_pos)
    train_y = np.concatenate([np.array([-1 for _ in range(len(train_text_neg))]), np.ones(len(train_text_pos))])

    with open('saved_gen_files/vocab.pkl', 'rb') as f:
        voc = pickle.load(f)

    def toAvgVec(t):

        _, K = embeddings.shape
        sum_vec = np.zeros((K))
        words = t.split()
        for word in words:
            index = voc.get(word)
            if index is not None:
                sum_vec += embeddings[index]

        return sum_vec/len(words)
    # Create numerical feature matrix of tweets
    train_data = np.zeros(len(train_text)*embeddings.shape[1]).reshape(len(train_text), 20)
    for i in range(len(train_text)):
        train_data[i] = [toAvgVec(train_text[i]), train_text[i]]
    
    np.save(TRAINING_DATA_PATH_X, train_data)
    np.save(TRAINING_DATA_PATH_Y, train_y)

indices = np.arange(train_data.shape[0])
random.shuffle(indices)

indices
X_train = train_data[indices[:2400000]]
y_train = train_y[indices[:2400000]]

X_test = train_data[2400000:]
y_test = train_y[2400000:]

ValueError: cannot copy sequence with size 2 to array axis with dimension 20

In [4]:
max_features = 200
num_features = X_test.shape[1] #Size of embedding features

print(X_train.shape)
print(X_test.shape)

(2400000, 20)
(100000, 20)


#### Normalizing classes in  [0, 1] instead of [-1, 1]

In [38]:
y_train[y_train == -1] = 0
y_test[y_test == -1] = 0

#### Use only a small fraction of dataset to test the architecture

In [5]:
ratio = 0.05
train_size = int(X_train.shape[0] * ratio)
test_size = int(X_test.shape[0] * ratio)
X_train = X_train[:train_size]
y_train = y_train[:train_size]
X_test = X_test[:test_size]
y_test = y_test[:test_size]

print(X_train.shape)
print(X_test.shape)

(120000, 20)
(5000, 20)


In [78]:
print(test.shape)

TypeError: only size-1 arrays can be converted to Python scalars

# Define LSTM layers and parameters
- Bidirectional LSTM using CUDA GPU processing
- No embeddings used currently
- Two fully connected layers with dropout layers for reducing chances of overfit

In [75]:
model1 = Sequential()
#model1.add(Embedding(input_dim=num_features, output_dim=max_features))
#model1.add(Bidirectional(LSTM(64, return_sequences=True)))
#model1.add(GlobalMaxPool1D())
#model1.add(Dropout(0.2))
#model1.add(Dense(64, activation='softmax'))
#model1.add(Dropout(0.2))
#model1.add(Dense(32, activation='sigmoid'))
#model1.add(Dropout(0.2))
model1.add(Bidirectional(LSTM(32, return_sequences=True), input_shape=(20, 2)))
model1.add(GlobalMaxPool1D())
model1.add(Dense(10, activation='relu'))
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model1.summary()

Model: "sequential_38"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_28 (Bidirectio (None, None, 64)          13568     
_________________________________________________________________
global_max_pooling1d_7 (Glob (None, 64)                0         
_________________________________________________________________
dense_29 (Dense)             (None, 10)                650       
Total params: 14,218
Trainable params: 14,218
Non-trainable params: 0
_________________________________________________________________


# Fit LSTM model 

In [76]:
#%time 
model1.fit(X_train, y_train, batch_size=512, epochs=2, validation_data=(X_test, y_test), verbose = 1)

ValueError: Error when checking input: expected bidirectional_28_input to have 3 dimensions, but got array with shape (120000, 20)

# Predict using trained model

In [35]:
pred_test_y = model1.predict([X_test], batch_size=1024, verbose=1)




# Calculate optimal probability threshold for classification
- Calculating best probability cut-off giving the highest F1 - Score

In [36]:
opt_prob = None
f1_max = 0

for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    f1 = metrics.f1_score(y_test, (pred_test_y > thresh).astype(int))
    print('F1 score at threshold {} is {}'.format(thresh, f1))
    
    if f1 > f1_max:
        f1_max = f1
        opt_prob = thresh
        
print('Optimal probabilty threshold is {} for maximum F1 score {}'.format(opt_prob, f1_max))

F1 score at threshold 0.1 is 0.5891262295756677
F1 score at threshold 0.11 is 0.5975954439991563
F1 score at threshold 0.12 is 0.6047695027814912
F1 score at threshold 0.13 is 0.61223950301802
F1 score at threshold 0.14 is 0.6191546762589928
F1 score at threshold 0.15 is 0.6253955152015408
F1 score at threshold 0.16 is 0.6300181843614491
F1 score at threshold 0.17 is 0.6342181490099245
F1 score at threshold 0.18 is 0.6384704519119352
F1 score at threshold 0.19 is 0.6415761804156636
F1 score at threshold 0.2 is 0.6435060397324548
F1 score at threshold 0.21 is 0.6468503139558435
F1 score at threshold 0.22 is 0.6490045784248162
F1 score at threshold 0.23 is 0.6509902283534514
F1 score at threshold 0.24 is 0.652383730179774
F1 score at threshold 0.25 is 0.6535217321227099
F1 score at threshold 0.26 is 0.6549115646258504
F1 score at threshold 0.27 is 0.655426997245179
F1 score at threshold 0.28 is 0.6556516885291653
F1 score at threshold 0.29 is 0.6546298392574145
F1 score at threshold 0.3 

# Submission

In [37]:
pred_submission_y = model1.predict([X_submission], batch_size=1024, verbose=1)
pred_submission_y = (pred_submission_y > opt_prob).astype(int)

df_submission = pd.DataFrame({'qid': df_test['qid'].values})
df_submission['prediction'] = pred_submission_y
#df_submission.to_csv("submission.csv", index=False)



# Loading embeddings

## Function to load embeddings from file

In [38]:
def load_embed(file):
    def get_coefs(word,*arr): 
        return word, np.asarray(arr, dtype='float32')
    
    if file == '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec':
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(file, encoding="utf8") if len(o)>100)
    else:
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(file, encoding='latin'))
        
    return embeddings_index

In [39]:
glove = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
paragram =  '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
wiki_news = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'

In [40]:
print("Extracting GloVe embedding")
embed_glove = load_embed(glove)
#print("Extracting Paragram embedding")
#embed_paragram = load_embed(paragram)
#print("Extracting FastText embedding")
#embed_fasttext = load_embed(wiki_news)

Extracting GloVe embedding


# Building Vocabulary and calculating coverage

In [41]:
def build_vocab(texts):
    sentences = texts.apply(lambda x: x.split()).values
    vocab = {}
    for sentence in sentences:
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

In [42]:
def check_coverage(vocab, embeddings_index):
    known_words = {}
    unknown_words = {}
    nb_known_words = 0
    nb_unknown_words = 0
    for word in vocab.keys():
        try:
            known_words[word] = embeddings_index[word]
            nb_known_words += vocab[word]
        except:
            unknown_words[word] = vocab[word]
            nb_unknown_words += vocab[word]
            pass

    print('Found embeddings for {:.2%} of vocab'.format(len(known_words) / len(vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(nb_known_words / (nb_known_words + nb_unknown_words)))
    unknown_words = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]

    return unknown_words

## Bulding dataset vocabulary

In [43]:
vocab = build_vocab(df['question_text'])

## Calculating coverage for each embedding

In [44]:
print("Glove : ")
oov_glove = check_coverage(vocab, embed_glove)
#print("Paragram : ")
#oov_paragram = check_coverage(vocab, embed_paragram)
#print("FastText : ")
#oov_fasttext = check_coverage(vocab, embed_fasttext)

Glove : 
Found embeddings for 33.02% of vocab
Found embeddings for  88.15% of all text


In [45]:
type(embed_glove)

dict

In [46]:
dict(list(embed_glove.items())[20:22])

{"'s": array([-6.8580e-02,  4.6470e-01,  1.3214e-01,  1.8599e-01, -3.7015e-02,
         3.2988e-01,  1.7865e-01, -2.5977e-01, -2.6022e-01,  2.5728e+00,
        -2.5867e-01, -6.6095e-01,  8.1984e-02,  1.0321e-02, -1.2223e-01,
         9.4609e-03, -8.8657e-02,  5.8367e-01, -1.7465e-02, -3.5569e-01,
        -1.0182e-01,  6.1941e-02, -1.4267e-01, -4.0544e-01,  2.9834e-01,
         1.0003e-01,  3.5899e-02,  2.2920e-01,  3.0278e-01, -1.8259e-01,
        -1.1042e-03,  2.5792e-01, -5.4132e-02,  1.5748e-01,  6.1311e-02,
        -3.0055e-01,  3.3732e-01,  4.0023e-01,  4.2472e-02, -3.0014e-01,
         6.2963e-02,  7.2134e-02,  6.0897e-02, -6.2527e-02,  2.7505e-01,
        -1.3527e-01, -2.1710e-01,  1.9315e-02,  3.8683e-02, -1.2361e-01,
        -7.7210e-02, -1.1320e-01, -9.3050e-02,  3.5217e-01,  1.9300e-01,
         4.8418e-02, -2.0489e-01,  9.6088e-02,  7.7817e-02, -3.7924e-01,
         1.1290e-01, -1.8285e-01, -5.6815e-02,  3.7091e-01,  3.2133e-01,
        -1.6343e-01, -3.0290e-01,  2.0258e-01

# Text pre-processing to improve coverage of embeddings

## Lower casing questions for uniform matching

In [47]:
df['processed_question'] = df['question_text'].apply(lambda x: x.lower())

In [48]:
vocab_low = build_vocab(df['processed_question'])

In [49]:
print("Glove : ")
oov_glove = check_coverage(vocab_low, embed_glove)
#print("Paragram : ")
#oov_paragram = check_coverage(vocab_low, embed_paragram)
#print("FastText : ")
#oov_fasttext = check_coverage(vocab_low, embed_fasttext)

Glove : 
Found embeddings for 27.38% of vocab
Found embeddings for  87.87% of all text


In [50]:
oov_glove[1:20]

[("what's", 13369),
 ('it?', 13158),
 ("i'm", 12814),
 ('do?', 8766),
 ('life?', 7791),
 ('why?', 7369),
 ('you?', 6314),
 ('me?', 6241),
 ('them?', 6141),
 ('time?', 5742),
 ('world?', 5525),
 ('people?', 5008),
 ('quora?', 4657),
 ('like?', 4490),
 ('for?', 4450),
 ('work?', 4219),
 ('2017?', 4050),
 ('mean?', 3980),
 ('2018?', 3594)]

## Adding lower case words to embeddings if missing

In [51]:
def add_lower(embedding, vocab):
    count = 0
    for word in vocab:
        if word in embedding and word.lower() not in embedding:  
            embedding[word.lower()] = embedding[word]
            count += 1
    print(f"Added {count} words to embedding")

In [52]:
print("Glove : ")
add_lower(embed_glove, vocab)
#print("Paragram : ")
#add_lower(embed_paragram, vocab)
#print("FastText : ")
#add_lower(embed_fasttext, vocab)

Glove : 
Added 14725 words to embedding


In [53]:
print("Glove : ")
oov_glove = check_coverage(vocab_low, embed_glove)
#print("Paragram : ")
#oov_paragram = check_coverage(vocab_low, embed_paragram)
#print("FastText : ")
#oov_fasttext = check_coverage(vocab_low, embed_fasttext)

Glove : 
Found embeddings for 30.64% of vocab
Found embeddings for  88.19% of all text


In [54]:
oov_glove[1:20]

[("what's", 13369),
 ('it?', 13158),
 ('do?', 8766),
 ('life?', 7791),
 ('why?', 7369),
 ('you?', 6314),
 ('me?', 6241),
 ('them?', 6141),
 ('time?', 5742),
 ('world?', 5525),
 ('people?', 5008),
 ('quora?', 4657),
 ('like?', 4490),
 ('for?', 4450),
 ('work?', 4219),
 ('2017?', 4050),
 ('mean?', 3980),
 ('2018?', 3594),
 ("isn't", 3509)]

## Removing special characters appropriately
- This ensures better a match to embeddings

In [55]:
punctuations = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'

In [56]:
def clean_text(x):

    x = str(x)
    for punct in "/-'":
        x = x.replace(punct, ' ')
    for punct in punctuations:
        x = x.replace(punct, '')
    return x

In [57]:
df["processed_question"] = df["processed_question"].progress_apply(lambda x: clean_text(x))

100%|██████████| 1306122/1306122 [00:14<00:00, 90639.58it/s]


In [58]:
vocab_low = build_vocab(df['processed_question'])

In [59]:
print("Glove : ")
oov_glove = check_coverage(vocab_low, embed_glove)
#print("Paragram : ")
#oov_paragram = check_coverage(vocab_low, embed_paragram)
#print("FastText : ")
#oov_fasttext = check_coverage(vocab_low, embed_fasttext)

Glove : 
Found embeddings for 65.90% of vocab
Found embeddings for  99.44% of all text


In [60]:
df['question_text'] = df['processed_question']

In [61]:
X_train, X_test, y_train, y_test, X_submission = data_prep(df)

Splitting dataframe with shape (1306122, 4) into training and test datasets
Filling missing values
Tokenizing 1306122 questions into words
Padding sequences for uniform dimensions
Completed data preparation, returning training, test and submission datasets, split as dependent(X) and independent(Y) variables


# Utilizing embeddings in LSTM classifier
- Following a similar model network structure as previous for comparable results

In [12]:
model1 = Sequential()
#model1.add(Embedding(max_features, embed_size, input_length=maxlen, weights = [embed_glove]))
model1.add(Embedding(max_features, embed_size, input_length=maxlen))
model1.add(Bidirectional(LSTM(128, return_sequences=True)))
model1.add(GlobalMaxPool1D())
model1.add(Dropout(0.2))
model1.add(Dense(64, activation='relu'))
model1.add(Dropout(0.2))
model1.add(Dense(32, activation='relu'))
model1.add(Dropout(0.2))
model1.add(Dense(1, activation='sigmoid'))
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model1.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 20)            400       
_________________________________________________________________
bidirectional_3 (Bidirection (None, 20, 256)           152576    
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 256)               0         
_________________________________________________________________
dropout_9 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 64)                16448     
_________________________________________________________________
dropout_10 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 32)               

# Fit LSTM model 

In [13]:
%time model1.fit(X_train, y_train, batch_size=512, epochs=5, validation_data=(X_test, y_test), verbose = 1)

Train on 2400000 samples, validate on 100000 samples
Epoch 1/5
    512/2400000 [..............................] - ETA: 7:57:52

InvalidArgumentError:  indices[301,1] = -1 is not in [0, 20)
	 [[node sequential_4/embedding_1/embedding_lookup (defined at /home/lucas/anaconda3/envs/tensorflow/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py:1751) ]] [Op:__inference_distributed_function_9597]

Function call stack:
distributed_function


# Predict using trained model

In [64]:
pred_test_y = model1.predict([X_test], batch_size=1024, verbose=1)




# Calculate optimal probability threshold for classification

In [65]:
opt_prob = None
f1_max = 0

for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    f1 = metrics.f1_score(y_test, (pred_test_y > thresh).astype(int))
    print('F1 score at threshold {} is {}'.format(thresh, f1))
    
    if f1 > f1_max:
        f1_max = f1
        opt_prob = thresh
        
print('Optimal probabilty threshold is {} for maximum F1 score {}'.format(opt_prob, f1_max))

F1 score at threshold 0.1 is 0.5871446772988884
F1 score at threshold 0.11 is 0.5923871718030147
F1 score at threshold 0.12 is 0.5976102857662078
F1 score at threshold 0.13 is 0.6009870393432036
F1 score at threshold 0.14 is 0.6051867802640202
F1 score at threshold 0.15 is 0.6078347780777793
F1 score at threshold 0.16 is 0.6100589730066645
F1 score at threshold 0.17 is 0.6129376394142179
F1 score at threshold 0.18 is 0.6157241649909265
F1 score at threshold 0.19 is 0.6175640200108969
F1 score at threshold 0.2 is 0.6203893699014064
F1 score at threshold 0.21 is 0.6222963412168991
F1 score at threshold 0.22 is 0.623558232111871
F1 score at threshold 0.23 is 0.6246069790216998
F1 score at threshold 0.24 is 0.6243496357960457
F1 score at threshold 0.25 is 0.6243692178301093
F1 score at threshold 0.26 is 0.6256432019521511
F1 score at threshold 0.27 is 0.6261102193686463
F1 score at threshold 0.28 is 0.6273006962811032
F1 score at threshold 0.29 is 0.6275726886638353
F1 score at threshold 0

# Submission

In [66]:
pred_submission_y = model1.predict([X_submission], batch_size=1024, verbose=1)
pred_submission_y = (pred_submission_y > opt_prob).astype(int)

df_submission = pd.DataFrame({'qid': df_test['qid'].values})
df_submission['prediction'] = pred_submission_y
df_submission.to_csv("submission.csv", index=False)



# Further Improvements:
- Optimizing LSTM hyperparameters
- Optimizing LSTM network structure (adding LSTM, dense, maxpooling etc. layers)
- Text processing to further improve embeddings coverage
- Using all 3 embeddings together/combining the weighted output of 3 LSTM models using each embedding