In [1]:
import pandas as pd
import numpy as np

#### Dataset: https://www.kaggle.com/competitions/nlp-getting-started/overview

In [2]:
# Read the competition's train and test CSV files from the working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


#### Checking for NaN values in each column

In [3]:
train_df.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [4]:
test_df.isna().sum()

id             0
keyword       26
location    1105
text           0
dtype: int64

 In this notebook we will only use the `text` and `target` columns. Neither of these columns has any NA values.

### Removing duplicates 

In [5]:
# Keep only the tweet text and its label. This rebinds train_df, so the other
# columns are no longer available from this name after the cell runs.
train_df = train_df[['text','target']]
train_df.shape

(7613, 2)

In [6]:
# Drop rows whose (text, target) pair is repeated. Rows that share the same
# text but carry a different target are NOT removed by this call.
train_df = train_df.drop_duplicates()
train_df.shape

(7521, 2)

#### Analysis on Target column

In [7]:
train_df['target'].unique()

array([1, 0])

#### Text cleaning

In [8]:
import re

In [9]:
train_df.loc[19, 'text']

'What a goooooooaaaaaal!!!!!!'

Steps To be taken to clean the text
- Remove punctuation such as #.,-?! from the text.
- Remove numerics
- convert text to lower case

In [10]:
def clean_data(text):
    """Strip tweet noise from every string in ``text``.

    Each string is passed through the substitutions in ``steps`` below, in
    order: URLs, ``#hashtags``, ``@mentions`` and ``&word`` tokens are deleted;
    runs of non-ASCII characters become one space; digits are deleted;
    hyphens, the listed punctuation, square brackets, newlines and apostrophes
    become a space; finally runs of spaces are collapsed to a single space.
    Leading/trailing spaces are not stripped and case is not changed.

    Args:
        text: Iterable of ``str`` (for example a list of tweets).

    Returns:
        list[str]: A new list with the cleaned strings, in the same order.
        The input is left unmodified.
    """
    # (pattern, replacement) pairs. Order matters: e.g. hashtags must be
    # removed as whole words before the lone '#' is treated as punctuation.
    # Raw strings keep the regex escapes intact without relying on Python's
    # lenient handling of unknown string escapes.
    steps = (
        # Remove urls
        (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ""),
        # Remove Hashtags
        (r'#(\w+)', ""),
        # Remove @tags
        (r'@(\w+)', ""),
        # Remove &tags
        (r'&(\w+)', ""),
        # Replace runs of non-ASCII characters with a space
        (r'[^\x00-\x7F]+', ' '),
        # Remove numbers [0-9]
        (r'[0-9]', ""),
        # Split words separated with -
        (r'-', " "),
        # Replace remaining punctuation with a space
        (r'[#.,;?!)/^(}{:%+=$*\|~_]', ' '),
        # Remove square brackets
        (r'[\[\]]', ' '),
        # Remove \n from text
        (r'\n', ' '),
        # Replace apostrophes with a space
        (r"[']", ' '),
        # Remove extra space
        (r' +', ' '),
    )
    compiled_steps = [(re.compile(pattern), repl) for pattern, repl in steps]

    cleaned = []
    for item in text:
        for pattern, repl in compiled_steps:
            item = pattern.sub(repl, item)
        cleaned.append(item)

    return cleaned

### Data Preprocessing

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import re

In [12]:
# shuffle data of training set
# A fixed random_state makes the shuffle (and therefore the train/validation
# split taken from it) identical on every Restart & Run All.

train_df = train_df.sample(frac=1, random_state=42)

In [13]:
# Clean training data

# Pull text and labels out of the frame as plain Python lists; both come from
# the same frame, so position i in data_x corresponds to position i in data_y.
data_x = train_df['text'].tolist()
data_y = train_df['target'].tolist()

# Replace the raw strings with the output of clean_data.
data_x = clean_data(data_x)

In [14]:
print(len(data_x))
print(len(data_y))

7521
7521


In [15]:
# Hold out the trailing 20% of the rows for validation; the leading 80% is
# used for training.

train_len = int(0.8 * len(data_x))

# Leading train_len items -> training arrays.
train_x, train_y = (np.array(seq[:train_len]) for seq in (data_x, data_y))

# Remaining items -> validation arrays.
val_x, val_y = (np.array(seq[train_len:]) for seq in (data_x, data_y))

In [16]:
print("Length of training data ", len(train_x), len(train_y))
print("Length of validation data", len(val_x), len(val_y))

Length of training data  6016 6016
Length of validation data 1505 1505


In [17]:
# Vocabulary Size
# Passed to the Tokenizer as num_words in the next cell.

voc_size = 30000

In [18]:
# Training a tokenizer

# filters="" turns off the Tokenizer's own character filtering; the texts were
# already passed through clean_data. lower=True lowercases the text and
# split=' ' tokenizes on single spaces.
tokenizer = Tokenizer(num_words=voc_size,filters="",lower=True, split=' ')
# Fit on the training texts only, so the vocabulary is not built from the
# validation rows.
tokenizer.fit_on_texts(train_x)
word_index = tokenizer.word_index

In [19]:
# Pad the sequences so that every sequence has the same length.
# maxlen=None pads every sequence to the length of the longest individual sequence.

train_seq = tokenizer.texts_to_sequences(train_x)
train_pad_seq = pad_sequences(train_seq, padding='post', maxlen=None)

# Width of the padded training matrix; reused for the validation data and as
# the Embedding layer's input_length.
MAX_SEQUENCE_LENGTH = train_pad_seq.shape[1]

print(train_pad_seq.shape)
print(MAX_SEQUENCE_LENGTH)

(6016, 33)
33


In [20]:
# Converting val data to sequences
# The tokenizer fitted on the training texts is reused, and the validation
# sequences are forced to the training width (MAX_SEQUENCE_LENGTH).
# NOTE(review): validation tweets longer than MAX_SEQUENCE_LENGTH will be cut;
# pad_sequences' `truncating` argument is left at its default here, which per
# the Keras docs is 'pre' (drops tokens from the front) — confirm this is intended.

val_seq = tokenizer.texts_to_sequences(val_x)
val_pad_seq = pad_sequences(val_seq, padding='post', maxlen=MAX_SEQUENCE_LENGTH)

print(val_pad_seq.shape)

(1505, 33)


In [21]:
print(len(tokenizer.word_index))

11386


In [22]:
# Rebind voc_size to the number of distinct words actually seen, plus one so
# the Embedding layers below also have a row for index 0.
# NOTE(review): index 0 is presumably the padding value written by
# pad_sequences (its documented default) — confirm.
voc_size = len(tokenizer.word_index)+1

### SIMPLE MODEL CREATION

In [23]:
from tensorflow.keras.layers import Embedding,Bidirectional, Input, LSTM, Dropout, concatenate
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPool2D, Lambda, Reshape
from tensorflow.keras import Model, Sequential
from tensorflow.keras.utils import plot_model
import tensorflow as tf

In [24]:
# Baseline: embedding learned from scratch -> LSTM -> sigmoid classifier.
EMBEDDING_DIM = 300

model = Sequential()
# Trainable embedding table with voc_size rows of EMBEDDING_DIM values each.
model.add(Embedding(input_dim = voc_size,output_dim = EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(LSTM(100))
# The LSTM output is already 2-D (see the (None, 100) shapes in the summary
# below), so this Flatten layer does not change the shape.
model.add(Flatten())
# Single sigmoid unit for the binary target.
model.add(Dense(1, activation= 'sigmoid'))

# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 33, 300)           3416100   
_________________________________________________________________
lstm (LSTM)                  (None, 100)               160400    
_________________________________________________________________
flatten (Flatten)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 3,576,601
Trainable params: 3,576,601
Non-trainable params: 0
_________________________________________________________________


2022-05-13 23:26:27.429630: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-05-13 23:26:27.431448: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [25]:
# fit the model
# workers / use_multiprocessing are documented by Keras as applying to
# generator or keras.utils.Sequence input only, so they are not passed for
# these in-memory NumPy arrays (newer Keras releases reject them outright).

history = model.fit(x=train_pad_seq, y=train_y, epochs=10, validation_data=(val_pad_seq,val_y))

2022-05-13 23:26:34.439572: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Use of Google Word2Vec Embeddings

In [26]:
from gensim.models import keyedvectors

In [27]:
# Load pretrained word2vec vectors from a local file in binary word2vec
# format (binary=True). The file must already be present next to the notebook.
filename = 'GoogleNews-vectors-negative300.bin'
google_embeddings_model = keyedvectors.load_word2vec_format(filename, binary=True)

In [28]:
# The index of vector for a token which is present in the vocabulary

rock_idx = google_embeddings_model.key_to_index["rock"]
rock_idx

2453

In [29]:
# The embedded vector for specific token which is present in the vocabulary

vector1 = google_embeddings_model.get_vector("goal",norm=True)
vector2 = google_embeddings_model.get_vector("goal")

print(vector1.shape)
print(vector2.shape)

(300,)
(300,)


In [32]:
def get_weight_matrix(model, vocab):
    """Build an embedding weight matrix for ``vocab`` from pretrained vectors.

    Args:
        model: Pretrained vector store exposing ``get_vector(word, norm=True)``
            that raises ``KeyError`` for unknown words (e.g. a gensim
            ``KeyedVectors``). Its ``vector_size`` attribute, when present,
            sets the width of the matrix; otherwise 300 is assumed.
        vocab: Mapping of word -> row index (e.g. ``tokenizer.word_index``).

    Returns:
        numpy.ndarray of shape ``(len(vocab) + 1, dim)``. Row ``i`` holds the
        pretrained vector of the word mapped to ``i``; words missing from
        ``model`` get a random vector drawn uniformly from [-0.1, 0.1).
        Rows that no word maps to stay all-zero.

    Raises:
        Any exception from ``model.get_vector`` other than ``KeyError`` (for
        example a shape mismatch) propagates instead of being replaced by a
        random vector.
    """
    # Take the width from the model instead of hard-coding it, so the same
    # function works for embeddings of any dimensionality.
    embedding_dim = getattr(model, "vector_size", 300)

    # One extra row so that index len(vocab) is addressable when the
    # vocabulary's indices start at 1.
    vocab_size = len(vocab) + 1
    weight_matrix = np.zeros((vocab_size, embedding_dim))

    for word, i in vocab.items():
        try:
            weight_matrix[i] = model.get_vector(word, norm=True)
        except KeyError:
            # Word is not in the pretrained vocabulary.
            weight_matrix[i] = np.random.uniform(low=-0.1, high=0.1, size=embedding_dim)

    return weight_matrix

In [33]:
embedding_vectors = get_weight_matrix(google_embeddings_model,tokenizer.word_index)

In [34]:
# Same architecture as the baseline, but the embedding table is initialised
# from the pretrained matrix in embedding_vectors and frozen.
EMBEDDING_DIM =300

model1 = Sequential()
# weights=[embedding_vectors] seeds the layer; trainable=False keeps those
# rows fixed during training (they show up as non-trainable params below).
model1.add(Embedding(input_dim = voc_size,output_dim = EMBEDDING_DIM, weights=[embedding_vectors], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
model1.add(LSTM(100))
# The LSTM output is already 2-D, so Flatten leaves the shape unchanged.
model1.add(Flatten())
model1.add(Dense(1, activation= 'sigmoid'))

# compile the model
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model summary
model1.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 33, 300)           3416100   
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               160400    
_________________________________________________________________
flatten_1 (Flatten)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 3,576,601
Trainable params: 160,501
Non-trainable params: 3,416,100
_________________________________________________________________


In [35]:
# fit the model
# workers / use_multiprocessing are documented by Keras as applying to
# generator or keras.utils.Sequence input only, so they are not passed for
# these in-memory NumPy arrays (newer Keras releases reject them outright).

history = model1.fit(x=train_pad_seq, y=train_y, epochs=20, validation_data=(val_pad_seq,val_y))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


### Use of Glove Embeddings 

In [36]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import keyedvectors

In [37]:
# Convert the GloVe file format to the Word2Vec file format. Once converted, the file can be loaded just like Word2Vec.
# The conversion runs unconditionally, so the output file is rewritten every
# time this cell is executed.
# NOTE(review): newer gensim releases document glove2word2vec as deprecated in
# favour of KeyedVectors.load_word2vec_format(..., binary=False, no_header=True)
# on the original GloVe file — confirm against the installed gensim version.

glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)

  glove2word2vec(glove_input_file, word2vec_output_file)


(400000, 100)

In [38]:
# Load Glove Embeddings
# Reads the converted file written by the previous cell; binary=False because
# it is loaded as a text-format file. Note that `filename` is rebound here and
# no longer points at the word2vec file loaded earlier.

filename = 'glove.6B.100d.txt.word2vec'
glove_embeddings_model = keyedvectors.load_word2vec_format(filename, binary=False)

In [39]:
# The embedded vector for specific token which is present in the vocabulary

vector1 = glove_embeddings_model.get_vector("goal")

print(vector1.shape)

(100,)


In [40]:
def get_weight_matrix(model, vocab):
    """Build an embedding weight matrix for ``vocab`` from pretrained vectors.

    Args:
        model: Pretrained vector store exposing ``get_vector(word, norm=True)``
            that raises ``KeyError`` for unknown words (e.g. a gensim
            ``KeyedVectors``). Its ``vector_size`` attribute, when present,
            sets the width of the matrix; otherwise 100 is assumed.
        vocab: Mapping of word -> row index (e.g. ``tokenizer.word_index``).

    Returns:
        numpy.ndarray of shape ``(len(vocab) + 1, dim)``. Row ``i`` holds the
        pretrained vector of the word mapped to ``i``; words missing from
        ``model`` get a random vector drawn uniformly from [-0.1, 0.1).
        Rows that no word maps to stay all-zero.

    Raises:
        Any exception from ``model.get_vector`` other than ``KeyError`` (for
        example a shape mismatch) propagates instead of being replaced by a
        random vector.
    """
    # Take the width from the model instead of hard-coding it, so the same
    # function works for embeddings of any dimensionality.
    embedding_dim = getattr(model, "vector_size", 100)

    # One extra row so that index len(vocab) is addressable when the
    # vocabulary's indices start at 1.
    vocab_size = len(vocab) + 1
    weight_matrix = np.zeros((vocab_size, embedding_dim))

    for word, i in vocab.items():
        try:
            weight_matrix[i] = model.get_vector(word, norm=True)
        except KeyError:
            # Word is not in the pretrained vocabulary.
            weight_matrix[i] = np.random.uniform(low=-0.1, high=0.1, size=embedding_dim)

    return weight_matrix

In [41]:
embedding_vectors = get_weight_matrix(glove_embeddings_model,tokenizer.word_index)

In [42]:
# Same architecture again, this time with a frozen 100-wide embedding table
# initialised from embedding_vectors. This rebinds EMBEDDING_DIM and model1,
# so the previously built model1 is no longer reachable by that name.
EMBEDDING_DIM =100

model1 = Sequential()
# weights=[embedding_vectors] seeds the layer; trainable=False keeps those
# rows fixed during training (they show up as non-trainable params below).
model1.add(Embedding(input_dim = voc_size,output_dim = EMBEDDING_DIM, weights=[embedding_vectors], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
model1.add(LSTM(100))
# The LSTM output is already 2-D, so Flatten leaves the shape unchanged.
model1.add(Flatten())
model1.add(Dense(1, activation= 'sigmoid'))

# compile the model
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model summary
model1.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 33, 100)           1138700   
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
flatten_2 (Flatten)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 1,219,201
Trainable params: 80,501
Non-trainable params: 1,138,700
_________________________________________________________________


In [43]:
# fit the model
# workers / use_multiprocessing are documented by Keras as applying to
# generator or keras.utils.Sequence input only, so they are not passed for
# these in-memory NumPy arrays (newer Keras releases reject them outright).

history = model1.fit(x=train_pad_seq, y=train_y, epochs=10, validation_data=(val_pad_seq,val_y))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
