In [1]:
import pandas as pd
import numpy as np

#### Dataset: https://www.kaggle.com/competitions/nlp-getting-started/overview 

In [2]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


#### Checking for NaN values in each column

In [3]:
train_df.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [4]:
test_df.isna().sum()

id             0
keyword       26
location    1105
text           0
dtype: int64

In this notebook we will only use the `text` and `target` columns. Neither of these columns has any NA values.

#### Removing duplicates 

In [5]:
train_df = train_df[['text','target']]
train_df.shape

(7613, 2)

In [6]:
train_df = train_df.drop_duplicates()
train_df.shape

(7521, 2)

#### Analysis on Target column 

In [6]:
train_df['target'].unique()

array([1, 0])

#### Text cleaning

In [7]:
import re

In [8]:
train_df['text'][19]

'What a goooooooaaaaaal!!!!!!'

Steps to be taken to clean the text:
- Remove punctuation (`#`, `.`, `,`, `-`, `?`, `!`) from text containing it.
- Remove numerals.
- Convert text to lower case (done later by the tokenizer via `lower=True`).

In [9]:
def clean_data(text):
    """Clean a collection of raw tweets and return the cleaned strings.

    Every string goes through these steps, in this order:

    1. URLs starting with ``http://`` or ``https://`` are deleted.
    2. ``#hashtags``, ``@mentions`` and ``&entities`` are deleted (the marker
       together with the word characters that follow it).
    3. Each run of non-ASCII characters is replaced by one space.
    4. Digits are deleted.
    5. Hyphens, the listed punctuation characters, square brackets, newlines
       and apostrophes are each replaced by a space.
    6. Runs of spaces are collapsed into a single space.

    Letter case is left untouched and leading/trailing spaces are not stripped.

    Parameters
    ----------
    text : iterable of str
        The raw strings. The argument itself is never modified.

    Returns
    -------
    list of str
        A new list holding one cleaned string per input string, in input order.
    """
    # (pattern, replacement) pairs, applied top to bottom. The order matters:
    # URLs, hashtags, mentions and entities must be removed while their
    # punctuation is still intact, and spaces are collapsed only at the end.
    # Raw strings keep the regex escapes (\w, \(, \|, \[) out of Python's own
    # string-escape processing.
    substitutions = (
        # Remove urls
        (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ""),
        # Remove hashtags
        (r'#(\w+)', ""),
        # Remove @tags
        (r'@(\w+)', ""),
        # Remove &tags (the trailing ';' of an entity is handled by the
        # punctuation step further down)
        (r'&(\w+)', ""),
        # Replace non-ASCII characters
        (r'[^\x00-\x7F]+', ' '),
        # Remove numbers [0-9]
        (r'[0-9]', ""),
        # Split words separated with -
        (r'-', " "),
        # Replace punctuation with spaces
        (r'[#.,;?!)/^(}{:%+=$*\|~_]', ' '),
        # Replace square brackets
        (r'[\[\]]', ' '),
        # Replace newlines
        ('\n', ' '),
        # Replace apostrophes
        (r"[']", ' '),
        # Collapse repeated spaces
        (r' +', ' '),
    )

    cleaned = []
    for entry in text:
        for pattern, replacement in substitutions:
            entry = re.sub(pattern, replacement, entry)
        cleaned.append(entry)

    return cleaned

#### Data Preprocessing

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import re

In [11]:
# Shuffle the training set. A fixed random_state keeps the row order, and
# therefore the train/validation split made further down, identical on every run.

train_df = train_df.sample(frac=1, random_state=42)

In [12]:
# Pull labels and tweets out of the frame as plain Python lists,
# then run the tweets through the cleaning pipeline

data_y = train_df['target'].tolist()
data_x = clean_data(train_df['text'].tolist())

In [13]:
print(len(data_x))
print(len(data_y))

7613
7613


In [14]:
# First 80% of the samples go to training, the remaining 20% to validation

train_len = int(0.8 * len(data_x))

train_x, val_x = (np.array(part) for part in (data_x[:train_len], data_x[train_len:]))
train_y, val_y = (np.array(part) for part in (data_y[:train_len], data_y[train_len:]))

In [15]:
print("Length of training data ", len(train_x), len(train_y))
print("Length of validation data", len(val_x), len(val_y))

Length of training data  6090 6090
Length of validation data 1523 1523


In [16]:
# Vocabulary Size

voc_size = 30000

In [17]:
# Training a tokenizer on the training split only (validation text is not seen here).
# The active call passes filters="" so the tokenizer strips no characters itself;
# punctuation was already handled by clean_data. The commented-out call below keeps
# the explicit punctuation filter string for reference.

#tokenizer = Tokenizer(num_words=voc_size,filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',lower=True, split=' ')
tokenizer = Tokenizer(num_words=voc_size,filters="",lower=True, split=' ')
tokenizer.fit_on_texts(train_x)
# word -> integer index mapping; consumed later when building the embedding matrix
word_index = tokenizer.word_index

In [18]:
# Padding Sequences so that the length of each sequence is same
# Setting maxlen = None, so that sequences will be padded to the length of the longest individual sequence

train_seq = tokenizer.texts_to_sequences(train_x)
train_pad_seq = pad_sequences(train_seq, padding='post', maxlen=None)

MAX_SEQUENCE_LENGTH = train_pad_seq.shape[1]

print(train_pad_seq.shape)
print(MAX_SEQUENCE_LENGTH)

(6090, 33)
33


In [19]:
# Converting val data to sequences with the tokenizer fitted on the training split.
# NOTE(review): maxlen is the longest *training* sequence, so a longer validation
# sequence would presumably be truncated by pad_sequences; confirm this is intended.

val_seq = tokenizer.texts_to_sequences(val_x)
val_pad_seq = pad_sequences(val_seq, padding='post', maxlen=MAX_SEQUENCE_LENGTH)

print(val_pad_seq.shape)

(1523, 33)


In [20]:
print(len(tokenizer.word_index))

11351


In [21]:
voc_size = len(tokenizer.word_index)+1
print(voc_size)

11352


#### Use of Google Word2Vec Embeddings

In [22]:
from gensim.models import keyedvectors

In [23]:
from tensorflow.keras.layers import Embedding,Bidirectional, Input, LSTM, Dropout, concatenate
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPool2D, Lambda, Reshape, Conv1D, GlobalMaxPooling1D
from tensorflow.keras import Model, Sequential
from tensorflow.keras.utils import plot_model
import tensorflow as tf

In [24]:
filename = 'GoogleNews-vectors-negative300.bin'
google_embeddings_model = keyedvectors.load_word2vec_format(filename, binary=True)

In [25]:
# The index of vector for a token which is present in the vocabulary

rock_idx = google_embeddings_model.key_to_index["rock"]
rock_idx

2453

In [26]:
# The embedded vector for specific token which is present in the vocabulary

vector1 = google_embeddings_model.get_vector("goal",norm=True)
vector2 = google_embeddings_model.get_vector("goal")

print(vector1.shape)
print(vector2.shape)

(300,)
(300,)


In [27]:
print(min(vector1))
print(max(vector1))

-0.14448193
0.14587118


In [28]:
print(min(vector2))
print(max(vector2))

-0.40625
0.41015625


In [29]:
def get_weight_matrix(model, vocab, rng=None):
    """Build an embedding weight matrix for a word -> index vocabulary.

    Row ``i`` of the result holds the vector of the word that ``vocab`` maps
    to index ``i``. Rows that no word maps to are left all-zero; with a
    vocabulary whose indices start at 1 (assumed here, as for the tokenizer's
    ``word_index``) that is row 0, the value used for padding.

    Parameters
    ----------
    model : object
        Pre-trained embedding model exposing ``get_vector(word, norm=True)``,
        which must raise ``KeyError`` for out-of-vocabulary words. If it has a
        ``vector_size`` attribute that is used as the embedding dimension,
        otherwise 300 is assumed.
    vocab : dict
        Mapping of word -> integer index (largest index == ``len(vocab)``).
    rng : numpy.random.Generator, optional
        Source of randomness for out-of-vocabulary rows. When omitted the
        global ``np.random`` state is used, as before.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(vocab) + 1, embedding_dim)``.
    """
    embedding_dim = getattr(model, "vector_size", 300)

    # one row per vocabulary entry plus one extra row for index 0
    vocab_size = len(vocab) + 1
    weight_matrix = np.zeros((vocab_size, embedding_dim))

    draw_uniform = np.random.uniform if rng is None else rng.uniform

    for word, i in vocab.items():
        try:
            weight_matrix[i] = model.get_vector(word, norm=True)
        except KeyError:
            # Word unknown to the pre-trained model: fall back to a small
            # random vector. Only KeyError is caught so that real failures
            # (shape mismatches, interrupts, ...) are not silently hidden.
            weight_matrix[i] = draw_uniform(low=-0.1, high=0.1, size=embedding_dim)

    return weight_matrix

In [30]:
embedding_vectors = get_weight_matrix(google_embeddings_model,tokenizer.word_index)

In [32]:
embedding_vectors.shape

(11352, 300)

##### Model Architecture based on : https://arxiv.org/abs/1408.5882

In [34]:
EMBEDDING_DIM =300
filter_sizes=[3, 4, 5]
num_filters=[100, 100, 100]

In [37]:
def ConvNet(embeddings, max_sequence_length, voc_size, embedding_dim, num_classes, fine_tune,
            filter_sizes=(3, 4, 5), num_filters=(100, 100, 100), dropout_rate=0.5):
    """Build and compile a multi-kernel 1D CNN text classifier.

    The padded token ids are embedded, passed through one ``Conv1D`` +
    ``GlobalMaxPooling1D`` branch per kernel size, and the pooled branch
    outputs are concatenated, regularised with dropout and fed through a
    dense ReLU layer into a sigmoid output layer. The model is compiled with
    binary cross-entropy and Adam, and its summary is printed.

    Parameters
    ----------
    embeddings : array of shape (voc_size, embedding_dim)
        Initial weights of the embedding layer.
    max_sequence_length : int
        Length of the padded input sequences.
    voc_size : int
        Number of rows in the embedding table.
    embedding_dim : int
        Size of each embedding vector.
    num_classes : int
        Number of target classes; the output layer has ``num_classes - 1``
        sigmoid units.
    fine_tune : bool
        Whether the embedding weights are trainable.
    filter_sizes : sequence of int, optional
        Kernel size of each convolution branch.
    num_filters : sequence of int, optional
        Number of filters of each convolution branch, aligned with
        ``filter_sizes``.
    dropout_rate : float, optional
        Dropout rate applied to the concatenated pooled features.

    Returns
    -------
    The compiled Keras ``Model``.

    Raises
    ------
    ValueError
        If ``filter_sizes`` and ``num_filters`` differ in length.
    """
    if len(filter_sizes) != len(num_filters):
        raise ValueError("filter_sizes and num_filters must have the same length")

    embedding_layer = Embedding(voc_size,
                                embedding_dim,
                                weights=[embeddings],
                                input_length=max_sequence_length,
                                trainable=fine_tune)

    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    # One convolution branch per kernel size; each branch is max-pooled over
    # the sequence axis so it contributes a fixed-size feature vector.
    pooled_branches = []
    for kernel_size, n_filters in zip(filter_sizes, num_filters):
        l_conv = Conv1D(filters=n_filters, kernel_size=kernel_size, activation='relu')(embedded_sequences)
        pooled_branches.append(GlobalMaxPooling1D()(l_conv))

    l_merge = concatenate(pooled_branches, axis=1)
    x = Dropout(dropout_rate)(l_merge)
    # hidden layer as wide as the concatenated feature vector
    x = Dense(int(sum(num_filters)), activation='relu')(x)

    # NOTE(review): sigmoid units with binary cross-entropy fit the binary
    # case (num_classes == 2 -> a single unit); other values of num_classes
    # would need a different output layer/loss. Confirm before reusing.
    preds = Dense((num_classes-1), activation='sigmoid')(x)
    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['acc'])
    model.summary()

    return model

In [38]:
# Binary classifier (2 classes) on frozen pre-trained embeddings (fine_tune=False);
# the embedding size comes from the EMBEDDING_DIM constant instead of a repeated literal.
model = ConvNet(embedding_vectors, MAX_SEQUENCE_LENGTH, voc_size, EMBEDDING_DIM, 2, False)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 33)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 33, 300)      3405600     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 31, 100)      90100       embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 30, 100)      120100      embedding[0][0]                  
______________________________________________________________________________________________

2022-05-13 23:43:03.801854: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-05-13 23:43:03.829340: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [39]:
num_epochs = 10
batch_size = 32
# `workers` / `use_multiprocessing` only apply to generator or Sequence inputs,
# so they are not passed for these in-memory NumPy arrays.
hist = model.fit(x=train_pad_seq, y=train_y,
                 validation_data=(val_pad_seq, val_y),
                 epochs=num_epochs, batch_size=batch_size)

2022-05-13 23:43:12.482658: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
