In [None]:
# Install the Kaggle API credentials; kaggle.json is copied from the current working directory.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the credentials file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json
# Download the files of the "nlp-getting-started" competition.
!kaggle competitions download -c nlp-getting-started

Downloading test.csv to /content
  0% 0.00/411k [00:00<?, ?B/s]
100% 411k/411k [00:00<00:00, 60.7MB/s]
Downloading sample_submission.csv to /content
  0% 0.00/22.2k [00:00<?, ?B/s]
100% 22.2k/22.2k [00:00<00:00, 22.7MB/s]
Downloading train.csv to /content
  0% 0.00/965k [00:00<?, ?B/s]
100% 965k/965k [00:00<00:00, 58.8MB/s]


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers





  import pandas.util.testing as tm


In [None]:
# Settings shared by the preprocessing and model cells below.
embedding_dim = 100                  # width of each embedding vector (matches the 100d GloVe file used below)
max_length = 16                      # every sequence is padded/truncated to this many tokens
trunc_type = padding_type = 'post'   # truncate and pad at the end of each sequence
oov_tok = "<OOV>"                    # token substituted for out-of-vocabulary words
training_size = 7613                 # upper bound of the slice used for the train/validation split
test_portion = .1                    # fraction of training_size held out for validation

corpus = []                          # not referenced by the visible cells



In [None]:
# Load the competition training set (train.csv, fetched by the Kaggle download above).
df_train = pd.read_csv('train.csv')

# **Making Training dataset Approach 1**
Taking only the text column to make the dataset.

This approach performs better.


In [None]:
# Approach 1: the raw tweet text is the only model input.
# NOTE(review): the Approach 2 cell below reassigns sentence_tr, so this value
# is only used when that cell is skipped.
sentence_tr = df_train['text']
label_tr = df_train['target']

# **Making Training dataset Approach 2**

Taking only the text, location and keyword columns to make the dataset.

In [None]:
# Replace missing keyword/location values with placeholder tokens so that every
# row contributes a complete "keyword location text" string.
df_train['keyword'] = df_train['keyword'].fillna('NoKey')
df_train['location'] = df_train['location'].fillna('NoLoc')

sen = df_train.text
loc = df_train.location
keywd = df_train.keyword
label_tr = df_train.target

# One combined string per row, in row order: "<keyword> <location> <text>".
sentence_tr = [
    '{} {} {}'.format(kw, place, tweet)
    for kw, place, tweet in zip(keywd, loc, sen)
]

In [None]:
# Build the vocabulary from the training sentences.
# NOTE(review): the filter string contains '"' twice (once inside `{"}`) and no '|';
# presumably `{|}` was intended — confirm before changing, since it alters the vocabulary.
tokenizer_options = dict(
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{"}~\t\n',
    oov_token=oov_tok,
    lower=True,
    split=" ",
)
tokenizer = Tokenizer(**tokenizer_options)
tokenizer.fit_on_texts(sentence_tr)

# word_index maps each word to its integer id; its size is the vocabulary size.
word_index = tokenizer.word_index
vocab_size = len(word_index)
print("total number of words: {}".format(vocab_size))

total number of words: 14037


In [None]:
 def standardize_text(df, text_field):
    """Clean the string column ``text_field`` of ``df`` in place and return ``df``.

    The cleaning steps are applied in this order:
      1. drop URLs (``http`` followed by non-space characters) and any
         remaining bare ``http``;
      2. drop ``@mentions``;
      3. replace every character outside ``A-Za-z0-9(),!?@'`"_`` and newline
         with a space;
      4. spell out a remaining ``@`` as ``at``;
      5. drop every word that contains a digit;
      6. drop apostrophes;
      7. lower-case the result.

    Args:
        df: DataFrame holding the column to clean; it is modified in place.
        text_field: name of a string column of ``df``.

    Returns:
        The same ``df`` object, with ``df[text_field]`` replaced by the
        cleaned text. Missing values are left as missing.
    """
    # (pattern, replacement) pairs, applied top to bottom.
    cleaning_steps = (
        (r"http\S+", ""),
        (r"http", ""),
        (r"@\S+", ""),
        (r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " "),
        (r"@", "at"),
        (r"\w*\d+\w*", ""),
        ("'", ""),
    )

    cleaned = df[text_field]
    for pattern, replacement in cleaning_steps:
        # regex=True must be explicit: since pandas 2.0 Series.str.replace
        # treats the pattern as a literal string by default, which would
        # silently turn the regex steps into no-ops.
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)

    df[text_field] = cleaned.str.lower()
    return df

# Run the same cleaning routine over every free-text column.
# NOTE(review): when the notebook is run top to bottom, sentence_tr was already
# assembled and the tokenizer already fitted in earlier cells, so this cleaning
# does not feed into them as written.
for column_name in ('text', 'location', 'keyword'):
    df_train = standardize_text(df_train, column_name)

In [None]:
# Display the keys of the tokenizer's word_counts in sorted order
# (presumably one key per distinct word, so the output is long).
sorted(tokenizer.word_counts)



In [None]:
# Encode each sentence as a fixed-length sequence of word ids.
sequences = tokenizer.texts_to_sequences(sentence_tr)
padded = pad_sequences(
    sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# The first `split` rows are held out for validation; the rows from `split`
# up to `training_size` are used for training.
split = int(test_portion * training_size)
held_out = slice(0, split)
kept = slice(split, training_size)

test_sequences, training_sequences = padded[held_out], padded[kept]
test_labels, training_labels = label_tr[held_out], label_tr[kept]

In [None]:
# Note this is the 100 dimension version of GloVe from Stanford
# I unzipped and hosted it on my site to make this notebook easier
# *********************************
# "word_index" is a dict that maps each WORD (string) to its integer id
# "embeddings_matrix" is a 2-D NumPy array whose row i holds the VECTOR of the word with id i
# *********************************

# Download the pre-trained vectors to /tmp/glove.6B.100d.txt.
# --no-check-certificate disables TLS certificate verification for this download.
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/glove.6B.100d.txt \
    -O /tmp/glove.6B.100d.txt
# Parse the GloVe text file into {word: vector}: the first whitespace-separated
# field of each line is the word, the remaining fields are its coefficients.
# The encoding is passed explicitly so parsing does not depend on the
# platform's default locale encoding.
embeddings_index = {}
with open('/tmp/glove.6B.100d.txt', encoding='utf-8') as glove_file:
    for line in glove_file:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

# Row i of the matrix holds the pre-trained vector of the word whose tokenizer
# id is i. Rows for words that are missing from GloVe stay all-zero; the extra
# row (vocab_size + 1) presumably accounts for ids starting at 1.
embeddings_matrix = np.zeros((vocab_size + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector

In [None]:
# Number of rows in the embedding matrix (one per word id, plus the extra row).
print(embeddings_matrix.shape[0])

14038


In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D

from tensorflow.keras.constraints import unit_norm
from tensorflow.keras.constraints import max_norm


In [None]:
# Frozen pre-trained embeddings -> dropout -> two stacked bidirectional LSTMs
# -> single sigmoid unit for the binary target.
model = Sequential([
    Embedding(
        vocab_size + 1,
        embedding_dim,
        input_length=max_length,
        weights=[embeddings_matrix],
        trainable=False,  # keep the pre-trained vectors fixed during training
    ),
    Dropout(0.2),
    # Conv1D(64, 5, activation= 'relu'),
    # MaxPooling1D(pool_size=4),
    # Bidirectional(GRU(32, return_sequences= True)),
    Bidirectional(LSTM(32, return_sequences=True)),
    Bidirectional(LSTM(32)),
    # Dense(32, activation = 'relu'),
    # Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

model.summary()

In [None]:
# Train for num_epochs epochs; the held-out split is passed as validation data
# so `history` records both training and validation metrics.
num_epochs = 15
history = model.fit(training_sequences, training_labels, epochs=num_epochs, validation_data=(test_sequences, test_labels), verbose=1)

In [None]:
import matplotlib.image  as mpimg
import matplotlib.pyplot as plt

#-----------------------------------------------------------
# Retrieve the per-epoch metric lists recorded during training
# for the training and validation data
#-----------------------------------------------------------
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(acc))  # one x-position per recorded epoch (0-based)


def plot_training_curve(train_values, val_values, metric_name):
    """Plot a training metric and its validation counterpart on a new figure.

    Args:
        train_values: per-epoch values of the metric on the training data.
        val_values: per-epoch values of the metric on the validation data.
        metric_name: label for the metric, e.g. "Accuracy" or "Loss"; used
            for the y-axis, the legend and (lower-cased) the title.

    Returns:
        The matplotlib Axes that was drawn on.
    """
    # An explicit figure per metric avoids relying on pyplot's implicit
    # "current figure" and leaves no empty figure behind.
    fig, ax = plt.subplots()
    epoch_axis = range(len(train_values))
    ax.plot(epoch_axis, train_values, 'r')
    ax.plot(epoch_axis, val_values, 'b')
    ax.set_title('Training and validation {}'.format(metric_name.lower()))
    ax.set_xlabel("Epochs")
    ax.set_ylabel(metric_name)
    ax.legend([metric_name, "Validation {}".format(metric_name)])
    return ax


#------------------------------------------------
# Plot training and validation accuracy per epoch
#------------------------------------------------
plot_training_curve(acc, val_acc, "Accuracy")

#------------------------------------------------
# Plot training and validation loss per epoch
#------------------------------------------------
plot_training_curve(loss, val_loss, "Loss");


# Expected Output
# A chart where the validation loss does not increase sharply!

In [None]:
# Show rows 25-49 (by position) of the training frame.
df_train.iloc[25:50]