In [29]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import Libraries

In [30]:
import nltk 
# Fetch the "all" NLTK collection, i.e. every data package NLTK offers.
# NOTE(review): the rest of the notebook appears to need only the tokenizer,
# stopword and WordNet data — confirm before narrowing this download.
nltk.download("all")
from nltk.tokenize import word_tokenize
import string
# Bare expression: it is not the last line of the cell, so nothing is displayed.
string.punctuation
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import metrics
from tensorflow.keras import optimizers

import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt
import seaborn as sns

## Explore Data

In [31]:
# Read the training and test CSV files, then preview the first training rows.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/nlp-getting-started/train.csv",
        "../input/nlp-getting-started/test.csv",
    )
)
train_df.head()

Data Information

In [32]:
# Print a summary of the training frame: column dtypes, non-null counts, memory use.
train_df.info()

In [33]:
# Number of rows per distinct keyword (value_counts skips missing values by default).
train_df["keyword"].value_counts()

In [34]:
# Histogram of how many rows each distinct keyword accounts for.
keyword_frequencies = train_df["keyword"].value_counts()
plt.hist(keyword_frequencies)

In [35]:
# Number of rows per distinct location (value_counts skips missing values by default).
train_df["location"].value_counts()

In [36]:
# Histogram of how many rows each distinct location accounts for.
location_frequencies = train_df["location"].value_counts()
plt.hist(location_frequencies)

In [37]:
# Class balance: number of rows for each value of the target label.
train_df["target"].value_counts()

In [38]:
# Bar chart of the class balance. The column is selected by keyword because
# seaborn deprecated positional data vectors (only `data` may be positional),
# so the keyword form behaves the same across seaborn versions.
sns.countplot(x="target", data=train_df)

In [39]:
# Count the missing values in every column of the training frame.
missing_mask = train_df.isna()
missing_mask.sum()

## Preprocessing 

### Remove Punctuation

In [40]:
def remove_punctuation(text):
  """Return ``text`` with every character found in ``string.punctuation`` dropped."""
  kept_chars = []
  for ch in text:
    if ch in string.punctuation:
      continue
    kept_chars.append(ch)
  return "".join(kept_chars)

### Remove Noise
Noise here includes:
- Links
- HTML tags
- numbers

In [41]:
def remove_noise(text):
  """Strip links, HTML tags and digit runs from ``text``.

  The patterns are applied one after another, in this order: URLs
  (``http://``, ``https://`` or ``www.`` prefixes), ``<...>`` tags, digits.
  Each match is replaced by the empty string; surrounding whitespace is kept.
  """
  noise_patterns = (
    r"https?://\S+|www\.\S+",
    r'<.*?>',
    r'\d+',
  )
  clean_text = text
  for pattern in noise_patterns:
    clean_text = re.sub(pattern, r'', clean_text)
  return clean_text

### Convert To LowerCase

In [42]:
def lower_case(text):
  """Return a lowercase copy of ``text``."""
  lowered = text.lower()
  return lowered

### Remove StopWords

In [43]:
eng_stopwords = nltk.corpus.stopwords.words('english')
# Frozen-set copy for constant-time membership tests; the list above is kept
# unchanged for any other code that reads it.
_eng_stopword_set = frozenset(eng_stopwords)

def remove_stopwords(text):
  """Drop English stopwords from a list of tokens.

  Args:
    text: iterable of string tokens (the tokenizer output, not a raw string).

  Returns:
    A new list with the tokens that are not NLTK English stopwords, in
    their original order.
  """
  return [token for token in text if token not in _eng_stopword_set]

### Stemming

In [44]:
porter_stemmer = PorterStemmer()

def stemming(text):
  """Return a list holding the Porter stem of every token in ``text``."""
  return list(map(porter_stemmer.stem, text))

### Lemmatization 

In [45]:
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatizing(text):
  """Return a list holding the WordNet lemma of every token in ``text``."""
  lemmas = []
  for token in text:
    lemmas.append(wordnet_lemmatizer.lemmatize(token))
  return lemmas

### Preprocessing Function

In [46]:
def preprocessing_data(text):
  """Clean one raw text and turn it into a list of normalised tokens.

  Steps, in order: strip URLs / HTML tags / digits, strip punctuation,
  lowercase, tokenize, drop English stopwords, stem, lemmatize.

  Noise removal has to run before punctuation removal: the URL and tag
  patterns rely on characters such as ':', '/', '.', '<' and '>', which are
  all part of string.punctuation. With the opposite order those patterns
  can never match and link/tag residue ends up in the tokens.

  Args:
    text: raw input string.

  Returns:
    List of processed string tokens.
  """
  cleaned_text = lower_case(remove_punctuation(remove_noise(text)))
  tokens = lemmatizing(stemming(remove_stopwords(word_tokenize(cleaned_text))))
  return tokens

In [47]:
# Tokenize the raw text of both frames into a new "tokens" column,
# then display the training tokens.
for frame in (train_df, test_df):
    frame["tokens"] = frame["text"].apply(preprocessing_data)
train_df["tokens"]

### Splitting Data

In [48]:
from sklearn.model_selection import train_test_split

# Features are the token lists, labels are the target column.
train_feature, train_label = train_df['tokens'], train_df['target']
test_feature = test_df['tokens']

# Hold out 20% of the labelled rows for validation (shuffled, fixed seed).
x_train, x_val, y_train, y_val = train_test_split(
    train_feature,
    train_label,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

## Vectorizing

### Bag Of Words

In [49]:
# Fit the vocabulary on the training split only, then encode the training
# and validation splits as matrices.
tok = Tokenizer(num_words=2000, oov_token='UNK')
tok.fit_on_texts(x_train)
x_train_bow, x_val_bow = (
    tok.texts_to_matrix(split) for split in (x_train, x_val)
)
x_train_bow

In [50]:
# Report the shape of each bag-of-words matrix.
for label, matrix in (("Train Shape = ", x_train_bow), ("Val Shape = ", x_val_bow)):
    print(label, matrix.shape)

### Sequence

In [51]:
# `tok` was already fitted on x_train when the bag-of-words matrices were
# built; fitting it a second time on the same data only doubles its internal
# counts, so the fitted tokenizer is reused as is.
x_train_seq = tok.texts_to_sequences(x_train)
x_val_seq = tok.texts_to_sequences(x_val)
# Preview a few encoded rows instead of dumping the whole list into the output.
x_train_seq[:5]

### Convert Labels To Vectors

In [52]:
# Cast both label vectors to float32 NumPy arrays.
y_train, y_val = (
    np.array(labels).astype('float32') for labels in (y_train, y_val)
)

# Models 

## LSTM With BoW

### Padding Train and Val 

In [53]:
maxlen = 2000
# Shared padding settings: cut or zero-fill at the end of each row.
# NOTE(review): the inputs come from texts_to_matrix, so each row is presumably
# already `num_words` wide rather than a sequence of token ids — confirm that
# feeding a bag-of-words matrix to a sequence model is intended.
pad_kwargs = dict(maxlen=maxlen, truncating='post', padding='post')
training_padded = pad_sequences(x_train_bow, **pad_kwargs)
val_padded = pad_sequences(x_val_bow, **pad_kwargs)

### Building Model

In [54]:
# Embedding -> LSTM -> batch norm -> two dropout-regularised dense layers ->
# single sigmoid unit for the binary target.
model = models.Sequential([
    layers.Embedding(1000, 32, input_length=maxlen),
    layers.LSTM(32),
    layers.BatchNormalization(),
    layers.Dropout(0.4),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.4),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.4),
    layers.Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='Adamax',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()

In [55]:
# Train for 15 epochs in batches of 512, scoring the validation split each epoch.
history = model.fit(
    x=training_padded,
    y=y_train,
    batch_size=512,
    epochs=15,
    validation_data=(val_padded, y_val),
)

In [56]:
import matplotlib.pyplot as plt

# Pull the per-epoch curves recorded by model.fit out of the History object.
metrics_log = history.history
acc, val_acc = metrics_log['accuracy'], metrics_log['val_accuracy']
loss, val_loss = metrics_log['loss'], metrics_log['val_loss']

epochs = range(1, len(acc) + 1)

# 'bo' draws blue dots (training); 'b' draws a solid blue line (validation).
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

## LSTM With Seq

### Padding

In [57]:
maxlen = 2000
# Shared padding settings: cut or zero-fill at the end of each sequence so
# every row has exactly `maxlen` token ids.
pad_kwargs = dict(maxlen=maxlen, truncating='post', padding='post')
training_padded = pad_sequences(x_train_seq, **pad_kwargs)
val_padded = pad_sequences(x_val_seq, **pad_kwargs)

### Model

In [58]:
# The tokenizer emits ids in [0, tok.num_words) (0 is the padding value), so
# the embedding table needs at least that many rows. A smaller input_dim
# leaves the higher ids out of range for the lookup.
vocab_size = tok.num_words

# Embedding -> LSTM -> batch norm -> two dropout-regularised dense layers ->
# single sigmoid unit for the binary target.
model = models.Sequential()
model.add(layers.Embedding(vocab_size, 32, input_length=maxlen))
model.add(layers.LSTM(32))
model.add(layers.BatchNormalization())
model.add(layers.Dropout(0.4))
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(0.4))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dropout(0.4))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='Adamax',
    metrics=['accuracy']
)

model.summary()

In [59]:
# Train for 15 epochs in batches of 512, scoring the validation split each epoch.
history = model.fit(
    x=training_padded,
    y=y_train,
    batch_size=512,
    epochs=15,
    validation_data=(val_padded, y_val),
)

In [60]:
import matplotlib.pyplot as plt

# Pull the per-epoch curves recorded by model.fit out of the History object.
metrics_log = history.history
acc, val_acc = metrics_log['accuracy'], metrics_log['val_accuracy']
loss, val_loss = metrics_log['loss'], metrics_log['val_loss']

epochs = range(1, len(acc) + 1)

# 'bo' draws blue dots (training); 'b' draws a solid blue line (validation).
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

## BiLSTM With BoW

### Padding

In [61]:
maxlen = 2000
# Shared padding settings: cut or zero-fill at the end of each row.
# NOTE(review): the inputs come from texts_to_matrix, so each row is presumably
# already `num_words` wide rather than a sequence of token ids — confirm that
# feeding a bag-of-words matrix to a sequence model is intended.
pad_kwargs = dict(maxlen=maxlen, truncating='post', padding='post')
training_padded = pad_sequences(x_train_bow, **pad_kwargs)
val_padded = pad_sequences(x_val_bow, **pad_kwargs)

### Building Model

In [62]:
# Embedding -> bidirectional LSTM -> batch norm -> two dropout-regularised
# dense layers -> single sigmoid unit for the binary target.
model = models.Sequential([
    layers.Embedding(1000, 32, input_length=maxlen),
    layers.Bidirectional(layers.LSTM(32)),
    layers.BatchNormalization(),
    layers.Dropout(0.4),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.4),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.4),
    layers.Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='Adamax',
    loss='binary_crossentropy',
    metrics=["accuracy"],
)

model.summary()

In [63]:
# Train for 15 epochs in batches of 512, scoring the validation split each epoch.
history = model.fit(
    x=training_padded,
    y=y_train,
    batch_size=512,
    epochs=15,
    validation_data=(val_padded, y_val),
)

In [64]:
import matplotlib.pyplot as plt

# Pull the per-epoch curves recorded by model.fit out of the History object.
metrics_log = history.history
acc, val_acc = metrics_log['accuracy'], metrics_log['val_accuracy']
loss, val_loss = metrics_log['loss'], metrics_log['val_loss']

epochs = range(1, len(acc) + 1)

# 'bo' draws blue dots (training); 'b' draws a solid blue line (validation).
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

## BiLSTM With Seq

### Padding

In [65]:
maxlen = 2000
# Shared padding settings: cut or zero-fill at the end of each sequence so
# every row has exactly `maxlen` token ids.
pad_kwargs = dict(maxlen=maxlen, truncating='post', padding='post')
training_padded = pad_sequences(x_train_seq, **pad_kwargs)
val_padded = pad_sequences(x_val_seq, **pad_kwargs)

### Model

In [66]:
# The tokenizer emits ids in [0, tok.num_words) (0 is the padding value), so
# the embedding table needs at least that many rows. A smaller input_dim
# leaves the higher ids out of range for the lookup.
vocab_size = tok.num_words

# Embedding -> bidirectional LSTM -> batch norm -> two dropout-regularised
# dense layers -> single sigmoid unit for the binary target.
model = models.Sequential()
model.add(layers.Embedding(vocab_size, 32, input_length=maxlen))
model.add(layers.Bidirectional(layers.LSTM(32)))
model.add(layers.BatchNormalization())
model.add(layers.Dropout(0.4))
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(0.4))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dropout(0.4))
model.add(layers.Dense(1, activation='sigmoid'))


model.compile(
    loss='binary_crossentropy',
    optimizer='Adamax',
    metrics=["accuracy"]
)

model.summary()

In [67]:
# Train for 15 epochs in batches of 512, scoring the validation split each epoch.
history = model.fit(
    x=training_padded,
    y=y_train,
    batch_size=512,
    epochs=15,
    validation_data=(val_padded, y_val),
)

In [68]:
import matplotlib.pyplot as plt

# Pull the per-epoch curves recorded by model.fit out of the History object.
metrics_log = history.history
acc, val_acc = metrics_log['accuracy'], metrics_log['val_accuracy']
loss, val_loss = metrics_log['loss'], metrics_log['val_loss']

epochs = range(1, len(acc) + 1)

# 'bo' draws blue dots (training); 'b' draws a solid blue line (validation).
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

## Prediction

In [69]:
# Reuse the tokenizer exactly as fitted on the training split: re-fitting on
# the test tokens would update the word counts and can re-rank the vocabulary,
# changing the word -> id mapping the trained model relies on.
x_test_bow = tok.texts_to_sequences(test_feature)
# Pad/truncate exactly like the training and validation inputs so the test
# matrix has the input length and padding side the model was trained with.
testing_padding = pad_sequences(x_test_bow,
                                maxlen=maxlen,
                                truncating='post',
                                padding='post')

In [70]:
# Score the padded test inputs with the most recently trained model.
evaluation = model.predict(
    np.array(testing_padding),
    verbose=1,
    batch_size=128,
)

In [71]:
submission_df = pd.read_csv("../input/nlp-getting-started/sample_submission.csv")
# Round the sigmoid outputs to the nearest integer and store them as int labels.
# NOTE(review): nothing in this cell writes the frame to disk — confirm whether
# a to_csv call is expected elsewhere.
predicted_labels = np.int_(evaluation.round())
submission_df["target"] = predicted_labels
submission_df.head()