In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import random

import re
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")
from nltk.stem import SnowballStemmer

import tensorflow as tf
print(tf.__version__)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Embedding, Dense, Input, GlobalMaxPooling1D, Dropout, Bidirectional, Conv1D, MaxPooling1D, BatchNormalization
                                   
from tensorflow.keras.models import Model, Sequential


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# **Data**

In [None]:
# Read the emotions CSV into a DataFrame, decoding the file as latin1.
df = pd.read_csv(
    "/kaggle/input/emotions/text.csv",
    encoding="latin1",
)

# Preview the first 15 rows.
df.head(15)

In [None]:
# Rename Columns
# Columns are relabelled by position, so the frame must have exactly three.

df = df.set_axis(["Id", "Text", "Label"], axis=1)

In [None]:
# Drop Id Column
# Only "Text" and "Label" are used from here on.

df = df.drop(columns=["Id"])

In [None]:
# Preview the first rows of the frame to confirm the current column layout.
df.head()

In [None]:
# Distinct values present in the Label column (used later as the training target).

df["Label"].unique()

In [None]:
# Class balance: number of rows per label value.
count = df["Label"].value_counts()
labels, values = count.index, count.values
print(count)

# Pie chart of the label shares, with every wedge slightly pulled out.
plt.figure(figsize=(8, 6))
plt.pie(values, labels=labels, autopct="%1.1f%%", explode=[0.05] * len(labels))
plt.title("Label")
plt.show()

# Histogram of the raw label values, with the top and right frame lines hidden.
df["Label"].plot(kind="hist", bins=20, title="Label")
plt.gca().spines[["top", "right"]].set_visible(False)

# **Data Preprocessing**

In [None]:
# Summary of the frame: column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Remove unnecessary characters, URLs, stopwords etc

# Loop-invariant setup: build the stemmer, the stopword set and the compiled
# pattern once instead of once per row (or once per word for the set).
sb = SnowballStemmer("english")
all_stopwords = stopwords.words("english")
# Keep "not" in the text -- presumably because negation changes the emotion.
all_stopwords.remove("not")
stopword_set = set(all_stopwords)

# Raw string so that \S, \w and \s reach the regex engine without relying on
# Python passing unknown string escapes through (which emits a warning).
noise_pattern = re.compile(
    r"http\S+|@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+|[^\w\s]"
)

# Iterate over the column itself rather than a hard-coded row count, so the
# corpus always has exactly one entry per row of df and stays aligned with
# df["Label"] regardless of the dataset size or index.
corpus = []
for tweet in df["Text"]:
    # Replace URLs, mentions and non-alphanumeric runs with spaces, then
    # lower-case and split into tokens.
    tweet = noise_pattern.sub(" ", tweet).lower().split()
    # Drop stopwords and reduce the remaining tokens to their stems.
    tweet = [sb.stem(word) for word in tweet if word not in stopword_set]
    corpus.append(" ".join(tweet))

In [None]:
# Spot-check the cleaning step by printing a random handful of processed texts.
num_sentences = 20

random_indices = random.sample(range(len(corpus)), num_sentences)
print("\n".join(corpus[index] for index in random_indices))

# **Tokenization and Padding**

In [None]:
# Fit the word index on the cleaned corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Convert every text to a list of word ids, then pad them into one 2-D array
# (no maxlen is given, so the width is that of the longest sequence).
text_sequence = pad_sequences(tokenizer.texts_to_sequences(corpus))

# Rows = number of texts, columns = padded sequence length.
num_records, max_seqlen = text_sequence.shape

print("{:d} sentences, max length: {:d}".format(num_records, max_seqlen))

In [None]:
# Vocabulary

# NOTE: word2idx is the tokenizer's own dict (not a copy), so the "PAD" entry
# added here is also visible through tokenizer.word_index.
word2idx = tokenizer.word_index
word2idx["PAD"] = 0

# Reverse lookup, including the id 0 -> "PAD" entry added above.
idx2word = {index: token for token, index in word2idx.items()}

# Counts the "PAD" entry as well, so ids 0..vocab_size-1 are all covered.
vocab_size = len(word2idx)
print("Vocab Size", vocab_size)

# **Train Test Split**

In [None]:
# Hold out 20% of the padded sequences and labels; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    text_sequence,
    df["Label"],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report the shape of each split as a quick sanity check.
for split_name, split in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(split_name + ":", split.shape)

# **Word Embedding**

In [None]:
EMBEDDING_DIM = 300

!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

GLOVE_FILE = 'glove.6B.300d.txt'

In [None]:
# Map each word in the GloVe file to its vector (float32 numpy array).
glove_index = {}

# Read with an explicit UTF-8 encoding: without it open() falls back to the
# platform locale, which can fail or mis-decode non-ASCII tokens.
with open(GLOVE_FILE, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        # First field is the token, the remaining fields are its coefficients.
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        glove_index[word] = coefs

print("Found %s word vectors." % len(glove_index))

In [None]:
# Embedding matrix: row i holds the GloVe vector of the word with id i.
# Rows for words without a GloVe vector stay all-zero.
glove_matrix = np.zeros((vocab_size, EMBEDDING_DIM))

hits = misses = 0
for word, idx in word2idx.items():
    glove_vector = glove_index.get(word)
    if glove_vector is None:
        # Word is not in GloVe: leave its row as zeros.
        misses += 1
        continue
    glove_matrix[idx] = glove_vector
    hits += 1

print("Converted %d words (%d misses)" % (hits, misses))

In [None]:
# Frozen embedding layer (trainable=False) sized to the vocabulary.
embedding_layer = Embedding(vocab_size, EMBEDDING_DIM, trainable=False)

# Build the layer first so its weight variable exists, then overwrite it with
# the pre-computed GloVe matrix.
embedding_layer.build((1,))
embedding_layer.set_weights([glove_matrix])

# **Building the Model**

In [None]:
# Input: one padded sequence of word ids per sample.
sequence_input = Input(shape=(max_seqlen,), dtype="int32")
embedded_sequences = embedding_layer(sequence_input)

# Convolutional stack: three Conv1D layers (kernel size 3, ReLU); the first
# two are each followed by a max-pooling layer of size 2.
x = embedded_sequences
for n_filters, pool_after in ((32, True), (64, True), (64, False)):
    x = Conv1D(n_filters, 3, activation="relu")(x)
    if pool_after:
        x = MaxPooling1D(2)(x)

# Collapse the time axis by taking the maximum of each feature map.
x = GlobalMaxPooling1D()(x)

# Dense head with batch normalisation and dropout.
x = Dense(64, activation="relu")(x)
x = BatchNormalization()(x)
x = Dropout(0.5)(x)

# Six-way softmax output.
output = Dense(6, activation="softmax")(x)

model = Model(sequence_input, output)

In [None]:
# Print the layer-by-layer summary of the model built above.
model.summary()

In [None]:
# The sparse categorical cross-entropy loss takes the labels as integer class
# ids, so they are not one-hot encoded.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="rmsprop",
    metrics=["accuracy"],
)

In [None]:
# Report which device type training will use.
# tf.config.list_physical_devices replaces the deprecated
# tf.test.is_gpu_available(); it returns a (possibly empty) list of GPUs.
if tf.config.list_physical_devices("GPU"):
    print("Training on GPU...")
else:
    print("Training on CPU...")

In [None]:
# Train for 20 epochs with mini-batches of 64 and keep the History object in r.
# NOTE(review): the test split doubles as validation data here, so there is no
# separate untouched hold-out set -- confirm this is intended.
r = model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=20,
    validation_data=(X_test, y_test),
)

In [None]:
# Plot the Loss
# Training vs. validation loss per epoch, taken from the fit history.
plt.plot(r.history['loss'], label='loss')
plt.plot(r.history['val_loss'], label='val_loss')
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
# The model is a Conv1D network with no recurrent layer, so label it as a CNN.
plt.title("CNN Loss")
plt.show()

In [None]:
# Plot the accuracy
# Training vs. validation accuracy per epoch, taken from the fit history.
plt.plot(r.history['accuracy'], label='acc')
plt.plot(r.history['val_accuracy'], label='val_acc')
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
# The model is a Conv1D network with no recurrent layer, so label it as a CNN.
plt.title("CNN Accuracy")
plt.show()