<div style="background-color: #333; padding: 40px; border: 2px solid #ffd700; border-radius: 10px; color: #ffd700; text-align: center; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);">

<h1 style="font-size: 48px; font-weight: bold; color: #ffd700;">Emotions NLP</h1>

<img src="https://media.istockphoto.com/id/1318764563/fr/vectoriel/diverses-%C3%A9motions-et-expressions-faciales-dune-seule-personne.jpg?s=612x612&w=0&k=20&c=rnx8dnDAZHsD8bFTp68t2qGkhico86Cpx9H48iJsra0=" alt="Various emotions and facial expressions of one person" style="width: 500px; margin: 20px auto; border-radius: 10px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);">
    
</div>

<div style="border-radius: 10px; border: 2px solid #ffd700; padding: 15px; background-color: #333; font-size: 180%; text-align: center; color: #ffd700; font-weight: bold;"> Table of Contents 
</div>

<ul class="list-group" id="list-tab" role="tablist">
    <li><a href="#1.-Import-Libraries">1. Import Libraries</a></li><br>
    <li><a href="#2.-Load-data">2. Load data</a></li><br>
    <li><a href="#3.-Exploratory-Data-Analysis">3. Exploratory Data Analysis</a></li><br>
    <li><a href="#4.-Modeling">4. Modeling</a></li><br>
</ul>

## <div style="border-radius: 10px; border: 2px solid #ffd700; padding: 15px; background-color: #333; font-size: 120%; text-align: center; color: #ffd700; font-weight: bold;">1. Import Libraries</div>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from wordcloud import WordCloud
import re
import string
import regex
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import nltk
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## <div style="border-radius: 10px; border: 2px solid #ffd700; padding: 15px; background-color: #333; font-size: 120%; text-align: center; color: #ffd700; font-weight: bold;">2. Load data</div>

In [None]:
# Read the raw emotions CSV from the Kaggle input directory and preview the first rows.
DATA_PATH = "/kaggle/input/emotions/text.csv"
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Drop the "Unnamed: 0" column that was read in from the CSV.
# errors="ignore" keeps this cell idempotent: re-running it after the column is
# already gone is a no-op instead of a KeyError.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
# (rows, columns) of the frame at this point in the notebook
df.shape

In [None]:
# Column names, dtypes, non-null counts and memory usage
df.info()

In [None]:
# Summary statistics of the frame (pandas describes the numeric columns by default)
df.describe()

In [None]:
# Number of rows for each value of the `label` column (class balance)
df.label.value_counts()

## <div style="border-radius: 10px; border: 2px solid #ffd700; padding: 15px; background-color: #333; font-size: 120%; text-align: center; color: #ffd700; font-weight: bold;">3. Exploratory Data Analysis</div>

### I | Check duplicates

In [None]:
# Flag rows that repeat an earlier row in every column, then count them.
duplicate_mask = df.duplicated()
duplicated = duplicate_mask.sum()
print(duplicated)

### II | Check null and missing values

In [None]:
# Per-column count of null entries.
missing_values = df.isnull().sum()
total_missing_values = missing_values.sum()

# df.size is rows * columns. It replaces np.product(df.shape): np.product was
# deprecated in NumPy 1.25 and removed in NumPy 2.0.
total_cells = df.size

# An empty frame has zero cells; report 0% instead of dividing by zero.
percent_missing_values = (total_missing_values / total_cells) * 100 if total_cells else 0.0

print("Percent of data that is missing", percent_missing_values)
print(missing_values)

### III | Check unique values in each column

In [None]:
# Report how many distinct values each column holds.
# dropna=False counts missing values as a value, matching len(series.unique()).
for column, series in df.items():
    num_distinct_values = series.nunique(dropna=False)
    print(f"{column}: {num_distinct_values} distinct values")

To enhance text processing efficiency and minimize column size, we create the preprocess_text function to filter out unnecessary elements present in the text column. By eliminating irrelevant components, such as extraneous characters or punctuation, we aim to optimize processing time and reduce the overall size of the text columns.

In [None]:
# Lazily populated cache of the English stop-word set (see _english_stop_words).
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it only on first use."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def preprocess_text(text):
    """Normalise one raw message into a space-separated string of content words.

    Steps, in order:
      1. lowercase the text;
      2. strip HTML markup with BeautifulSoup;
      3. remove URLs (``http(s)://...`` and ``www....``);
      4. drop every character that is not an ASCII letter or whitespace
         (this removes digits, punctuation and emojis in a single pass, so no
         separate emoji or punctuation pass is needed afterwards);
      5. remove English stop words and collapse repeated whitespace.

    Parameters
    ----------
    text : str
        Raw message. Any non-string value (for example a NaN from a missing
        cell) is treated as an empty message.

    Returns
    -------
    str
        The cleaned text; may be empty.

    NOTE(review): apostrophes are removed in step 4, before the stop-word
    filter, so a contraction such as "don't" is compared as "dont" — confirm
    that this is the intended behaviour for the stop-word list in use.
    """
    if not isinstance(text, str):
        return ""

    # Lowercasing
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Keep only ASCII letters and whitespace
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove extra whitespaces and stopwords; the stop-word set is loaded once
    # and reused instead of being rebuilt for every row.
    stop_words = _english_stop_words()
    return " ".join(word for word in text.split() if word not in stop_words)


# Store the cleaned version in a new column; the raw 'text' column is left untouched.
# NOTE(review): the later cells visible in this notebook keep reading df['text'],
# not 'cleaned_text' — confirm whether the model was meant to use the cleaned column.
df['cleaned_text'] = df['text'].apply(preprocess_text)

# Plain-text preview of the first rows, now including 'cleaned_text'
print(df.head())

In [None]:
# Replace chat abbreviations with their full phrases
def replace_chat_words(text, anagram_dict):
    """Expand chat abbreviations found in ``text``.

    The text is split on whitespace; every token whose lowercase form is a key
    of ``anagram_dict`` is replaced by the mapped phrase, all other tokens are
    kept unchanged. Tokens are re-joined with single spaces, so runs of
    whitespace in the input are collapsed.

    Parameters
    ----------
    text : str
        Message to process.
    anagram_dict : dict
        Maps an abbreviation to its replacement phrase. Lookups use the
        lowercase token, so only keys that are already lowercase can match.

    Returns
    -------
    str
        The message with known abbreviations expanded.
    """
    # No debug printing here: the previous `print(word[i])` indexed the word
    # with the token's position and raised IndexError on any match located
    # past the word's own length.
    return ' '.join(anagram_dict.get(word.lower(), word) for word in text.split())

# Load the slang file and build a dictionary mapping each abbreviation to its full phrase.
# Each useful line has the form "<abbreviation>=<full phrase>".
anagram_dict = {}
with open("/kaggle/input/chatwords/slang (1).txt", "r") as file:
    for line in file:
        # Every line still ends with its newline, so strip before inspecting it.
        # partition() splits on the first '=' only: a phrase that itself contains
        # '=' is kept whole, and blank lines or lines without '=' are skipped
        # instead of raising ValueError on unpacking.
        anagram, separator, full_sentence = line.strip().partition('=')
        if not separator:
            continue
        anagram_dict[anagram.strip()] = full_sentence.strip()

# NOTE(review): replace_chat_words looks tokens up by their lowercase form, so
# keys stored with upper-case letters never match — confirm the casing of the file.

df['text'] = df['text'].apply(lambda x: replace_chat_words(x, anagram_dict))

In [None]:
# Histogram of the number of characters per message.
message_lengths = df['text'].map(len)

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(message_lengths, bins=30, edgecolor='black')
ax.set_title('Distribution of Message Lengths')
ax.set_xlabel('Message Length')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Emotion names ordered so that each list position equals its numerical label
emotion_names = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
mapping = dict(enumerate(emotion_names))

# Derive a readable 'Emotion' column from the numerical 'label' column
df['Emotion'] = df['label'].map(mapping)

In [None]:
# Bar chart of how many messages carry each emotion.
fig, ax = plt.subplots(figsize=(16, 8))
sns.countplot(data=df, x='Emotion', edgecolor='black', ax=ax)
ax.set_title("Distribution of emotions")
ax.set_xlabel("emotions")
ax.set_ylabel("count")
plt.show()

In [None]:
# Share of each emotion, drawn as a donut chart.
unique_emotion = df['Emotion'].value_counts()

# One offset per wedge, sized from the data: a fixed-length list makes plt.pie
# raise as soon as the number of emotions present differs from that length.
explode = [0.05] * len(unique_emotion)

plt.pie(
    x=unique_emotion, labels=unique_emotion.index,
    colors=sns.color_palette('Set2'),
    startangle=90,
    autopct='%1.2f%%',
    pctdistance=0.80,
    explode=explode
)

# Transform the pie into a donut by covering its centre with a white disc
hole = plt.Circle((0, 0), 0.65, facecolor='white')
plt.gca().add_artist(hole)
plt.title("Emotion Distribution")
plt.show()

In [None]:
def make_word_cloud(text):
    """Build a word cloud from ``text`` and display it in a new 10x6 figure.

    The cloud is rendered at 800x400 pixels on a white background and shown
    without axes.
    """
    cloud_builder = WordCloud(width=800, height=400, background_color='white')
    cloud_image = cloud_builder.generate(text)

    plt.figure(figsize=(10, 6))
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis('off')
    plt.show()

# Concatenate every message into one corpus string and draw its word cloud.
all_words = ' '.join(message for message in df['text'])
make_word_cloud(all_words)

## <div style="border-radius: 10px; border: 2px solid #ffd700; padding: 15px; background-color: #333; font-size: 120%; text-align: center; color: #ffd700; font-weight: bold;">4. Modeling</div>

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split reproducible.
features, labels = df['text'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

for split_name, split_data in (("Training", X_train), ("Test", X_test)):
    print(f"{split_name} set shape:", split_data.shape)

In [None]:
# Tokenizing the text into words/tokens using a Tokenizer object
tokenizer = Tokenizer(num_words=50000)
tokenizer.fit_on_texts(X_train)

In [None]:
# Convert text data to sequences and pad them to a fixed length of 100
X_train_padded = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=100, padding='post')
X_test_padded = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=100, padding='post')

In [None]:
# Sanity check: both matrices should share the same number of columns (the padded length)
print(X_test_padded.shape)
print(X_train_padded.shape)

In [None]:
# The input length is read from the padded training matrix so the declared
# shape always agrees with what pad_sequences produced. A hard-coded length
# that differs from the padding length makes Keras warn about, or reject,
# the training data.
sequence_length = X_train_padded.shape[1]

model = Sequential()

# Token ids -> 5-dimensional embeddings (input_dim matches the tokenizer's num_words)
model.add(Input(shape=(sequence_length,)))
model.add(Embedding(input_dim=50000, output_dim=5))
model.add(Dropout(0.2))
# Stacked bidirectional GRUs; the first two return full sequences so the next
# recurrent layer receives one vector per time step.
model.add(Bidirectional(GRU(130, return_sequences=True)))
model.add(Bidirectional(GRU(64, return_sequences=True)))
model.add(BatchNormalization())
# The last GRU returns only its final state, which feeds the classifier.
model.add(Bidirectional(GRU(64)))
# One softmax output per emotion class (labels 0-5)
model.add(Dense(6, activation='softmax'))

# Labels are integer class ids, hence the sparse categorical cross-entropy loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Layer-by-layer overview of the network with output shapes and parameter counts
model.summary()

In [None]:
# Train for 5 epochs with batches of 1500 samples; `history` keeps the per-epoch metrics.
# NOTE(review): the held-out test split is also passed as validation data, so the
# validation curves and the later evaluate() call report on the same samples.
history = model.fit(X_train_padded, y_train, epochs=5, batch_size=1500, validation_data=(X_test_padded, y_test))

In [None]:
# Accuracy curves, plotted against 1-based epoch numbers so that the curves and
# the best-epoch marker share the same x positions.
train_acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
epochs = range(1, len(train_acc) + 1)

plt.plot(epochs, train_acc, label='Training Accuracy')
plt.plot(epochs, val_acc, label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.xticks(epochs)
plt.grid(True)

# Mark the epoch with the best validation accuracy
best_epoch = int(np.argmax(val_acc)) + 1
best_val_acc = val_acc[best_epoch - 1]
plt.scatter(best_epoch, best_val_acc, color='red', label=f'Best Val Accuracy: {best_val_acc:.4f}')
plt.annotate(f'Best Val Accuracy: {best_val_acc:.4f}', (best_epoch, best_val_acc), xytext=(best_epoch+2, best_val_acc-0.1),
             arrowprops=dict(facecolor='red', arrowstyle='->'))

# Build the legend after the scatter so the best-accuracy marker is listed too.
plt.legend()

plt.show()


In [None]:
# Training vs. validation loss per epoch.
fig, ax = plt.subplots()
for history_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    ax.plot(history.history[history_key], label=curve_label)
ax.set_title('Training and Validation Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

In [None]:
# Evaluate Test Data
model.evaluate(X_test_padded, y_test)

In [None]:
# Per-class probabilities for every test row, reduced to the most probable class id.
class_probabilities = model.predict(X_test_padded)
y_pred = class_probabilities.argmax(axis=1)

In [None]:
# Tick labels listed in the order of the numeric class ids (0-5)
emotion_labels = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']

# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Greys',
    xticklabels=emotion_labels,
    yticklabels=emotion_labels,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()