<h1 style="text-align:center; font-family:Georgia; font-weight:bold; ">Imports</h1>

In [57]:
import re
import ast
import nltk
import json
import spacy
import numpy as np
import pandas as pd
from collections import Counter
import gensim.downloader as api
from nltk.corpus import stopwords
from tensorflow.keras import layers, Input 
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import  Model
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Bidirectional, Dense, Dropout, Concatenate, GlobalMaxPooling1D
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
)

In [2]:
word2vec = api.load("word2vec-google-news-300")  
# nltk.download("punkt_tab")
# nltk.download("stopwords")
# nltk.download("wordnet")
# spacy.cli.download("en_core_web_sm")
# nltk.download('averaged_perceptron_tagger_eng')
# nlp = spacy.load("en_core_web_sm")

<h1 style="text-align:center; font-family:Georgia; font-weight:bold; ">Constants and Global Variables</h1>

In [34]:
# text = pd.read_csv('Collected Datasets/text.csv')

EMOTIONS = ['happiness', 'neutral', 'sadness', 'anger', 'fear', ]
MAPPER = {emotion: [] for emotion in EMOTIONS}

scaler = StandardScaler()
encoder = LabelEncoder()
encoder.classes_ = np.array(EMOTIONS)

nlp = spacy.load("en_core_web_sm")
lemmatizer = nltk.WordNetLemmatizer()
STOP_WORDS = set(stopwords.words("english"))
STOP_WORDS = STOP_WORDS.difference({'no', 'not', 'nor', 'never', "n't"})

MODELS = [layers.LSTM, layers.GRU, layers.SimpleRNN]
BATCH_SIZE = 32
EPOCHS = 12
POS_EMBED_DIM = 10
LSTM_UNITS = 128
DENSE_UNITS = 64
DROPOUT_RATE = 0.3
MAXLEN = 35


In [4]:
'''Emotional Lexicons'''
wordMap = pd.read_csv('wordMap.csv')
for emotion in wordMap.columns.to_list():
    words = [str(word).lower() for word in wordMap[emotion].dropna()]
    if emotion.lower() in EMOTIONS: MAPPER[emotion.lower()] = words
    if emotion in ['Calm', 'Boredom']: MAPPER['neutral'] += words   
    if emotion in ['Excitement', 'Pride']: MAPPER['happiness'] += words
    if emotion in ['Disgust', 'Frustration', 'Contempt']: MAPPER['anger'] += words

print(MAPPER)


{'happiness': ['joyful', 'ecstatic', 'content', 'cheerful', 'elated', 'delighted', 'pleased', 'radiant', 'euphoric', 'jovial', 'happy', 'merry', 'exuberant', 'overjoyed', 'satisfied', 'grateful', 'sunny', 'bubbly', 'lively', 'gleeful', 'glad', 'in high spirits', 'laughing', 'thrilled', 'blissful', 'carefree', 'exhilarated', 'optimistic', 'chipper', 'buoyant', 'uplifted', 'beaming', 'jubilant', 'proud', 'alive', 'zestful', 'lighthearted', 'upbeat', 'charmed', 'tickled pink', 'rapturous', 'in a good mood', 'on cloud nine', 'pumped', 'excited', 'grinning', 'eager', 'silly', 'pleased as punch', 'mirthful', 'vibrant', 'giddy', 'serene', 'festive', 'playful', 'contented', 'radiant', 'laughing fit', 'high-spirited', 'in a good place', 'carefree', 'light', 'breezy', 'sociable', 'enthusiastic', 'pleased with oneself', 'joyous', 'sweet', 'sanguine', 'delighted beyond words', 'up and about', 'full of life', 'tickled', 'exultant', 'bouncy', 'happy-go-lucky', 'energetic', 'on top of the world', 'hi

In [5]:
'''Contractions'''
with open("Common English Contractions/contractions.json" , 'r') as file:
    contractions = json.load(file)
    
contractions = pd.DataFrame(list(contractions.items()), columns=["Contraction", "Meaning"])
contractions = pd.concat([contractions, pd.read_csv("Common English Contractions/contractions.csv")], ignore_index=True)
contractions.drop_duplicates(inplace=True)
contractions["Contraction"] = contractions["Contraction"].str.lower()
contractions["Meaning"] = contractions["Meaning"].str.lower()
contractions.info()
contractions = contractions.set_index("Contraction").to_dict()["Meaning"]

<class 'pandas.core.frame.DataFrame'>
Index: 195 entries, 0 to 262
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Contraction  195 non-null    object
 1   Meaning      195 non-null    object
dtypes: object(2)
memory usage: 4.6+ KB


<h1 style="text-align:center; font-family:Georgia; font-weight:bold; ">Cleaning and Preprocessing</h1>

<pre>
- All characters are lowercase
- No non alphabetic characters or numbers
</pre>

In [6]:
def encode(data, f="train"):
    if f == "train":
        data = encoder.fit_transform(data)
    else:
        data = encoder.transform(data)
    return data

In [35]:
def scale(data, f="train"):
    if f == "train":
        data = scaler.fit_transform(data)
    else:
        data = scaler.transform(data)
    return data

In [7]:
def filterByTextLength(data, minLen=5, maxLen=35):
    def _lengthFilter(text):
        if isinstance(text, str):
            length = len(text.split())
            return minLen <= length <= maxLen
        return False 
    
    filteredData = data[data["Text"].apply(_lengthFilter)].reset_index(drop=True)
    return filteredData

In [8]:
def expandContractions(data):
    count = 0
    for contraction, meaning in contractions.items():
        count += data['Text'].apply(lambda line: len(re.findall(rf'\b{contraction}\b', line))).sum()
        data['Text'] = data['Text'].apply(
            lambda line: re.sub(rf'\b{contraction}\b', meaning, line)
            )
        
    print("Number of contractions removed:", count)
    
    return data

In [9]:
def lexiconScore(data):
    for emotion, keywords in MAPPER.items():
        scores = []
        totalMatches = 0 
        for text in data['Text']:
            words = text.lower().split()
            counter = Counter(words)
            totalWords = len(words)
            score = sum(counter[word] for word in keywords)
            normalizedScore = score / totalWords if totalWords > 0 else 0.0
            scores.append(normalizedScore)
            totalMatches += score
            
        data[f"{emotion}Score"] = scores
        print(f"Total matched words for emotion '{emotion}': {totalMatches}")

        
    return data

In [10]:
def positionalTaggingAndTokenization(data):
    tokenizedTexts = []
    posTagsList = []
    
    for text in data["Text"]:
        doc = nlp(str(text))
        tokens = [token.text for token in doc]
        posTags = [token.pos_ for token in doc]
        
        tokenizedTexts.append(tokens)
        posTagsList.append(posTags)

    data["Text"] = tokenizedTexts
    data["posTag"] = posTagsList
    
    return data

In [11]:
def stopWordRemoval(data):
    filteredTexts = []
    filteredPOS = []
    
    for tokens, tags in zip(data["Text"], data["posTag"]):
        filteredTokens = []
        filteredTags = []
        
        for word, tag in zip(tokens, tags):
            if word.lower() not in STOP_WORDS:
                filteredTokens.append(word)
                filteredTags.append(tag)
        
        filteredTexts.append(filteredTokens)
        filteredPOS.append(filteredTags)
    
    data["Text"] = filteredTexts
    data["posTag"] = filteredPOS
    
    return data


In [12]:
def word2vecEmbedding(data, word2vec, vectorSize=300, maxLen=35):
    tokenizedTexts = data['Text'].tolist()
    zeroVector = np.zeros(vectorSize, dtype=np.float32)
    sequences = []

    for tokens in tokenizedTexts:
        wordVectors = [word2vec[word] for word in tokens if word in word2vec]
        padded = wordVectors[:maxLen] + [zeroVector] * max(0, maxLen - len(wordVectors))
        sequences.append(np.array(padded, dtype=np.float32)) 

    data['embeddings'] = sequences  
    return word2vec, data


In [13]:
def encodeAndPadPosTags(data, maxlen=35):
    allPosTags = [tag for seq in data['posTag'] for tag in seq]
    
    posEncoder = LabelEncoder()
    posEncoder.fit(allPosTags)
    
    encodedSequences = [posEncoder.transform(seq) for seq in data['posTag']]
    posPadded = pad_sequences(encodedSequences, maxlen=maxlen, padding='post', truncating='post')
    
    data['posPadded'] = list(posPadded)
    return data, posEncoder

<h1 style="text-align:center; font-family:Georgia; font-weight:bold; ">Saving Preprocessed Data</h1>

In [14]:
# text = filterByTextLength(text)
# text = expandContractions(text)
# text = lexiconScore(text)
# text = positionalTaggingAndTokenization(text)
# text = stopWordRemoval(text)

In [55]:
def buildModel(maxlen, embeddingDim, posVocabSize, lexiconScoreDim, modelType):
    print(f"\nBuilding model with {modelType.__name__}")
    
    embeddingsInput = Input(shape=(maxlen, embeddingDim), name="embeddingsInput")
    posInput = Input(shape=(maxlen,), name="posInput")
    lexiconInput = Input(shape=(lexiconScoreDim,), name="lexiconInput")

    posEmbedding = Embedding(
        input_dim=posVocabSize, 
        output_dim=POS_EMBED_DIM,
        name="posEmbedding"
    )(posInput)

    embeddingsSeq = Bidirectional(
        modelType(LSTM_UNITS, return_sequences=True),
        name="rnnEmbedding"
    )(embeddingsInput)
    embeddingsSeq = GlobalMaxPooling1D()(embeddingsSeq)

    posSeq = Bidirectional(
        modelType(LSTM_UNITS, return_sequences=True),
        name="rnnPos"
    )(posEmbedding)
    posSeq = GlobalMaxPooling1D()(posSeq)

    print(f"embeddingsSeq shape: {embeddingsSeq.shape}")  
    print(f"posSeq shape: {posSeq.shape}")               
    print(f"lexiconInput shape: {lexiconInput.shape}") 

    features = Concatenate(name="features")([embeddingsSeq, posSeq, lexiconInput])
    
    totalFeatureDim = 2 * LSTM_UNITS + 2 * LSTM_UNITS + lexiconScoreDim  
    print(f"Concatenated feature shape: (None, {totalFeatureDim})")

    features = Dense(DENSE_UNITS, activation="relu")(features)
    features = Dropout(DROPOUT_RATE)(features)
    output = Dense(5, activation="softmax", name="output")(features)

    model = Model(inputs=[embeddingsInput, posInput, lexiconInput], outputs=output)
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
    model.summary()
    return model, modelType.__name__


In [16]:
def evaluateModel(yTrue, yPred):
    print(classification_report(yTrue, yPred, target_names=EMOTIONS))

    accuracy = accuracy_score(yTrue, yPred)
    print(f"Accuracy: {accuracy:.4f}")

    precision = precision_score(yTrue, yPred, average="weighted")
    print(f"Precision: {precision:.4f}")

    recall = recall_score(yTrue, yPred, average="weighted")
    print(f"Recall: {recall:.4f}")

    f1 = f1_score(yTrue, yPred, average="weighted")
    print(f"F1 Score: {f1:.4f}")

    cm = confusion_matrix(yTrue, yPred)
    print(f"Confusion Matrix:\n{cm}")

    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False)
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

In [None]:
def train(model, xTrain, yTrain, xVal, yVal, name):
    print(f"\nTraining {name} model...")
    model.fit(
        [
           xTrain[0],
           xTrain[1],
           xTrain[2],
        ],
        yTrain,
        validation_data=(
            [
                xVal[0],
                xVal[1],
                xVal[2],
            ],
            yVal
        ),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        verbose=True,
    )
    return model


In [None]:
def compare(embeddingDim, posVocabSize, lexiconCols, xTrain, yTrain, xVal, yVal):
    for modelType in MODELS:
        model, name = buildModel(MAXLEN, embeddingDim, posVocabSize, len(lexiconCols), modelType)
        model = train(model, xTrain, yTrain, xVal , yVal, name)
        savePath = f"Models/{name}.h5"
        model.save(savePath)
        print(f"Model saved to {savePath}")

In [51]:
data = pd.read_csv("Preprocessed Data/processedText.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 620227 entries, 0 to 620226
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Text            620227 non-null  object 
 1   Emotion         620227 non-null  object 
 2   happinessScore  620227 non-null  float64
 3   neutralScore    620227 non-null  float64
 4   sadnessScore    620227 non-null  float64
 5   angerScore      620227 non-null  float64
 6   fearScore       620227 non-null  float64
 7   posTag          620227 non-null  object 
dtypes: float64(5), object(3)
memory usage: 37.9+ MB


In [52]:
def stratifiedSample(data, labelColumn, samplesPerClass):
    return (
        data.groupby(labelColumn, group_keys=False)
        .apply(lambda x: x.sample(min(len(x), samplesPerClass), random_state=42))
        .reset_index(drop=True)
    )

data = stratifiedSample(data, labelColumn='Emotion', samplesPerClass=20000)
embedder, data = word2vecEmbedding(data, word2vec)


In [48]:
data.info()
csv = data.copy()
csv.to_csv("Preprocessed Data/processedTextMedium.csv", index=False)
csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Text            100000 non-null  object 
 1   Emotion         100000 non-null  object 
 2   happinessScore  100000 non-null  float64
 3   neutralScore    100000 non-null  float64
 4   sadnessScore    100000 non-null  float64
 5   angerScore      100000 non-null  float64
 6   fearScore       100000 non-null  float64
 7   posTag          100000 non-null  object 
 8   embeddings      100000 non-null  object 
dtypes: float64(5), object(4)
memory usage: 6.9+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Text            100000 non-null  object 
 1   Emotion         100000 non-null  object 
 2   happinessScore  100000 non-null  fl

In [53]:
def prepareData(data):
    data['posTag'] = data['posTag'].apply(ast.literal_eval)
    data, posEncoder = encodeAndPadPosTags(data)

    posVocabSize = len(posEncoder.classes_)
    embeddingDim = data['embeddings'][0].shape[1]
    lexiconCols = [f"{emotion}Score" for emotion in EMOTIONS]
    
    data[lexiconCols] = scale(data[lexiconCols], f="train")

    xEmbeddings = np.stack(data['embeddings'].to_numpy()).astype(np.float32)
    xPos = np.array(data['posPadded'].tolist())
    xLex = data[lexiconCols].values

    labelEncoder = LabelEncoder()
    yEncoded = labelEncoder.fit_transform(data['Emotion'])
    yCategorical = to_categorical(yEncoded)

    xTrainEmbed, xValEmbed, xTrainPos, xValPos, xTrainLex, xValLex, yTrain, yVal = train_test_split(
        xEmbeddings, xPos, xLex, yCategorical, test_size=0.2, random_state=42
    )
    
    xTrain = [xTrainEmbed, xTrainPos, xTrainLex]
    xVal = [xValEmbed, xValPos, xValLex]
    return xTrain, yTrain, xVal, yVal, embeddingDim, posVocabSize, lexiconCols


xTrain, yTrain, xVal, yVal, embeddingDim, posVocabSize, lexiconCols = prepareData(data)

In [58]:
compare(embeddingDim, posVocabSize, lexiconCols, xTrain, yTrain, xVal, yVal)



Building model with LSTM
embeddingsSeq shape: (None, 256)
posSeq shape: (None, 256)
lexiconInput shape: (None, 5)
Concatenated feature shape: (None, 517)



Training LSTM model...
Epoch 1/12




[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m507s[0m 201ms/step - accuracy: 0.4451 - loss: 1.3521 - val_accuracy: 0.5906 - val_loss: 1.0337
Epoch 2/12
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m560s[0m 224ms/step - accuracy: 0.6050 - loss: 1.0027 - val_accuracy: 0.6453 - val_loss: 0.8796
Epoch 3/12
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m488s[0m 195ms/step - accuracy: 0.6590 - loss: 0.8664 - val_accuracy: 0.6626 - val_loss: 0.8246
Epoch 4/12
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m532s[0m 213ms/step - accuracy: 0.6767 - loss: 0.8061 - val_accuracy: 0.6781 - val_loss: 0.7908
Epoch 5/12
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m474s[0m 190ms/step - accuracy: 0.6869 - loss: 0.7789 - val_accuracy: 0.6844 - val_loss: 0.7821
Epoch 6/12
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m528s[0m 211ms/step - accuracy: 0.6919 - loss: 0.7586 - val_accuracy: 0.6851 - val_loss: 0.7679
Epo


Training GRU model...
Epoch 1/12
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m523s[0m 208ms/step - accuracy: 0.4234 - loss: 1.3801 - val_accuracy: 0.5595 - val_loss: 1.0997
Epoch 2/12
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m544s[0m 218ms/step - accuracy: 0.5823 - loss: 1.0628 - val_accuracy: 0.6378 - val_loss: 0.9373
Epoch 3/12
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m482s[0m 193ms/step - accuracy: 0.6358 - loss: 0.9188 - val_accuracy: 0.6558 - val_loss: 0.8551
Epoch 4/12
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m547s[0m 219ms/step - accuracy: 0.6578 - loss: 0.8634 - val_accuracy: 0.6584 - val_loss: 0.8535
Epoch 5/12
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m483s[0m 193ms/step - accuracy: 0.6709 - loss: 0.8253 - val_accuracy: 0.6597 - val_loss: 0.8301
Epoch 6/12
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m544s[0m 218ms/step - accuracy: 0.6730 - loss: 0.8071 - val_accur


Training SimpleRNN model...
Epoch 1/12
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m188s[0m 74ms/step - accuracy: 0.3862 - loss: 1.4374 - val_accuracy: 0.5508 - val_loss: 1.1430
Epoch 2/12
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m181s[0m 72ms/step - accuracy: 0.5365 - loss: 1.1700 - val_accuracy: 0.5875 - val_loss: 1.0611
Epoch 3/12
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 68ms/step - accuracy: 0.5841 - loss: 1.0598 - val_accuracy: 0.6148 - val_loss: 0.9813
Epoch 4/12
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m183s[0m 73ms/step - accuracy: 0.6058 - loss: 1.0091 - val_accuracy: 0.6328 - val_loss: 0.9121
Epoch 5/12
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 69ms/step - accuracy: 0.6169 - loss: 0.9760 - val_accuracy: 0.6423 - val_loss: 0.8879
Epoch 6/12
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 74ms/step - accuracy: 0.6268 - loss: 0.9469 - val_accur