<h1 style="text-align:center; font-family:Georgia; font-weight:bold; ">Imports</h1>

In [None]:
import re
import nltk
import json
import spacy
import numpy as np
import pandas as pd
from collections import Counter
import gensim.downloader as api
from gensim.models import FastText
from nltk.corpus import stopwords
from tensorflow.keras import layers, Input 
from tensorflow.keras.models import Sequential, Model
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, Attention, Concatenate, Flatten

In [32]:
word2vec = api.load("word2vec-google-news-300")  
nltk.download("punkt_tab")
nltk.download("stopwords")
nltk.download("wordnet")
spacy.cli.download("en_core_web_sm")
nltk.download('averaged_perceptron_tagger_eng')
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


<h1 style="text-align:center; font-family:Georgia; font-weight:bold; ">Constants and Global Variables</h1>

In [33]:
text = pd.read_csv('Collected Datasets/text.csv')

EMOTIONS = ['happiness', 'neutral', 'sadness', 'anger', 'fear', ]
MAPPER = {emotion: [] for emotion in EMOTIONS}

scaler = StandardScaler()
encoder = LabelEncoder()
encoder.classes_ = np.array(EMOTIONS)

nlp = spacy.load("en_core_web_sm")
lemmatizer = nltk.WordNetLemmatizer()
STOP_WORDS = set(stopwords.words("english"))
STOP_WORDS = STOP_WORDS.difference({'no', 'not', 'nor', 'never', "n't"})

In [34]:
'''Emotional Lexicons'''
wordMap = pd.read_csv('wordMap.csv')
for emotion in wordMap.columns.to_list():
    words = [str(word).lower() for word in wordMap[emotion].dropna()]
    if emotion.lower() in EMOTIONS: MAPPER[emotion.lower()] = words
    if emotion in ['Calm', 'Boredom']: MAPPER['neutral'] += words   
    if emotion in ['Excitement', 'Pride']: MAPPER['happiness'] += words
    if emotion in ['Disgust', 'Frustration', 'Contempt']: MAPPER['anger'] += words

print(MAPPER)


{'happiness': ['joyful', 'ecstatic', 'content', 'cheerful', 'elated', 'delighted', 'pleased', 'radiant', 'euphoric', 'jovial', 'happy', 'merry', 'exuberant', 'overjoyed', 'satisfied', 'grateful', 'sunny', 'bubbly', 'lively', 'gleeful', 'glad', 'in high spirits', 'laughing', 'thrilled', 'blissful', 'carefree', 'exhilarated', 'optimistic', 'chipper', 'buoyant', 'uplifted', 'beaming', 'jubilant', 'proud', 'alive', 'zestful', 'lighthearted', 'upbeat', 'charmed', 'tickled pink', 'rapturous', 'in a good mood', 'on cloud nine', 'pumped', 'excited', 'grinning', 'eager', 'silly', 'pleased as punch', 'mirthful', 'vibrant', 'giddy', 'serene', 'festive', 'playful', 'contented', 'radiant', 'laughing fit', 'high-spirited', 'in a good place', 'carefree', 'light', 'breezy', 'sociable', 'enthusiastic', 'pleased with oneself', 'joyous', 'sweet', 'sanguine', 'delighted beyond words', 'up and about', 'full of life', 'tickled', 'exultant', 'bouncy', 'happy-go-lucky', 'energetic', 'on top of the world', 'hi

In [35]:
'''Contractions'''
with open("Common English Contractions/contractions.json" , 'r') as file:
    contractions = json.load(file)
    
contractions = pd.DataFrame(list(contractions.items()), columns=["Contraction", "Meaning"])
contractions = pd.concat([contractions, pd.read_csv("Common English Contractions/contractions.csv")], ignore_index=True)
contractions.drop_duplicates(inplace=True)
contractions["Contraction"] = contractions["Contraction"].str.lower()
contractions["Meaning"] = contractions["Meaning"].str.lower()
contractions.info()
contractions = contractions.set_index("Contraction").to_dict()["Meaning"]

<class 'pandas.core.frame.DataFrame'>
Index: 195 entries, 0 to 262
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Contraction  195 non-null    object
 1   Meaning      195 non-null    object
dtypes: object(2)
memory usage: 4.6+ KB


<h1 style="text-align:center; font-family:Georgia; font-weight:bold; ">Cleaning and Preprocessing</h1>

<pre>
- All characters are lowercase
- No non alphabetic characters or numbers
</pre>

In [36]:
def scale(data, f="train"):
    if f == "train":
        data = scaler.fit_transform(data)
    else:
        data = scaler.transform(data)
    return data

In [37]:
def encode(data, f="train"):
    if f == "train":
        data = encoder.fit_transform(data)
    else:
        data = encoder.transform(data)
    return data

In [38]:
def filterByTextLength(data, minLen=5, maxLen=35):
    def _lengthFilter(text):
        if isinstance(text, str):
            length = len(text.split())
            return minLen <= length <= maxLen
        return False 
    
    filteredData = data[data["Text"].apply(_lengthFilter)].reset_index(drop=True)
    return filteredData

In [39]:
def expandContractions(data):
    count = 0
    for contraction, meaning in contractions.items():
        count += data['Text'].apply(lambda line: len(re.findall(rf'\b{contraction}\b', line))).sum()
        data['Text'] = data['Text'].apply(
            lambda line: re.sub(rf'\b{contraction}\b', meaning, line)
            )
        
    print("Number of contractions removed:", count)
    
    return data

In [40]:
def lexiconScore(data):
    for emotion, keywords in MAPPER.items():
        scores = []
        totalMatches = 0 
        for text in data['Text']:
            words = text.lower().split()
            counter = Counter(words)
            totalWords = len(words)
            score = sum(counter[word] for word in keywords)
            normalizedScore = score / totalWords if totalWords > 0 else 0.0
            scores.append(normalizedScore)
            totalMatches += score
            
        data[f"{emotion}Score"] = scores
        print(f"Total matched words for emotion '{emotion}': {totalMatches}")

        
    return data

In [41]:
def positionalTaggingAndTokenization(data):
    tokenizedTexts = []
    posTagsList = []
    
    for text in data["Text"]:
        doc = nlp(str(text))
        tokens = [token.text for token in doc]
        posTags = [token.pos_ for token in doc]
        
        tokenizedTexts.append(tokens)
        posTagsList.append(posTags)

    data["Text"] = tokenizedTexts
    data["posTag"] = posTagsList
    
    return data

In [42]:
def stopWordRemoval(data):
    filteredTexts = []
    filteredPOS = []
    
    for tokens, tags in zip(data["Text"], data["posTag"]):
        filteredTokens = []
        filteredTags = []
        
        for word, tag in zip(tokens, tags):
            if word.lower() not in STOP_WORDS:
                filteredTokens.append(word)
                filteredTags.append(tag)
        
        filteredTexts.append(filteredTokens)
        filteredPOS.append(filteredTags)
    
    data["Text"] = filteredTexts
    data["posTag"] = filteredPOS
    
    return data


In [43]:
def fastTextEmbedding(data, vectorSize=100, window=5, minCount=1, epochs=10, maxLen=35):
    tokenizedTexts = data['Text'].tolist()
    embedder = FastText(sentences=tokenizedTexts, vector_size=vectorSize, window=window, min_count=minCount, epochs=epochs)
    
    sequences = []
    for tokens in tokenizedTexts:
        wordVectors = [embedder.wv[word] for word in tokens if word in embedder.wv]
        sequences.append(wordVectors)
    
    zeroVector = np.zeros(vectorSize)
    paddedSequences = []
    for seq in sequences:
        if len(seq) > maxLen:
            seq = seq[:maxLen]
        else:
            seq = seq + [zeroVector] * (maxLen - len(seq))
        paddedSequences.append(seq)
    
    embeddings = np.array(paddedSequences)
    data['embeddings'] = list(embeddings)

    return embedder, data


In [None]:
def encodeAndPadPosTags(data, maxlen=35):
    allPosTags = [tag for seq in data['posTag'] for tag in seq]
    
    posEncoder = LabelEncoder()
    posEncoder.fit(allPosTags)
    
    encodedSequences = [posEncoder.transform(seq) for seq in data['posTag']]
    posPadded = pad_sequences(encodedSequences, maxlen=maxlen, padding='post', truncating='post')
    
    data['posPadded'] = list(posPadded)
    return data, posEncoder

<h1 style="text-align:center; font-family:Georgia; font-weight:bold; ">Saving Preprocessed Data</h1>

In [44]:
text = filterByTextLength(text)
text = expandContractions(text)
text = lexiconScore(text)
text = positionalTaggingAndTokenization(text)
text = stopWordRemoval(text)
text = fastTextEmbedding(text)

Number of contractions removed: 2394
Total matched words for emotion 'happiness': 90858
Total matched words for emotion 'neutral': 39163
Total matched words for emotion 'sadness': 113783
Total matched words for emotion 'anger': 174671
Total matched words for emotion 'fear': 81027


In [46]:
text[1].to_csv("Preprocessed Data/processedText.csv",  index=False)

In [None]:
def buildModel(inputShape, modelType):
    print(f"\nBuilding model with {modelType.__name__}")

    model = Sequential()
    model.add(layers.Input(shape=inputShape))

    model.add(Bidirectional(modelType(120, return_sequences=True)))

    model.add(layers.GlobalAveragePooling1D())

    model.add(Dense(64, activation="relu"))
    model.add(Dropout(0.3))
    model.add(Dense(len(EMOTIONS), activation="softmax"))

    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

    return model
