<h1 style="text-align:center; font-family:Georgia; font-weight:bold; ">Imports</h1>

In [1]:
import re
import nltk
import json
import spacy
import numpy as np
import pandas as pd
from collections import Counter
from num2words import num2words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
nltk.download("punkt_tab")
nltk.download("stopwords")
nltk.download("wordnet")
spacy.cli.download("en_core_web_sm")
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

<h1 style="text-align:center; font-family:Georgia; font-weight:bold; ">Constants and Global Variables</h1>

In [3]:
text = pd.read_csv('Collected Datasets/text.csv')

EMOTIONS = ['happiness', 'neutral', 'sadness', 'anger', 'fear', ]
MAPPER = {emotion: [] for emotion in EMOTIONS}

scaler = StandardScaler()
encoder = LabelEncoder()
encoder.classes_ = np.array(EMOTIONS)

nlp = spacy.load("en_core_web_sm")
lemmatizer = nltk.WordNetLemmatizer()
STOP_WORDS = set(stopwords.words("english"))

In [4]:
'''Emotional Lexicons'''
wordMap = pd.read_csv('wordMap.csv')
for emotion in wordMap.columns.to_list():    
    if emotion.lower() in EMOTIONS: MAPPER[emotion.lower()] = wordMap[emotion].to_list()
    if emotion in ['Calm', 'Boredom']: MAPPER['neutral'] += wordMap[emotion].to_list()
    if emotion in ['Excitement', 'Pride']: MAPPER['happiness'] += wordMap[emotion].to_list()
    if emotion in ['Disgust', 'Frustration', 'Contempt']: MAPPER['anger'] += wordMap[emotion].to_list()

In [5]:
'''Contractions'''
with open("Common English Contractions/contractions.json" , 'r') as file:
    contractions = json.load(file)
    
contractions = pd.DataFrame(list(contractions.items()), columns=["Contraction", "Meaning"])
contractions = pd.concat([contractions, pd.read_csv("Common English Contractions/contractions.csv")], ignore_index=True)
contractions.drop_duplicates(inplace=True)
contractions["Contraction"] = contractions["Contraction"].str.lower()
contractions["Meaning"] = contractions["Meaning"].str.lower()
contractions.info()
contractions = contractions.set_index("Contraction").to_dict()["Meaning"]

<class 'pandas.core.frame.DataFrame'>
Index: 195 entries, 0 to 262
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Contraction  195 non-null    object
 1   Meaning      195 non-null    object
dtypes: object(2)
memory usage: 4.6+ KB


<h1 style="text-align:center; font-family:Georgia; font-weight:bold; ">Cleaning and Preprocessing</h1>

In [6]:
def scale(data, f="train"):
    if f == "train":
        data = scaler.fit_transform(data)
    else:
        data = scaler.transform(data)
    return data

In [7]:
def encode(data, f="train"):
    if f == "train":
        data = encoder.fit_transform(data)
    else:
        data = encoder.transform(data)
    return data

In [None]:
def expand(data):
    count = 0
    for contraction, meaning in contractions.items():
        count += data['Text'].apply(lambda line: len(re.findall(rf'\b{contraction}\b', line))).sum()
        data['Text'] = data['Text'].apply(
            lambda line: re.sub(rf'\b{contraction}\b', meaning, line)
            )
        
    print("Number of contractions removed:", count)
    
    return data

In [None]:
def extractNER(data):
    data['Entities'] = data['Text'].apply(
    lambda text: [(ent.text, ent.label_) for ent in nlp(text).ents]
    )
    
    return data

In [None]:
def lexiconScore(data):
    for emotion, keywords in MAPPER.items():
        scores = []
        for text in data['Text']:
            words = text.lower().split()
            counter = Counter(words)
            totalWords = len(words)
            score = sum(counter[word] for word in keywords)
            normalizedScore = score / totalWords if totalWords > 0 else 0.0
            scores.append(normalizedScore)
        data[f"{emotion}Score"] = scores
        
    return data

In [None]:
def stopWordRemoval(data):
    data["Text"] = data["Text"].apply(
        lambda removeStopWords: [word for word in removeStopWords if word not in STOP_WORDS]
        )
    
    return data

In [None]:
def tokenize(data):
    data["Text"] = data["Text"].apply(
        lambda text: word_tokenize(text)
        )
    
    return data

Unnamed: 0,Text,Emotion
0,i expect and i feel content with that,happiness
1,i just couldnt help feeling a little bit bitte...,happiness
2,i dunno how it feels to be completely happy th...,happiness
3,i walk in the door to my house i feel happy,happiness
4,i feel satisfied and happy with my choices today,happiness
