In [3]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

# # Download NLTK resources
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emans\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emans\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\emans\AppData\Roaming\nltk_data...


True

In [4]:
class MELDDataset(Dataset):
    def __init__(self, csv_file, transform=None):
        self.data = pd.read_csv(csv_file)
        self.transform = transform
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        text = self.data.iloc[idx]['Utterance']
        emotion = self.data.iloc[idx]['Emotion']
        
        sample = {'text': text, 'emotion': emotion}
        
        if self.transform:
            sample = self.transform(sample)
        
        return sample

In [5]:
class TextPreprocessor:
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def __call__(self, sample):
        text = sample['text']
        # Tokenization
        tokens = word_tokenize(text)
        # Lowercasing
        tokens = [token.lower() for token in tokens]
        # Removing punctuation
        tokens = [token for token in tokens if token not in string.punctuation]
        # Removing stopwords
        tokens = [token for token in tokens if token not in self.stop_words]
        # Lemmatization
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens]
        # Removing special characters and numbers
        tokens = [re.sub(r'[^a-zA-Z]', '', token) for token in tokens if token]
        # Join tokens back into a string
        preprocessed_text = ' '.join(tokens)

        return {'text': preprocessed_text, 'emotion': sample['emotion']}

In [45]:
csv_file_path = r'D:\College\Fourth Year\GP\Meld\train_sent_emo.csv'
meld_dataset = MELDDataset(csv_file=csv_file_path)

# Split the dataset into training and validation sets
train_dataset, val_dataset = train_test_split(meld_dataset, test_size=0.2, random_state=42)

# Create DataLoader instances for training and validation sets
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [28]:
def get_preprocessing_pipeline(k=10000):
    return Pipeline([
        ('feature_extraction', TfidfVectorizer(ngram_range=(1, 2), stop_words="english", sublinear_tf=True)),
        ('feature_selection', SelectKBest(chi2, k=k))
    ])

# Functions to get different classifiers with specified k and class weights
def get_linear_svm_classifier(k, class_weight=None):
    return Pipeline([
        ('preprocessing', get_preprocessing_pipeline(k)),
        ('classifier', LinearSVC(C=1.0, penalty='l1', max_iter=3000, dual=False, class_weight=class_weight))
    ])

def get_svm_classifier(k, class_weight=None):
    return Pipeline([
        ('preprocessing', get_preprocessing_pipeline(k)),
        ('classifier', SVC(kernel='rbf', probability=True, class_weight=class_weight))
    ])

def get_decision_tree_classifier(k, class_weight=None):
    return Pipeline([
        ('preprocessing', get_preprocessing_pipeline(k)),
        ('classifier', DecisionTreeClassifier(random_state=0, class_weight=class_weight))
    ])

def get_logistic_regression_classifier(k, class_weight=None):
    return Pipeline([
        ('preprocessing', get_preprocessing_pipeline(k)),
        ('classifier', LogisticRegression(random_state=0, max_iter=50, penalty='l2', class_weight=class_weight))
    ])

# Function to create ensemble classifier with specified k and class weights
def ensemble_classifiers(k, class_weight=None):
    linear_svm_classifier = get_linear_svm_classifier(k, class_weight=class_weight)
    svm_classifier = get_svm_classifier(k, class_weight=class_weight)
    decision_tree_classifier = get_decision_tree_classifier(k, class_weight=class_weight)
    logistic_regression_classifier = get_logistic_regression_classifier(k, class_weight=class_weight)

    return VotingClassifier(estimators=[
        ("linear_svm_classifier", linear_svm_classifier),
        ("svm_classifier", svm_classifier),
        ("decision_tree_classifier", decision_tree_classifier),
        ("logistic_regression_classifier", logistic_regression_classifier)
    ])

In [49]:
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder
import numpy as np
num_features = len(train_dataset[0]['text'].split())
k = min(num_features, 10000)


# Define the class labels
class_labels = ['neutral', 'joy', 'surprise', 'anger', 'sadness', 'disgust', 'fear']

# Map class labels to their corresponding indices
class_indices = {label: index for index, label in enumerate(class_labels)}

# Extract target emotions from the train_dataset
emotions = [item['emotion'] for item in train_dataset]

# Convert emotions to class indices
class_indices_array = np.array([class_indices[emotion] for emotion in emotions])

# Calculate class weights based on the inverse of class frequencies
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(class_indices_array), y=class_indices_array)

label_encoder = LabelEncoder()
integer_labels = label_encoder.fit_transform(emotions)

# class_weight_dict = {class_labels[i]: weight for i, weight in enumerate(class_weights)}
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}
# Create a dictionary mapping class labels to class weights
# class_weight_dict = dict(zip(class_labels, class_weights))

print("Class Weights:", class_weight_dict)


model = ensemble_classifiers(k,class_weight = class_weight_dict)
# Training loop
for batch in train_dataloader:
    texts = batch['text']
    emotions = batch['emotion']
    model.fit(texts, emotions)

# Validation loop
correct = 0
total = 0
with torch.no_grad():
    for batch in val_dataloader:
        texts = batch['text']
        emotions = batch['emotion']
        predicted_emotions = model.predict(texts)
        correct += (predicted_emotions == emotions).sum().item()
        total += len(emotions)

accuracy = correct / total
print(f'Validation Accuracy: {accuracy:.4f}')

Class Weights: {0: 0.3064621284755513, 1: 0.8090513313759239, 2: 1.1672509494595384, 3: 1.2928328749393303, 4: 2.060598246518824, 5: 5.28505291005291, 6: 5.096301020408164}
Validation Accuracy: 0.4850


In [50]:
def predict_emotion(msgs):
    emotion_label = model.predict(msgs)
    return emotion_label

In [51]:
msgs = ["I'm feeling happy today", "sad"]
predicted_emotions = predict_emotion(msgs)
print(predicted_emotions)

['neutral' 'neutral']
