In [3]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

# # Download NLTK resources
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emans\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emans\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\emans\AppData\Roaming\nltk_data...


True

In [4]:
class MELDDataset(Dataset):
    def __init__(self, csv_file, transform=None):
        self.data = pd.read_csv(csv_file)
        self.transform = transform
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        text = self.data.iloc[idx]['Utterance']
        emotion = self.data.iloc[idx]['Emotion']
        
        sample = {'text': text, 'emotion': emotion}
        
        if self.transform:
            sample = self.transform(sample)
        
        return sample

In [5]:
class TextPreprocessor:
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def __call__(self, sample):
        text = sample['text']
        # Tokenization
        tokens = word_tokenize(text)
        # Lowercasing
        tokens = [token.lower() for token in tokens]
        # Removing punctuation
        tokens = [token for token in tokens if token not in string.punctuation]
        # Removing stopwords
        tokens = [token for token in tokens if token not in self.stop_words]
        # Lemmatization
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens]
        # Removing special characters and numbers
        tokens = [re.sub(r'[^a-zA-Z]', '', token) for token in tokens if token]
        # Join tokens back into a string
        preprocessed_text = ' '.join(tokens)

        return {'text': preprocessed_text, 'emotion': sample['emotion']}

In [82]:
csv_file_path = r'D:\College\Fourth Year\GP\Meld\train_sent_emo.csv'
meld_dataset = MELDDataset(csv_file=csv_file_path, transform=TextPreprocessor())

# Split the dataset into training and validation sets
train_dataset, val_dataset = train_test_split(meld_dataset, test_size=0.2, random_state=42)

# Create DataLoader instances for training and validation sets
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [54]:
# Define parameter grids for grid search
param_grids = {
    "linear_svm": {
        'classifier__C': [0.01, 0.1, 1.0, 10.0],
        'classifier__penalty': ['l1', 'l2'],
    },
    "svm": {
        'classifier__C': [0.01, 0.1, 1.0, 10.0],
        'classifier__kernel': ['linear', 'rbf'],
        'classifier__gamma': ['scale', 'auto'],
    },
    "decision_tree": {
        'classifier__max_depth': [None, 10, 20, 30],
        'classifier__min_samples_split': [2, 5, 10],
    },
    "logistic_regression": {
        'classifier__C': [0.01, 0.1, 1.0, 10.0],
        'classifier__penalty': ['l1', 'l2'],
    },
    "random_forest": {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 10, 20, 30],
        'classifier__min_samples_split': [2, 5, 10],
    },
    "gradient_boosting": {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__learning_rate': [0.01, 0.1, 0.5],
        'classifier__max_depth': [3, 5, 7],
    }
}


In [88]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import cross_val_score
import numpy as np
import matplotlib.pyplot as plt

# Define a TF-IDF vectorizer
def get_tfidf_vectorizer(k=10000):
    return TfidfVectorizer(ngram_range=(1, 2), stop_words="english", sublinear_tf=True,lowercase=False, max_features=k)

# Define a preprocessing pipeline with feature selection
def get_preprocessing_pipeline(vectorizer, feature_selector=None):
    if feature_selector:
        return Pipeline([
            ('features', FeatureUnion([
                ('vectorizer', vectorizer),
                ('feature_selector', feature_selector)
            ]))
        ])
    else:
        return Pipeline([
            ('vectorizer', vectorizer),
        ])

# Function to get a feature selector based on feature importances
def get_feature_selector(k=10000):
    return SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42), threshold='median')

# Functions to get different classifiers with specified k and class weights
def get_linear_svm_classifier(k, class_weight=None):
    preprocessing_pipeline = get_preprocessing_pipeline(get_tfidf_vectorizer(k))
    return Pipeline([
        ('preprocessing', preprocessing_pipeline),
        ('classifier', LinearSVC(C=1.0, penalty='l1', max_iter=3000, dual=False, class_weight=class_weight))
    ])

def get_svm_classifier(k, class_weight=None):
    preprocessing_pipeline = get_preprocessing_pipeline(get_tfidf_vectorizer(k))
    return Pipeline([
        ('preprocessing', preprocessing_pipeline),
        ('classifier', SVC(kernel='rbf', probability=True, class_weight=class_weight))
    ])

def get_decision_tree_classifier(k, class_weight=None):
    preprocessing_pipeline = get_preprocessing_pipeline(get_tfidf_vectorizer(k), get_feature_selector(k))
    return Pipeline([
        ('preprocessing', preprocessing_pipeline),
        ('classifier', DecisionTreeClassifier(random_state=0, class_weight=class_weight))
    ])

def get_logistic_regression_classifier(k, class_weight=None):
    preprocessing_pipeline = get_preprocessing_pipeline(get_tfidf_vectorizer(k), get_feature_selector(k))
    return Pipeline([
        ('preprocessing', preprocessing_pipeline),
        ('classifier', LogisticRegression(random_state=0, max_iter=50, penalty='l2', class_weight=class_weight))
    ])

def get_random_forest_classifier():
    return RandomForestClassifier(n_estimators=100, random_state=42)

def get_gradient_boosting_classifier():
    return GradientBoostingClassifier(random_state=42)

# Function to create ensemble classifier with specified k and class weights
def ensemble_classifiers(k, class_weight=None):
    linear_svm_classifier = get_linear_svm_classifier(k, class_weight=class_weight)
    svm_classifier = get_svm_classifier(k, class_weight=class_weight)
    decision_tree_classifier = get_decision_tree_classifier(k, class_weight=class_weight)
    logistic_regression_classifier = get_logistic_regression_classifier(k, class_weight=class_weight)

    return VotingClassifier(estimators=[
        ("linear_svm_classifier", linear_svm_classifier),
        ("svm_classifier", svm_classifier),
        ("decision_tree_classifier", decision_tree_classifier),
        ("logistic_regression_classifier", logistic_regression_classifier)
    ])

# Function for k-fold cross-validation
def cross_validation(model, X, y, cv=5):
    scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy', n_jobs=-1)
    return scores.mean()

from sklearn.model_selection import GridSearchCV
from scipy.sparse import issparse
# Function for grid search
def grid_search(classifier, param_grid, X_train, y_train):
    grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_

# Function to train and evaluate the ensemble model
def train_and_evaluate_model2(X_train, y_train, X_val, y_val,class_weight,len_features):
    # Define TF-IDF vectorizer and preprocessing pipeline
    vectorizer = get_tfidf_vectorizer()
    preprocessing_pipeline = get_preprocessing_pipeline(vectorizer)

    # Preprocess the data
    if issparse(X_train):
        X_train_processed = X_train if isinstance(X_train, np.ndarray) else X_train.toarray()
    else:
        X_train_processed = preprocessing_pipeline.fit_transform(X_train)
    
    if issparse(X_val):
        X_val_processed = X_val if isinstance(X_val, np.ndarray) else X_val.toarray()
    else:
        X_val_processed = preprocessing_pipeline.transform(X_val)
    # Get feature names
    # feature_names = vectorizer.get_feature_names()

    # Define classifiers
    classifiers = {
        "linear_svm": get_linear_svm_classifier(len_features,class_weight),
        "svm": get_svm_classifier(len_features,class_weight),
        "decision_tree": get_decision_tree_classifier(len_features,class_weight),
        "logistic_regression": get_logistic_regression_classifier(len_features,class_weight),
        "random_forest": get_random_forest_classifier(),
        "gradient_boosting": get_gradient_boosting_classifier()
    }

    # Train and evaluate each classifier
    best_models = {}
    for name, classifier in classifiers.items():
        # Grid search for hyperparameter tuning
        best_model = grid_search(classifier, param_grids[name], X_train_processed, y_train)
        best_models[name] = best_model

        # Evaluate model using k-fold cross-validation
        cv_score = cross_validation(best_model, X_train_processed, y_train)
        cv_scores[name] = cv_score
        print(f"{name} CV Score: {cv_score:.4f}")

    # Select the best model based on cross-validation scores
    best_model_name = max(best_models, key=lambda k: cv_scores[k])
    best_model = best_models[best_model_name]

    # Train the best model on the entire training set
    best_model.fit(X_train_processed, y_train)

    # Evaluate the best model on the validation set
    val_score = best_model.score(X_val_processed, y_val)
    print(f"Best Model Validation Score: {val_score:.4f}")

    # Feature importance analysis for interpretability
    # if best_model_name in ['random_forest', 'gradient_boosting']:
    #     print(f"Feature Importance Analysis for {best_model_name}:")
    #     feature_importance_analysis(best_model, X_train_processed, feature_names)

    return best_model


In [84]:
def train_and_evaluate_model(train_dataloader, val_dataloader,class_weight,len_features):
    # Define TF-IDF vectorizer and preprocessing pipeline
    vectorizer = get_tfidf_vectorizer()
    preprocessing_pipeline = get_preprocessing_pipeline(vectorizer)

    # Initialize lists to store batch-wise data
    X_train_texts = []
    y_train_emotions = []
    X_val_texts = []
    y_val_emotions = []

    # Iterate through training DataLoader to collect data
    for batch in train_dataloader:
        X_train_texts.extend(batch['text'])
        y_train_emotions.extend(batch['emotion'])

    # Iterate through validation DataLoader to collect data
    for batch in val_dataloader:
        X_val_texts.extend(batch['text'])
        y_val_emotions.extend(batch['emotion'])

    # Convert None values to empty strings
    X_train_texts = [str(text) if text is not None else '' for text in X_train_texts]
    X_val_texts = [str(text) if text is not None else '' for text in X_val_texts]
    y_train_emotions = [str(emotion) if emotion is not None else '' for emotion in y_train_emotions]
    y_val_emotions = [str(emotion) if emotion is not None else '' for emotion in y_val_emotions]

    # Inspect the content
    print("X_train_texts:", X_train_texts[:5])
    print("y_train_emotions:", y_train_emotions[:5])
    X_train_processed = preprocessing_pipeline.fit_transform(X_train_texts)
    X_val_processed = preprocessing_pipeline.transform(X_val_texts)

    # Call the train_and_evaluate_model function with collected data
    best_model = train_and_evaluate_model2(X_train_texts, y_train_emotions, X_val_texts, y_val_emotions,class_weight,len_features)
    return best_model


In [89]:
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder
import numpy as np
num_features = len(train_dataset[0]['text'].split())
k = min(num_features, 10000)


# Define the class labels
class_labels = ['neutral', 'joy', 'surprise', 'anger', 'sadness', 'disgust', 'fear']

# Map class labels to their corresponding indices
class_indices = {label: index for index, label in enumerate(class_labels)}

# Extract target emotions from the train_dataset
emotions = [item['emotion'] for item in train_dataset]

# Convert emotions to class indices
class_indices_array = np.array([class_indices[emotion] for emotion in emotions])

# Calculate class weights based on the inverse of class frequencies
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(class_indices_array), y=class_indices_array)

label_encoder = LabelEncoder()
integer_labels = label_encoder.fit_transform(emotions)

# class_weight_dict = {class_labels[i]: weight for i, weight in enumerate(class_weights)}
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}
num_features = len(train_dataset[0]['text'].split())
k = min(num_features, 10000)
model = train_and_evaluate_model(train_dataloader, val_dataloader,class_weight_dict,k)


X_train_texts: ['yes uh yes friend eddie moskowitz yeah like reaffirms faith', 'ahh yes', 'hmmmm', 'forget british chippy ', 'yeah']
y_train_emotions: ['neutral', 'neutral', 'neutral', 'sadness', 'surprise']


ValueError: 
All the 40 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Python\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Python\Lib\site-packages\sklearn\pipeline.py", line 401, in fit
    Xt = self._fit(X, y, **fit_params_steps)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Python\Lib\site-packages\sklearn\pipeline.py", line 359, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Python\Lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Python\Lib\site-packages\sklearn\pipeline.py", line 893, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Python\Lib\site-packages\sklearn\pipeline.py", line 445, in fit_transform
    return last_step.fit_transform(Xt, y, **fit_params_last_step)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Python\Lib\site-packages\sklearn\feature_extraction\text.py", line 2133, in fit_transform
    X = super().fit_transform(raw_documents)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Python\Lib\site-packages\sklearn\feature_extraction\text.py", line 1388, in fit_transform
    vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Python\Lib\site-packages\sklearn\feature_extraction\text.py", line 1275, in _count_vocab
    for feature in analyze(doc):
                   ^^^^^^^^^^^^
  File "d:\Python\Lib\site-packages\sklearn\feature_extraction\text.py", line 113, in _analyze
    doc = tokenizer(doc)
          ^^^^^^^^^^^^^^
TypeError: expected string or bytes-like object, got 'csr_matrix'


In [49]:
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder
import numpy as np
num_features = len(train_dataset[0]['text'].split())
k = min(num_features, 10000)


# Define the class labels
class_labels = ['neutral', 'joy', 'surprise', 'anger', 'sadness', 'disgust', 'fear']

# Map class labels to their corresponding indices
class_indices = {label: index for index, label in enumerate(class_labels)}

# Extract target emotions from the train_dataset
emotions = [item['emotion'] for item in train_dataset]

# Convert emotions to class indices
class_indices_array = np.array([class_indices[emotion] for emotion in emotions])

# Calculate class weights based on the inverse of class frequencies
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(class_indices_array), y=class_indices_array)

label_encoder = LabelEncoder()
integer_labels = label_encoder.fit_transform(emotions)

# class_weight_dict = {class_labels[i]: weight for i, weight in enumerate(class_weights)}
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}
# Create a dictionary mapping class labels to class weights
# class_weight_dict = dict(zip(class_labels, class_weights))

print("Class Weights:", class_weight_dict)


model = ensemble_classifiers(k,class_weight = class_weight_dict)
# Training loop
for batch in train_dataloader:
    texts = batch['text']
    emotions = batch['emotion']
    model.fit(texts, emotions)

# Validation loop
correct = 0
total = 0
with torch.no_grad():
    for batch in val_dataloader:
        texts = batch['text']
        emotions = batch['emotion']
        predicted_emotions = model.predict(texts)
        correct += (predicted_emotions == emotions).sum().item()
        total += len(emotions)

accuracy = correct / total
print(f'Validation Accuracy: {accuracy:.4f}')

Class Weights: {0: 0.3064621284755513, 1: 0.8090513313759239, 2: 1.1672509494595384, 3: 1.2928328749393303, 4: 2.060598246518824, 5: 5.28505291005291, 6: 5.096301020408164}
Validation Accuracy: 0.4850


In [50]:
def predict_emotion(msgs):
    emotion_label = model.predict(msgs)
    return emotion_label

In [51]:
msgs = ["I'm feeling happy today", "sad"]
predicted_emotions = predict_emotion(msgs)
print(predicted_emotions)

['neutral' 'neutral']
