In [22]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split, GridSearchCV
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder
import neattext.functions as nfx
# # Download NLTK resources
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

**MELD**

In [45]:
csv_file_path = r'D:\College\Fourth Year\GP\Meld\train_sent_emo.csv'
df = pd.read_csv(csv_file_path)
df.isnull().sum()
# df['Emotion'].value_counts().plot(kind='bar')
df['Clean_Text'] = df['Utterance'].apply(nfx.remove_multiple_spaces)
df['Clean_Text'] = df['Utterance'].apply(nfx.remove_bad_quotes)
df['Clean_Text'] = df['Utterance'].apply(nfx.remove_special_characters)
df[['Utterance','Clean_Text']]

Unnamed: 0,Utterance,Clean_Text
0,also I was the point person on my companys tr...,also I was the point person on my companys tra...
1,You mustve had your hands full.,You mustve had your hands full
2,That I did. That I did.,That I did That I did
3,So lets talk a little bit about your duties.,So lets talk a little bit about your duties
4,My duties? All right.,My duties All right
...,...,...
9984,You or me?,You or me
9985,"I got it. Uh, Joey, women don't have Adam's ap...",I got it Uh Joey women dont have Adams apples
9986,"You guys are messing with me, right?",You guys are messing with me right
9987,Yeah.,Yeah


In [47]:
# Split the data into features (X) and labels (y)
X = df['Clean_Text']
y = df['Emotion']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [48]:
def update_class_weight(y_train):
    # Define the class labels
    class_labels = ['neutral', 'joy', 'surprise', 'anger', 'sadness', 'disgust', 'fear']

    # Map class labels to their corresponding indices
    class_indices = {label: index for index, label in enumerate(class_labels)}

    # Convert emotions to class indices
    class_indices_array = np.array([class_indices[emotion] for emotion in y_train])

    # Calculate class weights based on the inverse of class frequencies
    class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(class_indices_array), y=class_indices_array)
    label_encoder = LabelEncoder()
    integer_labels = label_encoder.fit_transform(y_train)
    class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

    return class_weight_dict,class_indices_array
    

In [51]:
def tune_hyperparameters(model, param_grid, X_train, y_train, X_test, y_test):
    grid_search = GridSearchCV(model, param_grid, cv=5, verbose=3, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    print("Best parameters found:")
    print(grid_search.best_params_)
    
    best_model = grid_search.best_estimator_
    
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Best model accuracy:", accuracy)
    
    return best_model

In [52]:
class_labels = ['neutral', 'joy', 'surprise', 'anger', 'sadness', 'disgust', 'fear']
class_weight,y_train_indices = update_class_weight(y_train)

tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the testing data
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initialize the SVM classifier
svm_classifier = SVC(kernel='linear',class_weight=class_weight)

# Fit the SVM classifier on the TF-IDF transformed training data
svm_classifier.fit(X_train_tfidf, y_train_indices)

# Predict the labels for the testing data
y_pred = svm_classifier.predict(X_test_tfidf)

class_indices = {label: index for index, label in enumerate(class_labels)}

# Convert emotions to class indices
class_indices_array = np.array([class_indices[emotion] for emotion in y_test])

# Calculate the accuracy of the model
accuracy = accuracy_score(class_indices_array, y_pred)
print("Accuracy:", accuracy)



Accuracy: 0.3958958958958959


In [57]:
param_grid = {
    'C': [0.01, 0.1, 1.0, 10.0],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}
best_model = tune_hyperparameters(svm_classifier, param_grid, X_train_tfidf, y_train_indices, X_test_tfidf, class_indices_array)
best_model.fit(X_train_tfidf, y_train_indices)
# Predict the labels for the testing data
y_pred = svm_classifier.predict(X_test_tfidf)

class_indices = {label: index for index, label in enumerate(class_labels)}

# Convert emotions to class indices
class_indices_array = np.array([class_indices[emotion] for emotion in y_test])

# Calculate the accuracy of the model
accuracy = accuracy_score(class_indices_array, y_pred)
print("Accuracy:", accuracy)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters found:
{'C': 10.0, 'gamma': 'scale', 'kernel': 'rbf'}
Best model accuracy: 0.5145145145145145
Accuracy: 0.3958958958958959


In [53]:
logistic_regression = LogisticRegression(class_weight=class_weight)
logistic_regression.fit(X_train_tfidf, y_train_indices)
y_pred2 = logistic_regression.predict(X_test_tfidf)
accuracy = accuracy_score(class_indices_array, y_pred2)
print("Accuracy:", accuracy)

Accuracy: 0.36436436436436437


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [55]:
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'C': [0.01, 0.1, 1.0, 10.0],
    'penalty': ['l2']
}

# Perform hyperparameter tuning
best_model = tune_hyperparameters(logistic_regression, param_grid, X_train_tfidf, y_train_indices, X_test_tfidf, class_indices_array)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best parameters found:
{'C': 10.0, 'penalty': 'l2'}
Best model accuracy: 0.3953953953953954


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [54]:
decision_tree = DecisionTreeClassifier(class_weight=class_weight)
decision_tree.fit(X_train_tfidf, y_train_indices)
y_pred3 = decision_tree.predict(X_test_tfidf)
accuracy = accuracy_score(class_indices_array, y_pred3)
print("Accuracy:", accuracy)

Accuracy: 0.3813813813813814


In [50]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
pipe_lr = Pipeline(steps=[('cv',CountVectorizer()),('lr',LogisticRegression())])
pipe_lr.fit(X_train,y_train)
pipe_lr.score(X_test,y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.534034034034034

**DAILY DIALOGUE**

In [12]:
!pip install --upgrade --force-reinstall spellchecker

Collecting spellchecker
  Using cached spellchecker-0.4-py3-none-any.whl
Collecting setuptools (from spellchecker)
  Using cached setuptools-69.5.1-py3-none-any.whl.metadata (6.2 kB)
Collecting inexactsearch (from spellchecker)
  Using cached inexactsearch-1.0.2-py3-none-any.whl
Collecting soundex>=1.0 (from inexactsearch->spellchecker)
  Using cached soundex-1.1.3-py3-none-any.whl
Collecting silpa-common>=0.3 (from inexactsearch->spellchecker)
  Using cached silpa_common-0.3-py3-none-any.whl
Using cached setuptools-69.5.1-py3-none-any.whl (894 kB)
Installing collected packages: silpa-common, setuptools, soundex, inexactsearch, spellchecker
  Attempting uninstall: silpa-common
    Found existing installation: silpa_common 0.3
    Uninstalling silpa_common-0.3:
      Successfully uninstalled silpa_common-0.3
  Attempting uninstall: setuptools
    Found existing installation: setuptools 69.5.1
    Uninstalling setuptools-69.5.1:
      Successfully uninstalled setuptools-69.5.1
  Attempti

In [1]:
from typing import List, Tuple, Callable, Dict
import argparse
import random
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import torch.autograd
import torch.optim as optim
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, classification_report, confusion_matrix)
import numpy as np
import contractions
import unicodedata
from bs4 import BeautifulSoup
import emoji
import re
import pickle
import os
import matplotlib.pyplot as plt

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Stopword removal
stop_words = set(stopwords.words('english'))
# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\emans\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emans\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\emans\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\emans\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
dataset_path = "D:/College/Fourth Year/GP/dailydialog"

In [3]:
train_data_path = f"{dataset_path}/train/dialogues_train.txt"
train_label_path = f"{dataset_path}/train/dialogues_emotion_train.txt"
test_data_path = f"{dataset_path}/test/dialogues_test.txt"
test_label_path = f"{dataset_path}/test/dialogues_emotion_test.txt"

In [4]:
def lower_sentence(sentence: str) -> str:
    '''
    Lowercase the sentence.
    :param data: The sentence to lowercase.
    :return: The lowercased sentence
    :rtype: str
    '''
    return sentence.lower()

In [5]:
def remove_emails(sentence: str) -> str:
    '''
    Remove emails from the sentence.
    :param sentence: The sentence to remove emails from.
    :type sentence: str
    :return: The sentence without emails.
    :rtype: str
    '''
    return re.sub(r"\S*@\S*\s?", "", sentence)

In [6]:
def remove_nonascii_diacritic(sentence: str) -> str:
    '''

    Remove diacritics from the sentence.

    :param sentence: The sentence to remove diacritics from.

    :type sentence: str

    :return: The sentence without diacritics.

    :rtype: str
    '''

    return unicodedata.normalize("NFKD", sentence).encode("ascii", "ignore").decode("utf-8", "ignore")

In [7]:
def clean_html(sentence: str) -> str:
    '''
    Remove HTML tags from the sentence.
    :param sentence: The sentence to remove HTML tags from.
    :type sentence: str
    :return: The sentence without HTML tags.
    :rtype: str
    '''
    return BeautifulSoup(sentence, "html.parser").get_text()

In [8]:
def replace_repeated_chars(sentence: str) -> str:
    '''
    Replace repeated characters in the sentence.
    :param sentence: The sentence to replace repeated characters in.
    :type sentence: str
    :return: The sentence with replaced repeated characters.
    :rtype: str
    '''
    # Replace consecutive occurrences of ',', '!', '.', and '?' with a single occurrence
    return re.sub(r'([,!?.])\1+', r'\1', sentence)

In [9]:
def translate_emojis_to_text(sentence: str) -> str:
    '''
    Translate emojis in the sentence to text.
    :param sentence: The sentence to translate emojis to text.
    :type sentence: str
    :return: The sentence with translated emojis to text.
    :rtype: str
    '''
    # Translate emojis to text codes
    translated_text = emoji.demojize(sentence)
    # Remove colons from the translated text
    translated_text = re.sub(r':', '', translated_text)
    return translated_text

In [10]:
def expand_sentence(sentence: str) -> str:
    '''
    Expand the contractions in the sentence.
    :param sentence: The sentence to expand contractions in.
    :type sentence: str
    :return: The sentence with expanded contractions.
    :rtype: str
    '''
    return contractions.fix(sentence)

In [11]:
def remove_url(sentence: str) -> str:
    '''
    Remove URLs from the sentence.
    :param sentence: The sentence to remove URLs from.
    :type sentence: str
    :return: The sentence without URLs.
    :rtype: str
    '''
    return re.sub("((http\://|https\://|ftp\://)|(www.))+(([a-zA-Z0-9\.-]+\.[a-zA-Z]{2,4})|([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}))(/[a-zA-Z0-9%:/-_\?\.'~]*)?", '', sentence)

In [12]:
def remove_possessives(sentence: str) -> str:
    '''
    Strip possessives from the sentence.
    :param sentence: The sentence to strip possessives from.
    :type sentence: str
    :return: The sentence without possessives.
    :rtype: str
    '''
    # Stripping the possessives
    sentence = sentence.replace("'s", '')
    sentence = sentence.replace('’s', '')
    sentence = sentence.replace('s’', 's')
    sentence = sentence.replace("s'", 's')
    return sentence

In [13]:
def remove_extra_space(sentence: str) -> str:
    '''
    Remove extra spaces from the sentence.
    :param sentence: The sentence to remove extra spaces from.
    :type sentence: str
    :return: The sentence without extra spaces.
    :rtype: str
    '''
    return re.sub(r'\s+', ' ', sentence).strip()

In [14]:
def tokenize_sentence(sentence: str) -> list[str]:
    '''
    Tokenize the sentence.
    :param sentence: The sentence to tokenize.
    :type sentence: str
    :return: The tokenized sentence.
    :rtype: str
    '''
    return nltk.word_tokenize(sentence)

In [15]:
def remove_stop_words(sentence: list[str]) -> list[str]:
    '''
    Remove stop words from the sentence.
    :param sentence: The sentence to remove stop words from.
    :type sentence: list[str]
    :return: The sentence without stop words.
    :rtype: list[str]
    '''
    return [word for word in sentence if word not in stop_words]

In [16]:
def lemm_sentence(sentence: list[str]) -> list[str]:
    '''
    Lemmatize the sentence.
    :param sentence: The sentence to lemmatize.
    :type sentence: list[str]
    :return: The lemmatized sentence.
    :rtype: list[str]
    '''
    # Perform POS tagging
    pos_tags = pos_tag(sentence)
    # Lemmatize each word based on its POS tag
    lemmatized_words = []
    for word, pos in pos_tags:
        # Map Penn Treebank POS tags to WordNet POS tags
        if pos.startswith('N'):  # Nouns
            pos = 'n'
        elif pos.startswith('V'):  # Verbs
            pos = 'v'
        elif pos.startswith('J'):  # Adjectives
            pos = 'a'
        elif pos.startswith('R'):  # Adverbs
            pos = 'r'
        else:
            pos = 'n'  # Default to noun if POS tag not found

        # Lemmatize the word using the appropriate POS tag
        lemma = lemmatizer.lemmatize(word, pos=pos)
        lemmatized_words.append(lemma)
    return lemmatized_words

In [17]:
def clean_train(line: str) -> list[str]:
    '''
    Clean the line and return it as a list of tokens
    :param line: the line to clean
    :type line: str
    :return: the cleaned line as a list of tokens
    :rtype: list
    '''
    # translate emojis
    line = translate_emojis_to_text(line)
    # lower the line
    line = lower_sentence(line)
    # remove non ascii
    line = remove_nonascii_diacritic(line)
    # remove emails
    line = remove_emails(line)
    # remove html
    line = clean_html(line)
    # remove urls
    line = remove_url(line)
    # replace repeated chars
    line = replace_repeated_chars(line)
    # expand
    line = expand_sentence(line)
    # remove possessives
    line = remove_possessives(line)
    # remove extra spaces
    line = remove_extra_space(line)
    # tekonize
    line = tokenize_sentence(line)
    # remove stopwords
    line = remove_stop_words(line)
    # lemmetization
    line = lemm_sentence(line)
    if len(line) == 0:
        return ['Normal']
    return line

In [18]:
def read_dialogue_data(file_path: str) -> List[List[str]]:
    '''
    Read the dialogue data from the file path.
    :param file_path: The path of the file.
    :type file_path: str
    :return: A list of dialogues, where each dialogue is a list of sentences,
             and each sentence is a string.
    :rtype: list
    '''
    # define dialogues list
    dialogues = []
    # read data file
    with open(file_path, "r", encoding="utf8") as file_data:
        dialogues = [
            [
                sentence.replace(".", " . ").replace("?", " ? ").replace("!", " ! ").replace(
                    ";", " ; ").replace(":", " : ").replace(",", " , ").strip()
                for sentence in line.split("__eou__") if sentence.strip()
            ]
            for line in file_data
        ]
    return dialogues

In [19]:

def read_dataset_dailyDialog(data_path: str, label_path: str) -> Tuple[List[List[List[str]]], List[List[int]]]:
    '''
    Take the data path and the label path and read them.
    It then splits the conversations and extracts each conversation, sentence, and words of each sentence.
    It reads the labels of each sentence in the conversation

    :param data_path: The path of the conversations.
    :type data_path: str
    :param label_path: The path of the labels for the conversations.
    :type label_path: str
    :return: A tuple containing inputs and targets.
             inputs: List of conversations, where each conversation is a list of sentences,
                     and each sentence is a list of words.
             targets: List of labels for each conversation.
    :rtype: tuple
    '''
    # define targets list
    targets = []
    # read labels file
    with open(label_path, "r", encoding="utf8") as file_data:
        targets = [[int(label) for label in line.strip(
            "\n").strip(" ").split(" ")] for line in file_data]
        # for loop version
        # for line in file_data:
        #     labels = [int(label) for label in line.strip("\n").strip(" ").split(" ")]
        #     targets.append(labels)

    # read data file
    dialogues = read_dialogue_data(data_path)
    # define inputs list
    inputs = [
        [
            clean_train(sentence) for sentence in dialogue
        ]
        for dialogue in dialogues
    ]
    return (inputs, targets)

In [20]:
X_train, y_train = read_dataset_dailyDialog(train_data_path, train_label_path)
print(len(X_train))
print(len(y_train))

  return BeautifulSoup(sentence, "html.parser").get_text()


11118
11118


In [21]:
X_test, y_test = read_dataset_dailyDialog(test_data_path, test_label_path)

  return BeautifulSoup(sentence, "html.parser").get_text()


In [60]:
print(type(X_train))
print(y_train[:1])

<class 'list'>
[[0, 0, 0, 0, 0, 0, 4, 4, 4, 4]]


In [54]:
# Assuming X_train and X_test are lists of lists
# Flatten X_train into a list of strings
flattened_X_train = [' '.join(sentence) for dialog in X_train for sentence in dialog]
# Flatten X_train into a list of strings
flattened_X_test = [' '.join(sentence) for dialog in X_test for sentence in dialog]

# Flatten y_train into a single list
y_train_flat = [label for dialog_labels in y_train for label in dialog_labels]
y_test_flat = [label for dialog_labels in y_test for label in dialog_labels]


tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(flattened_X_train)

# Transform the testing data
X_test_tfidf = tfidf_vectorizer.transform(flattened_X_test)

# Initialize the SVM classifier
svm_classifier = SVC(kernel='linear')

# Fit the SVM classifier on the TF-IDF transformed training data
svm_classifier.fit(X_train_tfidf, y_train_flat)

# Predict the labels for the testing data
y_pred = svm_classifier.predict(X_test_tfidf)

accuracy = accuracy_score(y_test_flat, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.842764857881137


In [37]:
def save_model(model, save_path):
    """
    Save a trained model to a file using pickle.

    Parameters:
    - model: Trained model object
    - save_path: File path to save the model
    """
    with open(save_path, 'wb') as f:
        pickle.dump(model, f)
    print("Model saved to:", save_path)

In [56]:
save_model(svm_classifier,"D:/College/Fourth Year/GP/Soul-AI/models/mental_health_ML")

Model saved to: D:/College/Fourth Year/GP/Soul-AI/models/mental_health_ML


In [None]:
# no emotion (0), anger (1), disgust (2), fear (3), happiness (4), sadness (5) and surprise (6)

In [69]:
# Example of a new sentence
new_sentence = "I don't know . My life is a big mess ."

preprocessed_text = clean_train(new_sentence)

# Transform the preprocessed sentence into TF-IDF representation
tfidf_vectorized_sentence = tfidf_vectorizer.transform(preprocessed_text)

# Predict the label for the TF-IDF transformed sentence
predicted_label = svm_classifier.predict(tfidf_vectorized_sentence)

print("Predicted label:", predicted_label)


Predicted label: [0 0 0 0 0 0]


In [70]:
# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_flat), y=y_train_flat)

# Convert class weights to a dictionary
class_weight_dict = dict(zip(np.unique(y_train_flat), class_weights))

tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(flattened_X_train)

# Transform the testing data
X_test_tfidf = tfidf_vectorizer.transform(flattened_X_test)

# Initialize the SVM classifier
svm_classifier = SVC(kernel='linear', class_weight=class_weight_dict)

# Fit the SVM classifier on the TF-IDF transformed training data
svm_classifier.fit(X_train_tfidf, y_train_flat)

# Predict the labels for the testing data
y_pred = svm_classifier.predict(X_test_tfidf)

accuracy = accuracy_score(y_test_flat, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6600775193798449


In [71]:
save_model(svm_classifier,"D:/College/Fourth Year/GP/Soul-AI/models/mental_health_ML_weighted")

Model saved to: D:/College/Fourth Year/GP/Soul-AI/models/mental_health_ML_weighted


In [74]:
# Example of a new sentence
new_sentence = "sad"

preprocessed_text = clean_train(new_sentence)
print(preprocessed_text)
# Transform the preprocessed sentence into TF-IDF representation
tfidf_vectorized_sentence = tfidf_vectorizer.transform(preprocessed_text)

# Predict the label for the TF-IDF transformed sentence
predicted_label = svm_classifier.predict(tfidf_vectorized_sentence)

print("Predicted label:", predicted_label)

['sad']
Predicted label: [5]


In [23]:
# Flatten X_train into a list of strings
flattened_X_train = [' '.join(sentence) for dialog in X_train for sentence in dialog]
# Flatten X_train into a list of strings
flattened_X_test = [' '.join(sentence) for dialog in X_test for sentence in dialog]

# Flatten y_train into a single list
y_train_flat = [label for dialog_labels in y_train for label in dialog_labels]
y_test_flat = [label for dialog_labels in y_test for label in dialog_labels]

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_flat), y=y_train_flat)

# Convert class weights to a dictionary
class_weight_dict = dict(zip(np.unique(y_train_flat), class_weights))

logistic_regression = LogisticRegression(class_weight=class_weight_dict)
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(flattened_X_train)

# Transform the testing data
X_test_tfidf = tfidf_vectorizer.transform(flattened_X_test)

logistic_regression.fit(X_train_tfidf, y_train_flat)
y_pred2 = logistic_regression.predict(X_test_tfidf)
accuracy = accuracy_score(y_test_flat, y_pred2)
print("Accuracy:", accuracy)

report = classification_report(y_test_flat, y_pred2)
print(report)

Accuracy: 0.6076227390180878
              precision    recall  f1-score   support

           0       0.93      0.61      0.73      6321
           1       0.11      0.50      0.18       118
           2       0.08      0.32      0.13        47
           3       0.08      0.59      0.14        17
           4       0.40      0.64      0.49      1019
           5       0.12      0.58      0.20       102
           6       0.10      0.55      0.18       116

    accuracy                           0.61      7740
   macro avg       0.26      0.54      0.29      7740
weighted avg       0.81      0.61      0.67      7740



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [26]:
emotion_map = {
    0: 'neutral',
    1: 'anger',
    2: 'disgust',
    3: 'fear',
    4: 'happiness',
    5: 'sadness',
    6: 'surprise'
}

In [35]:
# Example of a new sentence
new_sentence = "Okay, I'll give it a try. But I'm still scared and doubtful."

preprocessed_text = clean_train(new_sentence)
print(preprocessed_text)
# Transform the preprocessed sentence into TF-IDF representation
tfidf_vectorized_sentence = tfidf_vectorizer.transform([" ".join(preprocessed_text)])

# Predict the label for the TF-IDF transformed sentence
predicted_emotion = logistic_regression.predict(tfidf_vectorized_sentence)[0]

class_probabilities = logistic_regression.predict_proba(tfidf_vectorized_sentence)[0]


emotion_probabilities = {emotion_map[i]: prob for i, prob in enumerate(class_probabilities)}

# Output the probabilities for each class
for emotion, probability in emotion_probabilities.items():
    print(f"Probability of {emotion}: {probability}")

# Get the predicted emotion (the one with the highest probability)
# predicted_emotion = max(emotion_probabilities, key=emotion_probabilities.get)
print("Predicted emotion:", emotion_map[predicted_emotion])

['okay', ',', 'give', 'try', '.', 'still', 'scar', 'doubtful', '.']
Probability of neutral: 0.06441659914938573
Probability of anger: 0.007847991876326342
Probability of disgust: 0.005675938055684583
Probability of fear: 0.8110976504209492
Probability of happiness: 0.06154204997979236
Probability of sadness: 0.020639569339246187
Probability of surprise: 0.028780201178615725
Predicted emotion: fear


In [38]:
save_model(logistic_regression,"D:/College/Fourth Year/GP/Soul-AI/models/mental_health_logistic_regression")

Model saved to: D:/College/Fourth Year/GP/Soul-AI/models/mental_health_logistic_regression


In [39]:
with open('D:/College/Fourth Year/GP/glove_vectors.pkl', 'rb') as f:
    glove_vectors = pickle.load(f)

In [40]:

# Define your preprocessing function using GloVe embeddings
def preprocess_with_glove(sentence, glove_vectors):
    vectorized_sentence = []
    for word in sentence.split():
        if word in glove_vectors:
            vectorized_sentence.append(glove_vectors[word])
    return np.mean(vectorized_sentence, axis=0) if vectorized_sentence else np.zeros_like(list(glove_vectors.values())[0])

# Transform the training data using GloVe embeddings
X_train_glove = [preprocess_with_glove(sentence, glove_vectors) for sentence in flattened_X_train]
X_train_glove = np.vstack(X_train_glove)

# Transform the testing data using GloVe embeddings
X_test_glove = [preprocess_with_glove(sentence, glove_vectors) for sentence in flattened_X_test]
X_test_glove = np.vstack(X_test_glove)

# Define logistic regression model with class weights
logistic_regression = LogisticRegression(class_weight=class_weight_dict)

# Fit logistic regression model
logistic_regression.fit(X_train_glove, y_train_flat)

# Predict on testing data
y_pred_glove = logistic_regression.predict(X_test_glove)

# Calculate accuracy
accuracy_glove = accuracy_score(y_test_flat, y_pred_glove)
print("Accuracy with GloVe embeddings:", accuracy_glove)

# Calculate classification report
report_glove = classification_report(y_test_flat, y_pred_glove)
print("Classification Report with GloVe embeddings:")
print(report_glove)

Accuracy with GloVe embeddings: 0.4904392764857881
Classification Report with GloVe embeddings:
              precision    recall  f1-score   support

           0       0.93      0.47      0.63      6321
           1       0.08      0.39      0.13       118
           2       0.04      0.43      0.07        47
           3       0.02      0.41      0.04        17
           4       0.38      0.59      0.46      1019
           5       0.08      0.52      0.14       102
           6       0.09      0.60      0.16       116

    accuracy                           0.49      7740
   macro avg       0.23      0.49      0.23      7740
weighted avg       0.82      0.49      0.58      7740



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [44]:
# Example of a new sentence
new_sentence = " I don't know. I guess I can try. I don't have anything to lose."

# Preprocess the new sentence using GloVe embeddings
preprocessed_sentence = preprocess_with_glove(new_sentence, glove_vectors)

# Reshape the preprocessed sentence to match the expected input shape
preprocessed_sentence = preprocessed_sentence.reshape(1, -1)

# Predict the label for the preprocessed sentence
predicted_label = logistic_regression.predict(preprocessed_sentence)[0]

# Predict the probabilities for each class for the preprocessed sentence
class_probabilities = logistic_regression.predict_proba(preprocessed_sentence)[0]

# Create a dictionary mapping emotions to probabilities
emotion_probabilities = {emotion_map[i]: prob for i, prob in enumerate(class_probabilities)}

# Output the predicted label
print("Predicted label:", emotion_map[predicted_label])

# Output the probabilities for each class
for emotion, probability in emotion_probabilities.items():
    print(f"Probability of {emotion}: {probability}")


Predicted label: sadness
Probability of neutral: 0.18763722940773414
Probability of anger: 0.030659054586403434
Probability of disgust: 0.11225866478299383
Probability of fear: 0.11656221603375616
Probability of happiness: 0.15791877457306758
Probability of sadness: 0.35241141926781683
Probability of surprise: 0.042552641348227956


In [45]:
save_model(logistic_regression,"D:/College/Fourth Year/GP/Soul-AI/models/mental_health_logistic_regression_glv")

Model saved to: D:/College/Fourth Year/GP/Soul-AI/models/mental_health_logistic_regression_glv
