In [52]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import load_dataset
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
import tiktoken

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, accuracy_score, ConfusionMatrixDisplay, confusion_matrix
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV

from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
from spacy.lang.en.stop_words import STOP_WORDS as en_stop

In [2]:
dataset = load_dataset("gustavecortal/DreamBank-annotated")
df = dataset["train"].to_pandas()
df = df.sample(frac = 1)
df["report"] = df["report"].astype(str)
df = df.query("emotion == 'CO D' or emotion == 'AN D' or emotion == 'SD D' or emotion == 'AP D' or emotion == 'HA D'")[["report", "emotion"]]
df.head()

Unnamed: 0,report,emotion
8872,Flames of light pointing downwards and I had b...,CO D
4546,"[""Villainous behavior.""] I am in a family. I a...",AP D
18074,<i>Dead Flaming Robin</i> <br/><br/> I'm hunti...,AP D
23375,I was washing in a bathroom at what in the dre...,AP D
20917,I was talking to my Modern French instructor a...,SD D


In [4]:
def to_emotion_class(emotion):
    
    if emotion == "AP D":
        return "fear"
    elif emotion == "CO D":
        return "confusion"
    elif emotion == "SD D":
        return "sadness"
    elif emotion == "AN D":
        return "anger"
    else:
        return "happiness"

df["emotion"] = df["emotion"].apply(lambda x: to_emotion_class(x))
df["report"] = df["report"].apply(lambda x: x.lower()) # simple preprocessing (lowercase only)
df.head()

Unnamed: 0,report,emotion
8872,flames of light pointing downwards and i had b...,confusion
4546,"[""villainous behavior.""] i am in a family. i a...",fear
18074,<i>dead flaming robin</i> <br/><br/> i'm hunti...,fear
23375,i was washing in a bathroom at what in the dre...,fear
20917,i was talking to my modern french instructor a...,sadness


In [5]:
X = df['report']
y = df['emotion']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

In [13]:
def lemma_tokenize(doc):
    wnl = WordNetLemmatizer()
    return [wnl.lemmatize(t) for t in word_tokenize(doc)]

def char_tokenize(doc):
    return [char for char in doc]

def byte_tokenize(doc):
    tokens = doc.encode("utf-8")
    tokens = list(map(int, tokens))
    return [str(token) for token in tokens]

def gpt_tokenize(doc):
    enc = tiktoken.encoding_for_model("gpt-4")
    tokens = enc.encode(doc)
    return [str(token) for token in tokens]

In [34]:
model_L2 = make_pipeline(CountVectorizer(ngram_range = (1,1)), LogisticRegression(penalty='l2', solver='saga', max_iter=100))
model_L1 = make_pipeline(CountVectorizer(ngram_range = (1,1)), LogisticRegression(penalty='l1', solver='saga', max_iter=100))
model_None = make_pipeline(CountVectorizer(ngram_range = (1,1)), LogisticRegression(penalty=None, solver='saga', max_iter=100))

In [35]:
model_L2.fit(X_train, y_train)
model_L1.fit(X_train, y_train)
model_None.fit(X_train, y_train)
#model_L2.fit(X_train, y_train, multinomialnb__sample_weight=sample_weights)

y_pred_L2 = model_L2.predict(X_test)

print("}>-===== L2 =====-<{")
print(f"Features dimension: {len(X_train), len(model_L2[0].vocabulary_)}")
print("Classification Report:\n", classification_report(y_test, y_pred_L2))

y_pred_L1 = model_L1.predict(X_test)

print("}>-===== L1 =====-<{")
print(f"Features dimension: {len(X_train), len(model_L1[0].vocabulary_)}")
print("Classification Report:\n", classification_report(y_test, y_pred_L1))

y_pred_None = model_None.predict(X_test)

print("}>-===== None =====-<{")
print(f"Features dimension: {len(X_train), len(model_None[0].vocabulary_)}")
print("Classification Report:\n", classification_report(y_test, y_pred_None))



}>-===== L2 =====-<{
Features dimension: (4520, 21329)
Classification Report:
               precision    recall  f1-score   support

       anger       0.67      0.52      0.58       165
   confusion       0.55      0.43      0.49       219
        fear       0.59      0.83      0.69       422
   happiness       0.62      0.54      0.58       220
     sadness       0.57      0.26      0.36       104

    accuracy                           0.60      1130
   macro avg       0.60      0.52      0.54      1130
weighted avg       0.60      0.60      0.58      1130

}>-===== L1 =====-<{
Features dimension: (4520, 21329)
Classification Report:
               precision    recall  f1-score   support

       anger       0.66      0.50      0.57       165
   confusion       0.55      0.43      0.48       219
        fear       0.59      0.83      0.69       422
   happiness       0.60      0.54      0.56       220
     sadness       0.59      0.25      0.35       104

    accuracy               

In [36]:
# Perform cross-validation and print the mean accuracy
scoring = 'f1_macro'
scores = cross_val_score(model_L2, X, y, cv=5, scoring=scoring, n_jobs = -1)
print(f"Mean {scoring}: {scores.mean()}")
print(f"Standard deviation {scoring}: {scores.std()}")



Mean f1_macro: 0.5289123442597088
Standard deviation f1_macro: 0.018532150162908947


In [62]:
grid=[{}, {"solver":["saga"], "penalty":["l1","l2", None], "max_iter":[100,150,200]}]
#, "C":np.logspace(-3,3,7)
logreg=make_pipeline(CountVectorizer(ngram_range = (1,1)), LogisticRegression())
#logreg
logreg_cv=GridSearchCV(logreg, n_jobs=1, param_grid=grid)

logreg_cv.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

ValueError: Invalid parameter 'max_iter' for estimator Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('logisticregression', LogisticRegression())]). Valid parameters are: ['memory', 'steps', 'verbose'].

In [59]:
logreg_cv