In [41]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import load_dataset
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
import tiktoken

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, accuracy_score, ConfusionMatrixDisplay, confusion_matrix
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.test.utils import common_texts
from gensim.models import Word2Vec

from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
from spacy.lang.en.stop_words import STOP_WORDS as en_stop

In [5]:
dataset = load_dataset("gustavecortal/DreamBank-annotated")
df = dataset["train"].to_pandas()
df = df.sample(frac = 1)
df["report"] = df["report"].astype(str)
df = df.query("emotion == 'CO D' or emotion == 'AN D' or emotion == 'SD D' or emotion == 'AP D' or emotion == 'HA D'")[["report", "emotion"]]
df.head()

Unnamed: 0,report,emotion
5835,"[1997-07-27] I'm back at my old job again, the...",AN D
19460,My boyfriend Jeremy came in the room wearing a...,SD D
3677,I and some other women are in a room. I see wa...,AP D
15867,It kept going back and forwards between real l...,HA D
6094,I am walking down a New York City street. Day ...,AN D


In [6]:
def to_emotion_class(emotion):
    
    if emotion == "AP D":
        return "fear"
    elif emotion == "CO D":
        return "confusion"
    elif emotion == "SD D":
        return "sadness"
    elif emotion == "AN D":
        return "anger"
    else:
        return "happiness"

df["emotion"] = df["emotion"].apply(lambda x: to_emotion_class(x))
df["report"] = df["report"].apply(lambda x: x.lower()) # simple preprocessing (lowercase only)
df.head()

Unnamed: 0,report,emotion
5835,"[1997-07-27] i'm back at my old job again, the...",anger
19460,my boyfriend jeremy came in the room wearing a...,sadness
3677,i and some other women are in a room. i see wa...,fear
15867,it kept going back and forwards between real l...,happiness
6094,i am walking down a new york city street. day ...,anger


In [7]:
X = df['report']
y = df['emotion']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

In [9]:
def lemma_tokenize(doc):
    wnl = WordNetLemmatizer()
    return [wnl.lemmatize(t) for t in word_tokenize(doc)]

def char_tokenize(doc):
    return [char for char in doc]

def byte_tokenize(doc):
    tokens = doc.encode("utf-8")
    tokens = list(map(int, tokens))
    return [str(token) for token in tokens]

def gpt_tokenize(doc):
    enc = tiktoken.encoding_for_model("gpt-4")
    tokens = enc.encode(doc)
    return [str(token) for token in tokens]

In [10]:
model_L2 = make_pipeline(CountVectorizer(ngram_range = (1,1)), LogisticRegression(penalty='l2', solver='saga', max_iter=1000))
model_L1 = make_pipeline(CountVectorizer(ngram_range = (1,1)), LogisticRegression(penalty='l1', solver='saga', max_iter=100))
model_None = make_pipeline(CountVectorizer(ngram_range = (1,1)), LogisticRegression(penalty=None, solver='saga', max_iter=100))

In [11]:
model_L2.fit(X_train, y_train)
model_L1.fit(X_train, y_train)
model_None.fit(X_train, y_train)
#model_L2.fit(X_train, y_train, multinomialnb__sample_weight=sample_weights)

y_pred_L2 = model_L2.predict(X_test)

print("}>-===== L2 =====-<{")
print(f"Features dimension: {len(X_train), len(model_L2[0].vocabulary_)}")
print("Classification Report:\n", classification_report(y_test, y_pred_L2))

y_pred_L1 = model_L1.predict(X_test)

print("}>-===== L1 =====-<{")
print(f"Features dimension: {len(X_train), len(model_L1[0].vocabulary_)}")
print("Classification Report:\n", classification_report(y_test, y_pred_L1))

y_pred_None = model_None.predict(X_test)

print("}>-===== None =====-<{")
print(f"Features dimension: {len(X_train), len(model_None[0].vocabulary_)}")
print("Classification Report:\n", classification_report(y_test, y_pred_None))



}>-===== L2 =====-<{
Features dimension: (4520, 21374)
Classification Report:
               precision    recall  f1-score   support

       anger       0.61      0.47      0.53       159
   confusion       0.58      0.42      0.49       209
        fear       0.58      0.86      0.69       434
   happiness       0.62      0.48      0.55       225
     sadness       0.46      0.13      0.20       103

    accuracy                           0.58      1130
   macro avg       0.57      0.47      0.49      1130
weighted avg       0.58      0.58      0.56      1130

}>-===== L1 =====-<{
Features dimension: (4520, 21374)
Classification Report:
               precision    recall  f1-score   support

       anger       0.60      0.48      0.53       159
   confusion       0.60      0.43      0.50       209
        fear       0.57      0.86      0.69       434
   happiness       0.62      0.48      0.54       225
     sadness       0.46      0.13      0.20       103

    accuracy               

In [12]:
# Perform cross-validation and print the mean accuracy
scoring = 'f1_macro'
scores = cross_val_score(model_L2, X, y, cv=5, scoring=scoring, n_jobs = -1)
print(f"Mean {scoring}: {scores.mean()}")
print(f"Standard deviation {scoring}: {scores.std()}")

Mean f1_macro: 0.47819854472901396
Standard deviation f1_macro: 0.024487102856183055




In [13]:
"""grid= {
    "logisticregression__solver":["saga"],
    "logisticregression__penalty":["l1","l2", None],
    "logisticregression__max_iter":[100,150,200]
}

logreg=make_pipeline(CountVectorizer(ngram_range = (1,1)), LogisticRegression())

logreg_cv=GridSearchCV(logreg, n_jobs=1, param_grid=grid)

logreg_cv.fit(X_train,y_train)"""

'grid= {\n    "logisticregression__solver":["saga"],\n    "logisticregression__penalty":["l1","l2", None],\n    "logisticregression__max_iter":[100,150,200]\n}\n\nlogreg=make_pipeline(CountVectorizer(ngram_range = (1,1)), LogisticRegression())\n\nlogreg_cv=GridSearchCV(logreg, n_jobs=1, param_grid=grid)\n\nlogreg_cv.fit(X_train,y_train)'

In [14]:
# logreg_cv.best_params_

In [15]:
# logreg_cv.best_score_

# Word2Vec

In [39]:
df['tokenized_report'] = df.apply(lambda row: word_tokenize(row['report']), axis=1)
texts = df['tokenized_report'].values.tolist()
model = Word2Vec(sentences=texts, vector_size=100, window=5, min_count=1, workers=4)
#model.train([["hello", "world"]], total_examples=1, epochs=1)

In [40]:
model.wv.most_similar('car', topn=10)

[('bed', 0.8157466053962708),
 ('bedroom', 0.7590041160583496),
 ('house', 0.7586585879325867),
 ('seat', 0.7580720782279968),
 ('truck', 0.7554482817649841),
 ('back', 0.7514362335205078),
 ('bathroom', 0.7390074729919434),
 ('bus', 0.7113147377967834),
 ('kitchen', 0.7026335000991821),
 ('van', 0.6943224668502808)]

# tf-idf + reg log

In [None]:
def tfidf(words):
    tfidf = TfidfVectorizer()
    data = tfidf.fit_transform(words)
    return tfidf, data

tfidf_est, x_train_tfidf = tfidf(X_train.tolist())
x_test_tfidf = tfidf_est.transform(X_test.tolist())

lr_idf = LogisticRegression(penalty='l2', solver='saga', max_iter=1000)
lr_idf.fit(x_train_tfidf, y_train)
y_lr = lr_idf.predict(x_test_tfidf)
print(classification_report(y_test, y_lr))

# w2v + reg log