In [13]:
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier
import pandas as pd
from datasets import load_dataset

In [2]:
# Load the GoEmotions dataset
dataset = load_dataset("go_emotions", "raw")


In [6]:
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id', 'created_utc', 'rater_id', 'example_very_unclear', 'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'],
        num_rows: 211225
    })
})


In [3]:
df = pd.DataFrame(dataset["train"])



In [9]:
df.head(5)

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,False,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,...,0,0,0,0,0,0,0,0,0,1


In [4]:
df.shape

(211225, 37)

In [7]:
# All columns except metadata
emotion_columns = [
    col for col in df.columns 
    if col not in [
        'text', 'id', 'author', 'subreddit', 'link_id',
        'parent_id', 'created_utc', 'rater_id', 'example_very_unclear'
    ]
]

print("Emotion labels:")
print(emotion_columns)
print("Total labels:", len(emotion_columns))


Emotion labels:
['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']
Total labels: 28


In [8]:
emotion_columns = [
    'admiration','amusement','anger','annoyance','approval','caring',
    'confusion','curiosity','desire','disappointment','disapproval',
    'disgust','embarrassment','excitement','fear','gratitude','grief',
    'joy','love','nervousness','optimism','pride','realization',
    'relief','remorse','sadness','surprise','neutral'
]

In [9]:
# Count how many 1s per row
label_counts_per_row = df[emotion_columns].sum(axis=1)

# Check unique values
print("Unique number of active labels per row:")
print(label_counts_per_row.unique())

Unique number of active labels per row:
[ 1  0  2  3  4  7  6  5  9  8 10 12]


In [14]:
#  Extract X and y
texts = df["text"].tolist()
y = df[emotion_columns].values  # Multi-label target matrix


In [15]:
#  Create embeddings
model = SentenceTransformer("all-MiniLM-L6-v2")
X = model.encode(texts, show_progress_bar=True)

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Batches:   0%|          | 0/6601 [00:00<?, ?it/s]

In [16]:
#  Train/Test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [17]:
#  Multi-label classifier
clf = OneVsRestClassifier(LogisticRegression(max_iter=1000))
clf.fit(X_train, y_train)

0,1,2
,estimator,LogisticRegre...max_iter=1000)
,n_jobs,
,verbose,0

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [18]:
#  Predict
y_pred = clf.predict(X_test)

In [19]:
# Evaluation
print(classification_report(y_test, y_pred, target_names=emotion_columns))

                precision    recall  f1-score   support

    admiration       0.60      0.18      0.28      3456
     amusement       0.58      0.28      0.38      1891
         anger       0.52      0.06      0.10      1628
     annoyance       0.24      0.00      0.00      2722
      approval       0.56      0.01      0.02      3418
        caring       0.45      0.03      0.06      1147
     confusion       0.45      0.02      0.03      1463
     curiosity       0.31      0.02      0.03      1941
        desire       0.45      0.03      0.05       758
disappointment       0.17      0.00      0.00      1671
   disapproval       0.31      0.01      0.01      2289
       disgust       0.55      0.04      0.07      1074
 embarrassment       0.55      0.02      0.05       502
    excitement       0.42      0.01      0.02      1121
          fear       0.56      0.13      0.21       625
     gratitude       0.81      0.54      0.64      2330
         grief       0.33      0.02      0.03  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [21]:
import joblib
joblib.dump(clf, 'models/text_emotion_detection_model1.joblib')

['models/text_emotion_detection_model1.joblib']