### Dataset 1

In [1]:
import os
import pandas as pd
import numpy as np

df = pd.read_csv("../data/relationships/goemotions.csv")

print("Shape:", df.shape)
print(df.columns[:15].tolist())
print(df.head(3))

Shape: (211225, 37)
['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id', 'created_utc', 'rater_id', 'example_very_unclear', 'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring']
                                                text       id       author  \
0                                    That game hurt.  eew5j0j        Brdd9   
1   >sexuality shouldn’t be a grouping category I...  eemcysk  TheGreen888   
2     You do right, if you don't care then fuck 'em!  ed2mah1     Labalool   

          subreddit    link_id   parent_id   created_utc  rater_id  \
0               nrl  t3_ajis4z  t1_eew18eq  1.548381e+09         1   
1  unpopularopinion  t3_ai4q37   t3_ai4q37  1.548084e+09        37   
2       confessions  t3_abru74  t1_ed2m7g7  1.546428e+09        37   

   example_very_unclear  admiration  ...  love  nervousness  optimism  pride  \
0                 False           0  ...     0            0         0      0   
1                  True           0  ...  

In [2]:
assert "text" in df.columns, "No 'text' column found — adjust path/filename."

exclude = {"text","id","author","subreddit","link_id","parent_id","created_utc","rater_id","example_very_unclear"}
candidate_cols = [c for c in df.columns if c not in exclude]

emotion_cols = []
for c in candidate_cols:
    vals = pd.unique(df[c].dropna())
    if set(vals).issubset({0,1}):

        if df[c].sum() > 0:
            emotion_cols.append(c)

print("Detected emotion columns count:", len(emotion_cols))
print(emotion_cols)

X = df["text"].fillna("").astype(str)
y = df[emotion_cols].astype(int)

print("Features shape:", X.shape)
print("Labels shape:", y.shape)

Detected emotion columns count: 28
['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']
Features shape: (211225,)
Labels shape: (211225, 28)


In [3]:
print("Important note:")
print("- y is multi-label (shape:", y.shape, ").")
print("- You CANNOT safely use train_test_split(..., stratify=y) for multi-label targets.")
print("- We'll do a random shuffle split (no stratify). If you need strict multilabel stratification,")
print("  we can use an iterative stratifier (requires extra package).")

Important note:
- y is multi-label (shape: (211225, 28) ).
- You CANNOT safely use train_test_split(..., stratify=y) for multi-label targets.
- We'll do a random shuffle split (no stratify). If you need strict multilabel stratification,
  we can use an iterative stratifier (requires extra package).


In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

print("Train size:", X_train.shape, "Test size:", X_test.shape)

Train size: (168980,) Test size: (42245,)


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=10000, ngram_range=(1,2), stop_words="english")),
    ("clf", OneVsRestClassifier(LogisticRegression(max_iter=1000, class_weight="balanced")))
])

print("Pipeline created:", pipeline)

Pipeline created: Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=10000, ngram_range=(1, 2),
                                 stop_words='english')),
                ('clf',
                 OneVsRestClassifier(estimator=LogisticRegression(class_weight='balanced',
                                                                  max_iter=1000)))])


In [6]:
pipeline.fit(X_train, y_train)
print("Training complete.")

Training complete.


In [7]:
from sklearn.metrics import accuracy_score, hamming_loss, f1_score, classification_report

y_pred = pipeline.predict(X_test)  

subset_acc = accuracy_score(y_test, y_pred)
hloss = hamming_loss(y_test, y_pred)
f1_micro = f1_score(y_test, y_pred, average="micro")
f1_macro = f1_score(y_test, y_pred, average="macro")

print("Subset accuracy (exact):", subset_acc)
print("Hamming loss (lower better):", hloss)
print("F1 micro:", f1_micro)
print("F1 macro:", f1_macro)

print("\nPer-label classification report:")
print(classification_report(y_test, y_pred, target_names=emotion_cols, zero_division=0))

Subset accuracy (exact): 0.02786128535921411
Hamming loss (lower better): 0.1459428841959319
F1 micro: 0.2780370704941617
F1 macro: 0.2472683446664464

Per-label classification report:
                precision    recall  f1-score   support

    admiration       0.33      0.76      0.46      3456
     amusement       0.36      0.80      0.50      1891
         anger       0.16      0.69      0.26      1628
     annoyance       0.15      0.63      0.25      2722
      approval       0.14      0.54      0.23      3418
        caring       0.11      0.68      0.19      1147
     confusion       0.10      0.56      0.16      1463
     curiosity       0.12      0.59      0.20      1941
        desire       0.10      0.64      0.18       758
disappointment       0.10      0.56      0.17      1671
   disapproval       0.12      0.59      0.21      2289
       disgust       0.12      0.65      0.20      1074
 embarrassment       0.06      0.46      0.10       502
    excitement       0.09     

In [8]:
from sklearn.neighbors import NearestNeighbors
tfidf = pipeline.named_steps["tfidf"]   
X_train_vec = tfidf.transform(X_train)  

nn = NearestNeighbors(n_neighbors=1, metric="cosine").fit(X_train_vec)
print("Built NearestNeighbors on train TF-IDF matrix.")

def retrieve_closest_completion(prompt):
    v = tfidf.transform([prompt])
    idx = nn.kneighbors(v, return_distance=False)[0][0]
    return X_train.iloc[idx], y_train.iloc[idx] 

Built NearestNeighbors on train TF-IDF matrix.


In [9]:
import joblib
os.makedirs("../models", exist_ok=True)

joblib.dump(pipeline, "../models/relationship_emotion_model.pkl")
print("Saved pipeline to ../models/relationship_emotion_model.pkl")

joblib.dump(nn, "../models/relationship_tfidf_knn.pkl")
print("Saved KNN retriever to ../models/relationship_tfidf_knn.pkl")

Saved pipeline to ../models/relationship_emotion_model.pkl
Saved KNN retriever to ../models/relationship_tfidf_knn.pkl


### Dataset 2

In [10]:
import pandas as pd

train_df = pd.read_parquet("../data/relationships/train-00000-of-00001.parquet")
test_df = pd.read_parquet("../data/relationships/test-00000-of-00001.parquet")
val_df = pd.read_parquet("../data/relationships/validation-00000-of-00001.parquet")

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print("Validation shape:", val_df.shape)

print("\nTrain head:")
print(train_df.head())

Train shape: (100000, 2)
Test shape: (6553, 2)
Validation shape: (6447, 2)

Train head:
                                              prompt  \
0  SUBREDDIT: r/relationships\n\nTITLE: I [16/M] ...   
1  SUBREDDIT: r/tifu\n\nTITLE: TIFU by getting to...   
2  SUBREDDIT: r/relationships\n\nTITLE: I [18/F] ...   
3  SUBREDDIT: r/relationships\n\nTITLE: I [22f] a...   
4  SUBREDDIT: r/relationships\n\nTITLE: I [22] am...   

                                          completion  
0   I have a huge crush on my friend's older brot...  
1   I got incredibly high for chapter, thought I ...  
2   Me: "I'm really really falling for you" Boyfr...  
3   Job is starting to suck. Actively searching f...  
4   Guy I like is frequently busy with work and I...  


In [11]:
X_train = train_df["prompt"]
y_train = train_df["completion"]

X_test = test_df["prompt"]
y_test = test_df["completion"]

X_val = val_df["prompt"]
y_val = val_df["completion"]

print("Training features shape:", X_train.shape)
print("Training labels shape:", y_train.shape)
print("Validation features shape:", X_val.shape)
print("Test features shape:", X_test.shape)

Training features shape: (100000,)
Training labels shape: (100000,)
Validation features shape: (6447,)
Test features shape: (6553,)


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline

model = Pipeline(steps=[
    ("vectorizer", TfidfVectorizer(max_features=5000, stop_words="english")),
    ("nn", NearestNeighbors(n_neighbors=1, metric="cosine"))
])

In [13]:
model.fit(X_train)

print("Model training complete!")

Model training complete!


In [14]:
def retrieve_completion(prompt, model, X_ref, y_ref):
    """Retrieve the most similar completion for a given prompt."""

    prompt_vec = model.named_steps['vectorizer'].transform([prompt])

    idx = model.named_steps['nn'].kneighbors(prompt_vec, return_distance=False)[0][0]
    return y_ref.iloc[idx]

In [15]:
sample_prompt = X_val.iloc[0]
print("Prompt:\n", sample_prompt)

retrieved = retrieve_completion(sample_prompt, model, X_train, y_train)
print("\nRetrieved completion:\n", retrieved)

print("\nGround truth completion:\n", y_val.iloc[0])

Prompt:
 SUBREDDIT: r/AskReddit

TITLE: How do you get someone out of your head?

POST: Hi,
I'm 22, and I have been with my girlfriend for 5 years now. We recently moved together. We've always loved each other intensely.

Problem, I recently started to have feelings for an other person (a friend). This person has had a boyfriend for now 3 years, and has absolutely no ideas. Those feelings were so strong, it was hard to hide them. After 2 months of me being distant and really sad, my girlfriend forced me to say what was bothering me. I'm not a good liar, and now she knows.

We decided to give us a week alone, I went to my parents. 

Now, I'm completely lost. I keep on thinking about this person, and I hate that. I would like for those feelings to go away, to leave me alone. But I can't.  

What do I do? It's been 3 months now, and I'm just desperate.

TL;DR:

Retrieved completion:
  long relationship; fell in love with an other person; admitted it; would like it to disappear, though it 

In [16]:
for i in range(3):
    print("="*80)
    print("Prompt:\n", X_val.iloc[i])
    print("\nRetrieved completion:\n", retrieve_completion(X_val.iloc[i], model, X_train, y_train))
    print("\nGround truth:\n", y_val.iloc[i])

Prompt:
 SUBREDDIT: r/AskReddit

TITLE: How do you get someone out of your head?

POST: Hi,
I'm 22, and I have been with my girlfriend for 5 years now. We recently moved together. We've always loved each other intensely.

Problem, I recently started to have feelings for an other person (a friend). This person has had a boyfriend for now 3 years, and has absolutely no ideas. Those feelings were so strong, it was hard to hide them. After 2 months of me being distant and really sad, my girlfriend forced me to say what was bothering me. I'm not a good liar, and now she knows.

We decided to give us a week alone, I went to my parents. 

Now, I'm completely lost. I keep on thinking about this person, and I hate that. I would like for those feelings to go away, to leave me alone. But I can't.  

What do I do? It's been 3 months now, and I'm just desperate.

TL;DR:

Retrieved completion:
  long relationship; fell in love with an other person; admitted it; would like it to disappear, though it 

In [17]:
import joblib
import os

os.makedirs("../models", exist_ok=True)

joblib.dump(model, "../models/relationship_model.pkl")
print("Saved pipeline as ../models/relationship_model.pkl")

Saved pipeline as ../models/relationship_model.pkl
