In [4]:
from datasets import load_dataset
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
import numpy as np




In [18]:

ds = load_dataset("SocialGrep/one-million-reddit-jokes")

In [85]:
ds


DatasetDict({
    train: Dataset({
        features: ['type', 'id', 'subreddit.id', 'subreddit.name', 'subreddit.nsfw', 'created_utc', 'permalink', 'domain', 'url', 'selftext', 'title', 'score'],
        num_rows: 1000000
    })
})

In [19]:
one_million_jokes = ds['train'].to_pandas()

In [87]:
one_million_jokes.columns

Index(['type', 'id', 'subreddit.id', 'subreddit.name', 'subreddit.nsfw',
       'created_utc', 'permalink', 'domain', 'url', 'selftext', 'title',
       'score'],
      dtype='object')

In [88]:
one_million_jokes.isna().sum()

type                   0
id                     0
subreddit.id           0
subreddit.name         0
subreddit.nsfw         0
created_utc            0
permalink              0
domain                 0
url               995528
selftext            4515
title                  0
score                  0
dtype: int64

In [20]:
one_million_jokes_clean = one_million_jokes[['title','selftext','score']]

selftexts_to_filter = ['[removed]', '[deleted]', r'\[removed\]', '\[Deleted\]']

mask = (
    one_million_jokes_clean['selftext'].isna() |
    one_million_jokes_clean['selftext'].isin(selftexts_to_filter) |
    one_million_jokes_clean['title'].isna() |
    one_million_jokes_clean['title'].isin(selftexts_to_filter)
)

one_million_jokes_clean = one_million_jokes_clean[~mask].reset_index(drop=True)

In [90]:
one_million_jokes_clean.shape

(573847, 3)

In [91]:
one_million_jokes_clean["selftext"].value_counts()

selftext
To get to the other side.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

In [92]:
one_million_jokes_clean["title"].value_counts()

title
Why did the chicken cross the road?                                                      584
Knock knock                                                                              329
A man walks into a bar                                                                   321
A man walks into a bar...                                                                272
Knock Knock                                                                              246
                                                                                        ... 
Two priests get pulled over on i95...                                                      1
Crushing cans...                                                                           1
4 Brazilian people were killed in an earthquake                                            1
On my first day in prison, my cellmate said to me                                          1
What did Arnold Schwarzenegger say when invited to the musician 

In [93]:
one_million_jokes_clean[one_million_jokes_clean["title"].str.contains("Ukrainian")]

Unnamed: 0,title,selftext,score
5716,A Ukrainian guy goes to the eye doctor.,The bottom line of the eye chart has the lette...,40
16607,If you turn Indonesia flag upside down you'll ...,An upside down Ukrainian flag,4
27819,I was going to make a joke about the Ukrainian...,But it would probably crash and burn.,0
38683,How much food does it take to kill a Ukrainian?,None.,3
43192,I keep getting that Justin Timberlake song abo...,"""Crimea Riverrrrrr""",8
...,...,...,...
545229,Where do sad Ukrainians go?,Crimea River.,1
546044,Why don't you buy Ukrainian underwear?,Because cher-nob'll fall out,6
551227,What did the Ukrainian say to the whiny American?,Crimea River.,8
557622,Why do Ukrainians not like being late?,They don't wanna be rushin,1


In [94]:
one_million_jokes_clean['score'].value_counts()

score
0        115847
1         55411
2         46104
3         36928
4         30348
          ...  
38131         1
36839         1
23637         1
7725          1
6598          1
Name: count, Length: 8554, dtype: int64

In [95]:
one_million_jokes_clean[one_million_jokes_clean["score"] > 80000]

Unnamed: 0,title,selftext,score
14850,Sad News: The founder of /r/jokes has passed away,"RIP Larry Tesler, the UI designer that created...",142733
59112,What did the reddit user say after detonating ...,EDIT: Wow! This blew up! Thanks for the gold!,93233
76351,If your surprised that Jeffrey Epstein commite...,Imagine how surprised he must have been.\n\nEd...,103652
205925,A new Navy recruit has his first day on the su...,"He speaks with the officer, who assigns him hi...",98257
278707,The only two white actors in Black Panther are...,They're the Tolkien white guys.\n\nEdit: Appar...,94939
286352,Everyone in Hawaii is mad about the malfunctio...,Hawaii **IS** the early warning system.,84346
296092,Ajit Pai.,That's it. That's the whole fucking joke.\n\nE...,94417
302304,If I had a $ for every post I've seen today ab...,I'd have enough money to view a post next year...,82834
302578,Calm down about the Net Neutrality thing...,Paying additional money to access certain site...,136359
341277,V,V\n\n*Edit: seems like the ctrl key on my keyb...,106412


In [99]:
one_million_jokes_clean[one_million_jokes_clean["title"].str.contains("ed]")]

Unnamed: 0,title,selftext,score
73012,[Actually happened] I went to the dentist to h...,At least it wasn't a penis enlargement.,2
73585,"TIL: The ""C"" in China stand for [censored].",Ha ha,3
84874,[actually happened] had a Puerto Rican Co-work...,"So we kept going back an forth, he was very co...",2
117643,I was choking on a piece of steak one night [t...,"While eating dinner with my family, I started ...",33
187259,My fiancee thought I was bringing a condom to ...,We have this miniature dresser in our bedroom ...,5
201039,What do the Twin Towers and number of genders ...,There were two of them that got most of the at...,3
201859,[Doctor Who themed] Why was Sylvester McCoy af...,Because McGann Hurt Eccleston,7
251892,[Modernized] Why do U.N. tanks have rear view ...,To see the village they were supposed to prote...,2
282755,Muslim converting to Christianity [Translated],A muslim decided to convert to Christianity. H...,11
285511,A ship goes out to sea and crashes [fixed],A ship goes out to sea and crashes. The surviv...,2


In [21]:
import re
import string
import pandas as pd

df = one_million_jokes_clean.copy()

def normalize_text(s):
    if pd.isna(s):
        return ""
    s = str(s).lower()
    # remove punctuation: . , ! ? ' " etc
    s = re.sub(f"[{re.escape(string.punctuation)}]", "", s)
    # collapse spaces and new lines
    s = re.sub(r"\s+", " ", s)
    s = s.strip()
    return s

df["title_norm"] = df["title"].map(normalize_text)
df["selftext_norm"] = df["selftext"].map(normalize_text)

# cluster key ignores punctuation differences
df["cluster_key"] = df["title_norm"] + " || " + df["selftext_norm"]

def keep_central_score_median(group):
    target = group["score"].median()
    idx = (group["score"] - target).abs().idxmin()
    return group.loc[[idx]]

one_million_jokes_stageA = (
    df
    .groupby("cluster_key", group_keys=False)
    .apply(keep_central_score_median)
    .reset_index(drop=True)
)

one_million_jokes_stageA = one_million_jokes_stageA.drop(
    columns=["title_norm", "selftext_norm", "cluster_key"]
)

original_rows = len(one_million_jokes_clean)
stageA_rows = len(one_million_jokes_stageA)
print(f"Original rows: {original_rows}")
print(f"After Stage A: {stageA_rows} (removed {original_rows - stageA_rows})")


Original rows: 573847
After Stage A: 540500 (removed 33347)


  df


In [22]:
one_million_jokes_stageA[one_million_jokes_stageA["title"].str.contains("Ukrainian")]

Unnamed: 0,title,selftext,score
4408,A balcony fell in the Ukrainian city and crush...,There’s a big crowd next to the corpse. Old wo...,5
41193,"A Mexican, a Texan and an Ukrainian sitting in...","Suddenly the Mexican gets up, pulls his guns, ...",16
49519,A Russian and a Ukrainian walk into a bakery.,The Ukrainian steals 3 buns and puts them into...,4
49654,"A Russian, Ukrainian and Armenian are riding t...","Armenian guy gets out some vodka, snacks, some...",14
54332,A Ukrainian a Russian and an American are sent...,The Devil meets them when they get there and h...,0
...,...,...,...
523907,Why shouldn't you wear Ukrainian Speedos?,Chernobyl fallout.,15
523909,Why shouldn't you wear Ukrainian underpants?,Coz Chernobyl fallout.,2
523910,Why shouldn't you wear Ukrainian underwear?,Because Chernobyl fallout.,3
523928,Why shouldn’t men wear Ukrainian underpants?,Because Chernobyl fallout.,5


In [26]:
one_million_jokes_stageA.to_csv('one_million_jokes_roughly_cleaned.csv', index=False)

In [31]:
df = pd.read_csv('one_million_jokes_roughly_cleaned.csv')

In [32]:
df.shape

(540532, 3)

In [33]:
def normalize_text(s: str) -> str:
    if pd.isna(s):
        return ""
    s = str(s).lower()
    s = re.sub(r"\s+", " ", s)   # collapse whitespace and newlines
    s = s.strip()
    return s

df["combined"] = (
    df["title"].map(normalize_text) + " " +
    df["selftext"].map(normalize_text)
)

In [34]:

embed_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embed_model = SentenceTransformer(embed_model_name)   # will use CPU by default
# If you have a GPU and want to use it:
# embed_model = SentenceTransformer(embed_model_name, device="cuda")

def embed_texts(texts, batch_size=128):
    """
    texts: pandas Series or list of strings
    returns: numpy array with shape (len(texts), dim)
    """
    embs = embed_model.encode(
        list(texts),
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True,   # dot product becomes cosine similarity
    )
    return embs.astype("float32")


In [35]:
df.shape

(540532, 4)

In [36]:
embeddings = embed_texts(df["combined"], batch_size=64)
print(embeddings.shape)  # should be (num_rows, 768)

Batches:   0%|          | 0/8446 [00:00<?, ?it/s]

(540532, 384)


In [37]:
import faiss

num_vectors, dim = embeddings.shape

index = faiss.IndexHNSWFlat(dim, 32)   # 32 neighbors in the graph
index.hnsw.efConstruction = 200
index.hnsw.efSearch = 64               # can increase for higher recall

print("Adding embeddings to index...")
index.add(embeddings)
print("Index size:", index.ntotal)


Adding embeddings to index...
Index size: 540532


In [38]:
k = 20                     # neighbours per point to inspect
similarity_threshold = 0.9  # tune this

# search all at once
print("Searching nearest neighbours...")
distances, indices = index.search(embeddings, k)   # distances are actually similarities

n = num_vectors
parent = list(range(n))
rank = [0] * n

def find(x: int) -> int:
    while parent[x] != x:
        parent[x] = parent[parent[x]]
        x = parent[x]
    return x

def union(a: int, b: int) -> None:
    ra, rb = find(a), find(b)
    if ra == rb:
        return
    if rank[ra] < rank[rb]:
        parent[ra] = rb
    elif rank[ra] > rank[rb]:
        parent[rb] = ra
    else:
        parent[rb] = ra
        rank[ra] += 1

from tqdm.auto import trange

print("Linking near-duplicate jokes...")
for i in trange(n):
    sims = distances[i]
    neighs = indices[i]

    for sim, j in zip(sims[1:], neighs[1:]):   # skip self at position 0
        if j < 0:
            continue
        if sim < similarity_threshold:
            # results are sorted by similarity, so we can break
            break
        union(i, int(j))

# build clusters from union–find
clusters = {}
for i in range(n):
    root = find(i)
    clusters.setdefault(root, []).append(i)

cluster_sizes = {root: len(inds) for root, inds in clusters.items()}
num_clusters = len(clusters)
num_multi = sum(1 for s in cluster_sizes.values() if s > 1)

print(f"Total clusters: {num_clusters}")
print(f"Clusters with size > 1: {num_multi}")


Searching nearest neighbours...
Linking near-duplicate jokes...


  0%|          | 0/540532 [00:00<?, ?it/s]

Total clusters: 265679
Clusters with size > 1: 3


In [39]:
scores_all = df["score"].to_numpy(dtype=float)
keep_mask = np.ones(n, dtype=bool)

print("Selecting median-score representative per cluster...")
for root, inds in tqdm(clusters.items()):
    if len(inds) == 1:
        continue

    inds_arr = np.array(inds, dtype=int)
    scores = scores_all[inds_arr]
    median = np.median(scores)
    distances_to_median = np.abs(scores - median)
    best_rel = distances_to_median.argmin()
    chosen = inds_arr[best_rel]

    # mark all as dropped, then keep the chosen one
    keep_mask[inds_arr] = False
    keep_mask[chosen] = True

one_million_jokes_stageC = df[keep_mask].reset_index(drop=True)

print(f"Rows before Stage C: {len(df)}")
print(f"Rows after Stage C:  {len(one_million_jokes_stageC)}")
print(f"Removed in Stage C:  {len(df) - len(one_million_jokes_stageC)}")


Selecting median-score representative per cluster...


  0%|          | 0/265679 [00:00<?, ?it/s]

Rows before Stage C: 540532
Rows after Stage C:  265679
Removed in Stage C:  274853


In [40]:
import random

# list of cluster roots with at least two members
multi_roots = [root for root, size in cluster_sizes.items() if size > 1]

def show_random_clusters(num_clusters_to_show=5, max_items_per_cluster=10):
    chosen_roots = random.sample(
        multi_roots,
        min(num_clusters_to_show, len(multi_roots))
    )
    for root in chosen_roots:
        inds = clusters[root]
        print("=" * 100)
        print(f"Cluster root {root} | size {len(inds)}")
        print("-" * 100)
        for idx in inds[:max_items_per_cluster]:
            row = df.iloc[idx]
            print(f"[idx {idx}] score={row['score']}")
            print("TITLE:   ", row["title"])
            print("SELFTEXT:", row["selftext"])
            print()

# example: inspect 5 random clusters
show_random_clusters(num_clusters_to_show=5, max_items_per_cluster=8)


Cluster root 28537 | size 274816
----------------------------------------------------------------------------------------------------
[idx 0] score=0.0
TITLE:    -.. .. -.. + -.-- --- ..- + ... . . + - .... . + .. -. - . .-. -. . - + .--- --- -.- . + .. -. + -- --- .-. ... . + -.-. --- -.. .
SELFTEXT: ..  -  +  .--  .-  ...  +  -..  ---  -  +  -.-.  ---  --  .  -..  -.-- 

[idx 1] score=0.0
TITLE:    ?
SELFTEXT: 2019 is fresh in my mind

[idx 2] score=2.0
TITLE:    #
SELFTEXT: 2537

[idx 6] score=249.0
TITLE:    ,,,,,
SELFTEXT: Chameleon

[idx 9] score=2.0
TITLE:    ...
SELFTEXT: I got spot remover on my dog, now hes gone.

[idx 10] score=1.0
TITLE:    .
SELFTEXT: I joined this sub because legal reasons

[idx 11] score=1.0
TITLE:    ...
SELFTEXT: I just needed to make a few points. 

[idx 14] score=2.0
TITLE:    :)
SELFTEXT: I'm considering attending music lessons, it's sound school.

Cluster root 492011 | size 20
------------------------------------------------------------------------

In [41]:
one_million_jokes_stageC.to_csv('one_million_jokes_cleaned_llm_poc.csv', index=False)

In [16]:
one_million_jokes_stageC[one_million_jokes_stageC["title"].str.contains("Ukrainian")]

Unnamed: 0,title,selftext,score,combined
2056,(British) Why should you never buy Ukrainian u...,Chernobyl fallout...,0.0,(british) why should you never buy ukrainian u...
2489,(Ukrainian joke). Leaders of three countries d...,US President says:” we tax our people at rough...,19.0,(ukrainian joke). leaders of three countries d...
40319,A Russian and a Ukrainian walk into a bakery.,The Ukrainian steals 3 buns and puts them into...,4.0,a russian and a ukrainian walk into a bakery. ...
43434,A Ukrainian guy goes to the eye doctor.,The bottom line of the eye chart has the lette...,40.0,a ukrainian guy goes to the eye doctor. the bo...
43435,A Ukrainian man marries a Russian women ...,"Following the ceremony, at the man's small hou...",1.0,a ukrainian man marries a russian women ... fo...
43436,A Ukrainian woman lost her kids,"So she asked her husband, ""Honey, where's the ...",0.0,a ukrainian woman lost her kids so she asked h...
43437,"A Ukrainian, American, and Polish guy work on ...",Every day they take lunch on top of the buildi...,6.0,"a ukrainian, american, and polish guy work on ..."
49488,An American &amp; Ukrainian at a public loo.,The American pulls out his massive cock and de...,25.0,an american &amp; ukrainian at a public loo. t...
49529,"An American couple, an English couple and a Uk...","The american husband says ""could you pass me t...",49.0,"an american couple, an english couple and a uk..."
53098,An old Ukrainian is cleaning his hunting rifle...,"""Grandfather, the radio says that the Russians...",29465.0,an old ukrainian is cleaning his hunting rifle...


In [42]:
df = pd.read_csv('one_million_jokes_cleaned_llm_poc.csv')

In [43]:
df.shape

(265679, 4)