In [3]:
from datasets import load_dataset
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
import numpy as np




In [4]:

# Load the Reddit jokes dataset by its dataset identifier.
ds = load_dataset("SocialGrep/one-million-reddit-jokes")

In [5]:
# Display the loaded object to inspect its splits, feature names and row count.
ds


DatasetDict({
    train: Dataset({
        features: ['type', 'id', 'subreddit.id', 'subreddit.name', 'subreddit.nsfw', 'created_utc', 'permalink', 'domain', 'url', 'selftext', 'title', 'score'],
        num_rows: 1000000
    })
})

In [6]:
# Convert the 'train' split into a pandas DataFrame for the cleaning steps that follow.
one_million_jokes = ds['train'].to_pandas()

In [7]:
# List the available columns.
one_million_jokes.columns

Index(['type', 'id', 'subreddit.id', 'subreddit.name', 'subreddit.nsfw',
       'created_utc', 'permalink', 'domain', 'url', 'selftext', 'title',
       'score'],
      dtype='object')

In [8]:
# Count missing values per column.
one_million_jokes.isna().sum()

type                   0
id                     0
subreddit.id           0
subreddit.name         0
subreddit.nsfw         0
created_utc            0
permalink              0
domain                 0
url               995528
selftext            4515
title                  0
score                  0
dtype: int64

In [9]:
# Keep only the columns used by the rest of the notebook.
one_million_jokes_clean = one_million_jokes[['title', 'selftext', 'score']]

# Placeholder texts that mark a post whose content is gone.
# `isin` does exact string matching (no regex), so the values are listed
# literally and the columns are stripped/lower-cased before comparing; that way
# variants such as '[Deleted]' or ' [removed] ' are caught as well.
selftexts_to_filter = ['[removed]', '[deleted]']


def _is_missing_or_placeholder(column):
    """Return a boolean Series: True where `column` is NaN or a placeholder text."""
    normalized = column.str.strip().str.lower()
    return column.isna() | normalized.isin(selftexts_to_filter)


mask = (
    _is_missing_or_placeholder(one_million_jokes_clean['selftext']) |
    _is_missing_or_placeholder(one_million_jokes_clean['title'])
)

# Drop the flagged rows and renumber the index from 0.
one_million_jokes_clean = one_million_jokes_clean[~mask].reset_index(drop=True)

In [10]:
# Rows/columns left after dropping missing and placeholder posts.
one_million_jokes_clean.shape

(573847, 3)

In [11]:
# How often each distinct selftext occurs; repeated values hint at duplicate jokes.
one_million_jokes_clean["selftext"].value_counts()

selftext
To get to the other side.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

In [12]:
# How often each distinct title occurs; repeated values hint at duplicate jokes.
one_million_jokes_clean["title"].value_counts()

title
Why did the chicken cross the road?                                                      584
Knock knock                                                                              329
A man walks into a bar                                                                   321
A man walks into a bar...                                                                272
Knock Knock                                                                              246
                                                                                        ... 
Two priests get pulled over on i95...                                                      1
Crushing cans...                                                                           1
4 Brazilian people were killed in an earthquake                                            1
On my first day in prison, my cellmate said to me                                          1
What did Arnold Schwarzenegger say when invited to the musician 

In [13]:
# Spot-check: rows whose title contains "Ukrainian".
title_mentions_ukrainian = one_million_jokes_clean["title"].str.contains("Ukrainian")
one_million_jokes_clean.loc[title_mentions_ukrainian]

Unnamed: 0,title,selftext,score
5716,A Ukrainian guy goes to the eye doctor.,The bottom line of the eye chart has the lette...,40
16607,If you turn Indonesia flag upside down you'll ...,An upside down Ukrainian flag,4
27819,I was going to make a joke about the Ukrainian...,But it would probably crash and burn.,0
38683,How much food does it take to kill a Ukrainian?,None.,3
43192,I keep getting that Justin Timberlake song abo...,"""Crimea Riverrrrrr""",8
...,...,...,...
545229,Where do sad Ukrainians go?,Crimea River.,1
546044,Why don't you buy Ukrainian underwear?,Because cher-nob'll fall out,6
551227,What did the Ukrainian say to the whiny American?,Crimea River.,8
557622,Why do Ukrainians not like being late?,They don't wanna be rushin,1


In [None]:
# Distribution of score values.
one_million_jokes_clean['score'].value_counts()

In [None]:
# Inspect the extreme tail of the score distribution (scores above 80000).
one_million_jokes_clean.loc[lambda frame: frame["score"] > 80000]

In [14]:
# Spot-check titles carrying a bracketed tag ending in "ed]" (e.g. "[fixed]").
# The text is a literal substring, not a pattern, so regex parsing is turned
# off; "]" is regex syntax and should not be left to the regex engine.
title_has_bracket_tag = one_million_jokes_clean["title"].str.contains("ed]", regex=False)
one_million_jokes_clean[title_has_bracket_tag]

Unnamed: 0,title,selftext,score
73012,[Actually happened] I went to the dentist to h...,At least it wasn't a penis enlargement.,2
73585,"TIL: The ""C"" in China stand for [censored].",Ha ha,3
84874,[actually happened] had a Puerto Rican Co-work...,"So we kept going back an forth, he was very co...",2
117643,I was choking on a piece of steak one night [t...,"While eating dinner with my family, I started ...",33
187259,My fiancee thought I was bringing a condom to ...,We have this miniature dresser in our bedroom ...,5
201039,What do the Twin Towers and number of genders ...,There were two of them that got most of the at...,3
201859,[Doctor Who themed] Why was Sylvester McCoy af...,Because McGann Hurt Eccleston,7
251892,[Modernized] Why do U.N. tanks have rear view ...,To see the village they were supposed to prote...,2
282755,Muslim converting to Christianity [Translated],A muslim decided to convert to Christianity. H...,11
285511,A ship goes out to sea and crashes [fixed],A ship goes out to sea and crashes. The surviv...,2


In [15]:
import re
import string
import pandas as pd

df = one_million_jokes_clean.copy()

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def normalize_text(s):
    """Return `s` lower-cased, without ASCII punctuation, single-spaced and trimmed.

    Missing values (None / NaN) become the empty string; any other value is
    converted with `str()` first.
    """
    if pd.isna(s):
        return ""
    without_punctuation = str(s).lower().translate(_PUNCTUATION_TABLE)
    # split() with no argument cuts on any whitespace run (spaces, tabs,
    # newlines) and drops leading/trailing whitespace, so re-joining with one
    # space collapses and trims in a single step.
    return " ".join(without_punctuation.split())

# Add a normalised copy of each text column (new column name -> source column).
normalized_columns = {"title_norm": "title", "selftext_norm": "selftext"}
for target, source in normalized_columns.items():
    df[target] = df[source].map(normalize_text)

# cluster key ignores punctuation differences
df["cluster_key"] = df["title_norm"].str.cat(df["selftext_norm"], sep=" || ")

def keep_central_score_median(group):
    """Return a one-row DataFrame: the row of `group` whose score is nearest the group's median.

    On ties the first such row (in `group` order) is returned, because
    `idxmin` reports the first minimum.
    """
    scores = group["score"]
    gap_to_median = scores.sub(scores.median()).abs()
    # Double brackets keep the result a DataFrame rather than a Series.
    return group.loc[[gap_to_median.idxmin()]]

# Stage A: collapse exact duplicates (same cluster_key) to a single row each,
# keeping the row whose score is closest to the cluster's median score.
#
# This is done with vectorised groupby operations rather than
# `groupby(...).apply(...)`: apply is slow with this many groups, and
# applying over the grouping column is deprecated in recent pandas, after which
# "cluster_key" would no longer be present in the applied result.
cluster_median_score = df.groupby("cluster_key")["score"].transform("median")
distance_to_median = (df["score"] - cluster_median_score).abs()

# One row label per cluster, ordered by cluster_key; on ties idxmin returns the
# first row of the cluster.
representative_labels = distance_to_median.groupby(df["cluster_key"]).idxmin()

one_million_jokes_stageA = (
    df
    .loc[representative_labels]
    .drop(columns=["title_norm", "selftext_norm", "cluster_key"])
    .reset_index(drop=True)
)

original_rows = len(one_million_jokes_clean)
stageA_rows = len(one_million_jokes_stageA)
print(f"Original rows: {original_rows}")
print(f"After Stage A: {stageA_rows} (removed {original_rows - stageA_rows})")


Original rows: 573847
After Stage A: 540500 (removed 33347)


  df


In [None]:
# Spot-check the Stage A result: rows whose title contains "Ukrainian".
stageA_mentions_ukrainian = one_million_jokes_stageA["title"].str.contains("Ukrainian")
one_million_jokes_stageA.loc[stageA_mentions_ukrainian]

In [None]:
# Persist the Stage A result to CSV; index=False leaves the row index out of the file.
one_million_jokes_stageA.to_csv('one_million_jokes_roughly_cleaned.csv', index=False)

In [None]:
# Reload the Stage A output from disk.
# NOTE: this rebinds `df`, which the Stage A cell used for its working copy.
df = pd.read_csv('one_million_jokes_roughly_cleaned.csv')

In [None]:
# Sanity-check the size of the reloaded frame.
df.shape

In [None]:
def normalize_text(s: str) -> str:
    """Return `s` lower-cased, single-spaced and trimmed, with punctuation kept.

    Missing values (None / NaN) become the empty string.

    NOTE: this redefines (and from here on shadows) the `normalize_text`
    defined in the Stage A cell, which additionally strips punctuation.
    """
    if pd.isna(s):
        return ""
    # split() with no argument cuts on any whitespace run (including newlines)
    # and drops leading/trailing whitespace; re-joining collapses and trims.
    return " ".join(str(s).lower().split())

# Text to embed: normalised title and body joined by a single space.
title_part = df["title"].map(normalize_text)
body_part = df["selftext"].map(normalize_text)
df["combined"] = title_part.str.cat(body_part, sep=" ")

In [None]:

embed_model_name = "sentence-transformers/all-MiniLM-L6-v2"
# No device is passed, so sentence-transformers picks one itself
# (NOTE(review): the library checks for an available GPU first, so this is not
# necessarily the CPU — confirm against the installed version).
embed_model = SentenceTransformer(embed_model_name)
# To force a specific device, pass it explicitly, e.g.:
# embed_model = SentenceTransformer(embed_model_name, device="cuda")

def embed_texts(texts, batch_size=128):
    """Encode `texts` with the module-level `embed_model`.

    texts: pandas Series or list of strings
    batch_size: number of texts handed to the model per batch
    returns: float32 numpy array with shape (len(texts), dim); the vectors are
        normalised by the encoder, so a dot product between two rows is their
        cosine similarity
    """
    encode_options = dict(
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    vectors = embed_model.encode(list(texts), **encode_options)
    return vectors.astype("float32")


In [None]:
# Number of rows about to be embedded (one embedding per row).
df.shape

In [None]:
# Embed the combined title+body text of every row.
embeddings = embed_texts(df["combined"], batch_size=64)
print(embeddings.shape)  # should be (num_rows, 384) for all-MiniLM-L6-v2

In [None]:
import faiss

num_vectors, dim = embeddings.shape

# The embeddings are produced with normalize_embeddings=True, so the inner
# product of two vectors is their cosine similarity. IndexHNSWFlat defaults to
# the L2 metric, which returns *distances* (smaller = closer, sorted ascending);
# the linking step below expects *similarities* (larger = closer, sorted
# descending), so the index is built with the inner-product metric explicitly.
index = faiss.IndexHNSWFlat(dim, 32, faiss.METRIC_INNER_PRODUCT)   # 32 neighbors in the graph
index.hnsw.efConstruction = 200
index.hnsw.efSearch = 64               # can increase for higher recall

print("Adding embeddings to index...")
index.add(embeddings)
print("Index size:", index.ntotal)


In [None]:
from tqdm.auto import trange

k = 20                      # neighbours per point to inspect
similarity_threshold = 0.9  # minimum cosine similarity for two jokes to be linked; tune this

# search all at once
print("Searching nearest neighbours...")
distances, indices = index.search(embeddings, k)

# The linking loop needs cosine similarities sorted from most to least similar.
# An inner-product index already returns that. An L2 index returns squared
# distances sorted ascending; for unit-length vectors d^2 = 2 - 2*cos, so they
# are converted here instead of being compared against the threshold as-is.
if index.metric_type == faiss.METRIC_L2:
    similarities = 1.0 - distances / 2.0
else:
    similarities = distances

n = num_vectors
parent = list(range(n))   # union-find parent pointers; every point starts as its own root
rank = [0] * n            # union-by-rank bookkeeping

def find(x: int) -> int:
    """Return the root of x's set, shortening the path along the way."""
    while parent[x] != x:
        parent[x] = parent[parent[x]]
        x = parent[x]
    return x

def union(a: int, b: int) -> None:
    """Merge the sets containing a and b, attaching the lower-rank root to the higher."""
    ra, rb = find(a), find(b)
    if ra == rb:
        return
    if rank[ra] < rank[rb]:
        parent[ra] = rb
    elif rank[ra] > rank[rb]:
        parent[rb] = ra
    else:
        parent[rb] = ra
        rank[ra] += 1

print("Linking near-duplicate jokes...")
for i in trange(n):
    for sim, j in zip(similarities[i], indices[i]):
        if j < 0:
            # faiss pads with -1 when fewer than k neighbours were found
            continue
        if sim < similarity_threshold:
            # results are sorted by similarity, so we can break
            break
        if j == i:
            # Skip the query point itself by index, not by position: the search
            # is approximate and exact duplicates tie, so position 0 is not
            # guaranteed to be the point itself.
            continue
        union(i, int(j))

# build clusters from union-find
clusters = {}
for i in range(n):
    root = find(i)
    clusters.setdefault(root, []).append(i)

cluster_sizes = {root: len(inds) for root, inds in clusters.items()}
num_clusters = len(clusters)
num_multi = sum(1 for s in cluster_sizes.values() if s > 1)

print(f"Total clusters: {num_clusters}")
print(f"Clusters with size > 1: {num_multi}")


In [None]:
scores_all = df["score"].to_numpy(dtype=float)
# Start with nothing kept and flag exactly one row per cluster. Every row index
# belongs to exactly one cluster, so this covers all rows.
keep_mask = np.zeros(n, dtype=bool)

print("Selecting median-score representative per cluster...")
for members in tqdm(clusters.values()):
    if len(members) == 1:
        # A singleton cluster is its own representative.
        keep_mask[members[0]] = True
        continue

    member_idx = np.asarray(members, dtype=int)
    member_scores = scores_all[member_idx]
    gap_to_median = np.abs(member_scores - np.median(member_scores))
    # argmin returns the first minimum, i.e. the earliest member on ties.
    keep_mask[member_idx[gap_to_median.argmin()]] = True

one_million_jokes_stageC = df.loc[keep_mask].reset_index(drop=True)

rows_before = len(df)
rows_after = len(one_million_jokes_stageC)
print(f"Rows before Stage C: {rows_before}")
print(f"Rows after Stage C:  {rows_after}")
print(f"Removed in Stage C:  {rows_before - rows_after}")


In [None]:
import random

# list of cluster roots with at least two members
multi_roots = [root for root, size in cluster_sizes.items() if size > 1]

def show_random_clusters(num_clusters_to_show=5, max_items_per_cluster=10, seed=None):
    """Print a random sample of multi-member clusters for manual inspection.

    num_clusters_to_show: how many clusters to sample (capped at the number of
        multi-member clusters available)
    max_items_per_cluster: at most this many members are printed per cluster
    seed: optional seed; when given, the same clusters are shown on every call,
        which makes a review reproducible. When None, the module-level random
        generator is used.
    """
    sampler = random if seed is None else random.Random(seed)
    sample_size = min(num_clusters_to_show, len(multi_roots))
    for root in sampler.sample(multi_roots, sample_size):
        inds = clusters[root]
        print("=" * 100)
        print(f"Cluster root {root} | size {len(inds)}")
        print("-" * 100)
        for idx in inds[:max_items_per_cluster]:
            row = df.iloc[idx]
            print(f"[idx {idx}] score={row['score']}")
            print("TITLE:   ", row["title"])
            print("SELFTEXT:", row["selftext"])
            print()

show_random_clusters(num_clusters_to_show=5, max_items_per_cluster=8)


In [None]:
# Persist the Stage C result to CSV; index=False leaves the row index out of the file.
one_million_jokes_stageC.to_csv('one_million_jokes_cleaned_llm_poc.csv', index=False)

In [None]:
# Spot-check the Stage C result: rows whose title contains "Ukrainian".
stageC_mentions_ukrainian = one_million_jokes_stageC["title"].str.contains("Ukrainian")
one_million_jokes_stageC.loc[stageC_mentions_ukrainian]

In [19]:
# Reload the Stage C output from disk.
# NOTE: this rebinds `df` again; earlier cells used the name for the Stage A data.
df = pd.read_csv('one_million_jokes_cleaned_llm_poc.csv')

In [20]:
# Show the reloaded frame (the rendered table is truncated to its first and last rows).
df

Unnamed: 0,title,selftext,score,combined
0,-.. .. -.. + -.-- --- ..- + ... . . + - .... ....,.. - + .-- .- ... + -.. --- - + -.-...,0.0,-.. .. -.. + -.-- --- ..- + ... . . + - .... ....
1,...,A man went into a library and asked for a book...,42.0,... a man went into a library and asked for a ...
2,:),\n\nA poor man meets a rich man around Christ...,2.0,:) a poor man meets a rich man around christma...
3,.,An Irishman was walking home when he saw a sig...,4.0,. an irishman was walking home when he saw a s...
4,:),does my thai girlfriend have a penis?\n\nsomet...,11.0,:) does my thai girlfriend have a penis? somet...
...,...,...,...,...
265674,üá®üá¶My wife came home from church 2 hours late t...,I‚Äôm sure it wasn‚Äôt I replied!priests only like...,0.0,üá®üá¶my wife came home from church 2 hours late t...
265675,üë®üèøüë®üèøüë®üèø,What do you call three black guys hanging from...,0.0,üë®üèøüë®üèøüë®üèø what do you call three black guys hangi...
265676,üòÇ,"""I asked my North Korean friend how it was the...",0.0,"üòÇ ""i asked my north korean friend how it was t..."
265677,üòÇüòâüòéüòç,"I was told ""you are what you eat""\n\n\n\nDoes ...",0.0,"üòÇüòâüòéüòç i was told ""you are what you eat"" does th..."


In [22]:
# Check how many repeated titles remain after de-duplication.
df['title'].value_counts()

title
Why did the chicken cross the road?                                                                    383
A man walks into a bar                                                                                 241
A man walks into a bar...                                                                              207
Knock knock                                                                                            192
Knock Knock                                                                                            149
                                                                                                      ... 
I bought some fruit at a farmers market. The guy said satisfaction guaranteed or you can return it.      1
I bought some green apples about a week ago                                                              1
I bought some headache tablets for my wife.                                                              1
I bought some lamb today labell

In [None]:
if df['titiel'] = []