In [59]:
from datasets import load_dataset
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
import numpy as np

In [78]:
# Load the Reddit dad-jokes dump from a local CSV file.
# NOTE(review): an earlier, disabled variant combined `ds['train']` and `ds['test']`
# with `+`; on DataFrames that is element-wise addition, not row concatenation
# (pd.concat would be needed), and `ds` is not defined in the cells shown here.
dad_jokes_full = pd.read_csv('reddit_dadjokes.csv')

In [79]:
# Row/column count of the freshly loaded frame, as a baseline before filtering.
dad_jokes_full.shape

(216328, 5)

In [80]:
# List the available columns before selecting the ones to keep.
dad_jokes_full.columns

Index(['author', 'url', 'joke', 'score', 'date'], dtype='object')

In [81]:
# Keep only the joke text and its score, restricted to jokes scoring at least 5.
# Row filter and column selection happen in a single .loc call; .copy() detaches
# the result from the frame it was selected from.
dad_jokes_full = dad_jokes_full.loc[
    dad_jokes_full['score'] >= 5, ['joke', 'score']
].copy()

In [82]:
# Row/column count after the score filter and column selection.
dad_jokes_full.shape

(60113, 2)

In [64]:
# Count missing values per column before parsing the joke text.
dad_jokes_full.isna().sum()

joke     0
score    0
dtype: int64

In [65]:
import re

# Lower-case words that, as the first word of a text, mark it as a question
# (interrogatives, then auxiliary verbs, then modal verbs).
QUESTION_START_WORDS = set(
    "what why how when where who which "
    "did do does is are can have has had "
    "will would could should".split()
)

def _strip_leading_punct(t: str) -> str:
    # For question detection
    return re.sub(r'^[\s"‚Äú‚Äù\'‚Äò‚Äô(\[\]\-‚Äì‚Äî*‚Ä¢]+', '', t)

def _clean_piece(t: str) -> str:
    """Cleanup for returned question / punchline."""
    if not isinstance(t, str):
        return t
    t = t.strip()
    # strip leading quotes / spaces
    t = re.sub(r'^[\s"‚Äú‚Äù\'‚Äò‚Äô]+', '', t)
    # strip trailing quotes / spaces
    t = re.sub(r'["‚Äú‚Äù\'‚Äò‚Äô]+\s*$', '', t)
    return t.strip()

def looks_like_question(text: str) -> bool:
    """Heuristically decide whether ``text`` reads as a question.

    After leading decoration is removed, the text counts as a question when
    it ends with "?" or when its first word (lower-cased, with surrounding
    punctuation removed) is one of ``QUESTION_START_WORDS``.

    Parameters
    ----------
    text : str
        Candidate text. Non-strings and blank strings are never questions.

    Returns
    -------
    bool
        True if the text looks like a question, False otherwise.
    """
    if not isinstance(text, str):
        return False
    t = text.strip()
    if not t:
        return False
    t = _strip_leading_punct(t)
    if not t:
        return False
    if t.endswith("?"):
        return True
    # First run of characters up to whitespace or a colon.
    first_word_match = re.match(r"([^\s:]+)", t.lower())
    if not first_word_match:
        return False
    first = first_word_match.group(1)
    # Peel punctuation off both ends of the word. Curly quotes (U+201C/D,
    # U+2018/9) and en/em dashes (U+2013/4) are spelled as escapes so the
    # set cannot be corrupted by re-encoding of the file.
    first = first.strip(':"\u201c\u201d\'\u2018\u2019()[]{}.,;!-\u2013\u2014')
    return first in QUESTION_START_WORDS


def split_joke(text: str):
    """
    Split a joke into a setup (or question) and a punchline.

    The rules below are tried in order and the first one that fires wins;
    the third element of the result names that rule:

    - "simple_qmark": split after the first question mark that is not part
      of a URL (a run such as "??" or "?!" stays with the question).
    - "url_punchline": the text ends with an http(s) URL, which becomes the
      punchline.
    - "ellipsis_split": split on the first "..", "..." or ellipsis character.
    - "blank_line": first paragraph vs. everything after a blank line.
    - "single_newline": the first line looks like a question, the remaining
      lines form the punchline.
    - "kid_me_dialogue": a "Kid: ... me: ..." exchange.
    - "last_sentence": a short final sentence following a setup of at least
      three words.
    - "capital_punchline": a short trailing fragment that starts with a
      capital letter or a quote.
    - "keyword_split": split before "because", "incase" or "apparently".

    Returns
    -------
    tuple
        (setup_or_question, punchline, rule_used) on success. When nothing
        can be split the first two items are None and the label is
        "not_string" (input is not a str), "empty" (blank input) or
        "no_split" (no rule fired).
    """
    if not isinstance(text, str):
        return None, None, "not_string"

    s = text.strip()
    if not s:
        return None, None, "empty"

    # Normalise literal backslash-n / backslash-r-n sequences and real line
    # endings to a single "\n".
    s = s.replace("\\r\\n", "\n").replace("\\n", "\n")
    s = s.replace("\r\n", "\n").replace("\r", "\n")

    # --- Question mark ---
    # URLs and question-mark runs are scanned together: a URL token swallows
    # any '?' in its query string (e.g. ".../watch?v=...") so it is not
    # mistaken for the end of a question. The URL token may not end in '?' or
    # '!', so sentence punctuation directly after a link still counts. A run
    # like "??" or "?!" is kept whole on the question side.
    for token in re.finditer(r"https?://\S*[^\s?!]|(\?[?!]*)", s):
        if token.group(1) is None:
            continue  # URL token, not a question mark
        left = s[:token.end()]
        right = s[token.end():]
        if right.strip():
            return _clean_piece(left), _clean_piece(right), "simple_qmark"
        # Only the first question mark is considered; with nothing after it,
        # fall through to the remaining rules.
        break

    # --- URL punchline ---
    # A trailing URL is often the punchline. re.DOTALL lets the setup span
    # several lines.
    url_match = re.search(r"(.+)(\s+|\\n)(https?://[^\s]+)\s*$", s, re.DOTALL)
    if url_match:
        setup = url_match.group(1).strip()
        punchline = url_match.group(3).strip()
        if setup:  # require some setup text before the link
            return _clean_piece(setup), _clean_piece(punchline), "url_punchline"

    # --- Ellipsis ---
    # Split on the first run of two or more dots or on the ellipsis character
    # (U+2026, written as an escape so it survives re-encoding of the file).
    parts = re.split(r"\s*\.{2,}\s*|\s*\u2026\s*", s, maxsplit=1)
    if len(parts) == 2 and parts[0].strip() and parts[1].strip():
        # Drop a second ellipsis that directly follows the first one.
        right = re.sub(r"^\s*(\.{2,}|\u2026)\s*", "", parts[1])
        return _clean_piece(parts[0]), _clean_piece(right.strip()), "ellipsis_split"

    # --- Blank line ---
    # A blank line is treated as a strong enough separator on its own.
    parts = [p.strip() for p in re.split(r"\n\s*\n", s) if p.strip()]
    if len(parts) >= 2:
        left = parts[0]
        right = " ".join(parts[1:])
        return _clean_piece(left), _clean_piece(right), "blank_line"

    # --- Single newline, first line looks like a question ---
    lines = [ln.strip() for ln in s.split("\n") if ln.strip()]
    if len(lines) >= 2 and looks_like_question(lines[0]):
        left = lines[0]
        right = " ".join(lines[1:])
        return _clean_piece(left), _clean_piece(right), "single_newline"

    # --- Dialogue "Kid: ... me: ..." ---
    dialog_match = re.search(
        r'\b[Kk]id:\s*(.+?)\s+\b[mM]e:\s*(.+)',
        s,
        re.DOTALL
    )
    if dialog_match:
        left = f"Kid: {dialog_match.group(1).strip()}"
        right = f"me: {dialog_match.group(2).strip()}"
        return _clean_piece(left), _clean_piece(right), "kid_me_dialogue"

    # --- Last sentence ---
    # Sentences end at '.', '!', '?' or the ellipsis character (U+2026)
    # followed by whitespace.
    sentences = re.split(r"(?<=[\.\!\?\u2026])\s+", s)
    sentences = [sent.strip() for sent in sentences if sent.strip()]

    if len(sentences) >= 2:
        setup = " ".join(sentences[:-1]).strip()
        punchline = sentences[-1].strip()

        setup_words = setup.split()
        punchline_words = punchline.split()

        # Accept when:
        # - the punchline is reasonably short (<= 15 words),
        # - the setup has at least 3 words,
        # - the punchline is not dramatically longer than the setup.
        if (len(punchline_words) <= 15 and len(setup_words) >= 3 and
                len(punchline_words) <= (len(setup_words) + 5)):
            return _clean_piece(setup), _clean_piece(punchline), "last_sentence"

    # --- Capitalised punchline (fallback) ---
    # The punchline starts at a capital letter, apostrophe (straight or
    # U+2019), straight double quote or opening curly quote (U+201C).
    match = re.search(r"(.+)\s+([A-Z'\u2019\"\u201c].*)$", s)
    if match:
        setup = match.group(1).strip()
        punchline = match.group(2).strip()

        setup_words = setup.split()
        punchline_words = punchline.split()

        if len(punchline_words) <= 10 and len(setup_words) >= 3:
            if looks_like_question(setup) or len(punchline_words) <= 8:
                return _clean_piece(setup), _clean_piece(punchline), "capital_punchline"

    # --- Keyword split (for lowercase run-ons) ---
    # Catches "why can't... because it's..." and "why do you... incase you get...".
    keywords = ['because', 'incase', 'apparently']
    for kw in keywords:
        # (anything) (space) (keyword) (space) (anything)
        match = re.search(f"(.+)\\s+({kw})\\s+(.+)$", s, re.IGNORECASE)
        if match:
            setup = match.group(1).strip()
            # The keyword stays at the start of the punchline.
            punchline = f"{match.group(2)} {match.group(3)}".strip()

            # Fire if the setup looks like a question OR is the longer part.
            if looks_like_question(setup) or (len(setup.split()) > len(punchline.split())):
                return _clean_piece(setup), _clean_piece(punchline), "keyword_split"

    return None, None, "no_split"

In [66]:
# Run the splitter on every joke; each result is a
# (question, response, split_rule) tuple.
splits = dad_jokes_full["joke"].map(split_joke)

# Expand the tuples into columns. The source index is reused so that a later
# column-wise concat lines rows up by label rather than by position.
splits_df = pd.DataFrame(
    list(splits),
    index=splits.index,
    columns=["question", "response", "split_rule"],
)


In [67]:
# Preview the per-joke split results: question, response and the rule that fired.
splits_df

Unnamed: 0,question,response,split_rule
3182,Nick Cannon bought Mariah Carrey an undevelope...,"She told him ""I don't want a lot for Christmas"".",last_sentence
4298,What do you call a cop in a bed?,(Lord help me) Pig in a blanket üòÑ‚òùüò≠üê∑,simple_qmark
4299,What item of clothing does an asthmatic person...,Pants.,simple_qmark
4300,"When wearing a bikini, women reveal 90 % of th...",Men are so polite they only look at the covere...,last_sentence
4305,What do you call sailors who curse a lot?,Pirates of the Swearibbean,simple_qmark
...,...,...,...
216306,More clever than usual. My dad and I were watc...,"Him, ""Hakeem""\n\nMe, ""Hakkem who?""\n\nand righ...",simple_qmark
216321,My daughter doesn't wear polka dot dresses any...,"because every time she does, I poke all the do...",ellipsis_split
216322,"Dads, remember NYE protocol. 1. Wait till 11:5...","If we stick to this, I'm sure we can have a gr...",blank_line
216323,Farewell Dad Jokes,See you next year!,ellipsis_split


In [68]:
# Show the rows for which split_joke returned the 'no_split' label.
splits_df[splits_df['split_rule'] == 'no_split']

Unnamed: 0,question,response,split_rule
4409,,,no_split
4492,,,no_split
4496,,,no_split
14230,,,no_split
14277,,,no_split
...,...,...,...
215842,,,no_split
215887,,,no_split
215977,,,no_split
215989,,,no_split


In [69]:
# Put the original joke/score columns and the split columns side by side;
# rows are matched on their index labels.
dad_jokes_splitted = pd.concat(objs=[dad_jokes_full, splits_df], axis="columns")
dad_jokes_splitted

Unnamed: 0,joke,score,question,response,split_rule
3182,Nick Cannon bought Mariah Carrey an undevelope...,5,Nick Cannon bought Mariah Carrey an undevelope...,"She told him ""I don't want a lot for Christmas"".",last_sentence
4298,What do you call a cop in a bed? (Lord help me...,15,What do you call a cop in a bed?,(Lord help me) Pig in a blanket üòÑ‚òùüò≠üê∑,simple_qmark
4299,What item of clothing does an asthmatic person...,7,What item of clothing does an asthmatic person...,Pants.,simple_qmark
4300,"When wearing a bikini, women reveal 90 % of th...",16,"When wearing a bikini, women reveal 90 % of th...",Men are so polite they only look at the covere...,last_sentence
4305,What do you call sailors who curse a lot? Pira...,5,What do you call sailors who curse a lot?,Pirates of the Swearibbean,simple_qmark
...,...,...,...,...,...
216306,More clever than usual. My dad and I were watc...,65,More clever than usual. My dad and I were watc...,"Him, ""Hakeem""\n\nMe, ""Hakkem who?""\n\nand righ...",simple_qmark
216321,My daughter doesn't wear polka dot dresses any...,8,My daughter doesn't wear polka dot dresses any...,"because every time she does, I poke all the do...",ellipsis_split
216322,"Dads, remember NYE protocol. 1. Wait till 11:5...",7,"Dads, remember NYE protocol. 1. Wait till 11:5...","If we stick to this, I'm sure we can have a gr...",blank_line
216323,Farewell Dad Jokes... See you next year!,34,Farewell Dad Jokes,See you next year!,ellipsis_split


In [71]:
# Write the jokes labelled 'no_split' to their own CSV (row labels omitted).
# NOTE(review): split_joke can also return the labels 'empty' and 'not_string'
# without a split; such rows are not selected by this filter - confirm intended.
dad_jokes_splitted.loc[
    dad_jokes_splitted['split_rule'].eq('no_split')
].to_csv('not_splitted.csv', index=False)

In [72]:
# Write every joke whose label is not 'no_split' to a CSV (row labels omitted).
# NOTE(review): split_joke can also return the labels 'empty' and 'not_string'
# with no question/response; such rows would pass this filter - confirm intended.
dad_jokes_splitted.loc[
    dad_jokes_splitted['split_rule'].ne('no_split')
].to_csv('splitted.csv', index=False)

In [74]:
# Keep only the jokes that were actually split. split_joke reports failure with
# three labels ('no_split', 'empty', 'not_string'), all of which carry a None
# question/response, so every one of them is excluded from the final dataset.
# .copy() detaches the result from dad_jokes_splitted.
dad_jokes_splitted_final = dad_jokes_splitted[
    ~dad_jokes_splitted['split_rule'].isin(['no_split', 'empty', 'not_string'])
].copy()

In [75]:
# Reduce the final dataset to three columns, in this order:
# question, response, score.
dad_jokes_splitted_final = dad_jokes_splitted_final.loc[
    :, ['question', 'response', 'score']
]
dad_jokes_splitted_final

Unnamed: 0,question,response,score
3182,Nick Cannon bought Mariah Carrey an undevelope...,"She told him ""I don't want a lot for Christmas"".",5
4298,What do you call a cop in a bed?,(Lord help me) Pig in a blanket üòÑ‚òùüò≠üê∑,15
4299,What item of clothing does an asthmatic person...,Pants.,7
4300,"When wearing a bikini, women reveal 90 % of th...",Men are so polite they only look at the covere...,16
4305,What do you call sailors who curse a lot?,Pirates of the Swearibbean,5
...,...,...,...
216306,More clever than usual. My dad and I were watc...,"Him, ""Hakeem""\n\nMe, ""Hakkem who?""\n\nand righ...",65
216321,My daughter doesn't wear polka dot dresses any...,"because every time she does, I poke all the do...",8
216322,"Dads, remember NYE protocol. 1. Wait till 11:5...","If we stick to this, I'm sure we can have a gr...",7
216323,Farewell Dad Jokes,See you next year!,34


In [76]:
# Persist the final question/response/score table; index=False leaves the
# row labels out of the file.
dad_jokes_splitted_final.to_csv('dad_jokes_splitted_final.csv', index=False)