In [30]:
########################################
# SETTINGS
########################################

# JSONL input; each line is scanned for a "text" field by extract_text().
INPUT_FILE = "../books/data.jsonl"
# Destination file; the save step writes one curated story per line.
OUTPUT_FILE = "tiny_stories_curated.txt"

# Inclusive bounds on the whitespace-separated word count that
# is_good_story() accepts (fewer than MIN or more than MAX is rejected).
MIN_WORDS = 30
MAX_WORDS = 300

########################################
# HELPERS
########################################

def extract_text(line):
    """Return the value of the top-level "text" field of one JSONL record.

    The line is decoded with the standard ``json`` module so that escape
    sequences (escaped quotes, escaped newlines, unicode escapes) are
    resolved and keys that follow "text" in the record are not swallowed
    into the story. Lines that are not valid JSON fall back to the original
    substring heuristic, so partially broken records are still salvaged.

    Args:
        line: One raw line read from the JSONL input file.

    Returns:
        The story text, or "" when the line carries no usable "text" field.
    """
    # Imported locally so the helper stays self-contained within its cell.
    import json

    try:
        record = json.loads(line)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        pass
    else:
        if isinstance(record, dict):
            value = record.get("text")
            # Non-string payloads (numbers, null, nested objects) are not stories.
            return value if isinstance(value, str) else ""
        return ""

    # Fallback for malformed lines: take everything after the key and peel
    # off the surrounding quote characters / closing brace.
    key = '"text":'
    position = line.find(key)
    if position == -1:
        return ""

    text = line[position + len(key):].strip()

    if text.startswith('"'):
        text = text[1:]
    if text.endswith('"}'):
        text = text[:-2]
    if text.endswith('"'):
        text = text[:-1]

    return text


def clean_text(text):
    """Normalise whitespace and drop everything outside printable ASCII.

    Args:
        text: Raw story text.

    Returns:
        The text restricted to characters with code points 32..126, with
        words separated by exactly one space and no leading/trailing space.
    """
    # First pass: turn tabs, newlines and runs of spaces into single spaces
    # *before* filtering; otherwise control-character separators would simply
    # be deleted and the neighbouring words glued together.
    collapsed = " ".join(text.split())

    # Keep only basic printable ASCII.
    ascii_only = "".join(c for c in collapsed if 32 <= ord(c) <= 126)

    # Second pass: dropping a standalone non-ASCII token (e.g. an em dash
    # between two spaces) leaves adjacent spaces behind, so collapse again.
    return " ".join(ascii_only.split())


def is_good_story(text):
    """Decide whether a cleaned story passes the curation heuristics.

    A story is accepted when its whitespace-separated word count lies in
    [MIN_WORDS, MAX_WORDS], it contains at least one of ". ! ?", and it
    uses at least 20 distinct words.

    Args:
        text: Cleaned story text.

    Returns:
        True if every heuristic passes, otherwise False.
    """
    tokens = text.split()

    # Length window (both bounds inclusive).
    if not (MIN_WORDS <= len(tokens) <= MAX_WORDS):
        return False

    # At least one sentence-ending mark somewhere in the text.
    has_terminator = any(mark in text for mark in (".", "!", "?"))

    # Crude repetition guard: require a minimum vocabulary size.
    return has_terminator and len(set(tokens)) >= 20


########################################
# CURATION PIPELINE
########################################

seen = set()
good_stories = []

# Stream the JSONL input one record at a time; a story is kept only if it is
# non-empty after cleaning, passes is_good_story(), and has not been seen yet.
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    for line in f:
        text = clean_text(extract_text(line))

        # Short-circuit order matters: empty -> quality -> duplicate.
        if not text or not is_good_story(text) or text in seen:
            continue

        seen.add(text)
        good_stories.append(text)

########################################
# SAVE
########################################


# Write one story per line, so anything that could act as a line break
# inside a story is flattened to a space first.
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    for story in good_stories:
        story = (
            story.replace("\\n", " ")   # remove literal \n
                 .replace("\n", " ")    # remove real newline
                 .replace("\r", " ")
        )

        # The replacements above can leave doubled or leading/trailing spaces
        # (e.g. "end.\n\nNext" -> "end.  Next"); collapse them so each output
        # line is single-spaced.
        story = " ".join(story.split())

        f.write(story + "\n")

print("Done!")
print("Curated stories:", len(good_stories))



Done!
Curated stories: 252


In [31]:
########################################
# SETTINGS
########################################

# JSONL input; each line is scanned for a "text" field by extract_text().
INPUT_FILE = "../books/data.jsonl"
# Destination file, opened in "w" mode by the save step, so any existing
# file at this path is replaced. One story is written per line.
OUTPUT_FILE = "tiny_stories_curated.txt"

# Inclusive bounds on the whitespace-separated word count that
# is_good_story() accepts (fewer than MIN or more than MAX is rejected).
MIN_WORDS = 30
MAX_WORDS = 300

########################################
# SPACY SETUP
########################################

import spacy
# The pipeline is loaded once at cell level; redact_names() calls it on each
# story and reads the PERSON entities from the resulting doc.
nlp = spacy.load("en_core_web_sm")

########################################
# HELPERS
########################################

def extract_text(line):
    """Return the value of the top-level "text" field of one JSONL record.

    Decoding goes through the standard ``json`` module, which resolves
    escape sequences (escaped quotes, escaped newlines, unicode escapes)
    and stops the story at the end of its own string even when other keys
    follow it. A line that fails to parse as JSON is handled by the
    original substring heuristic instead of being discarded.

    Args:
        line: One raw line read from the JSONL input file.

    Returns:
        The story text, or "" when the line carries no usable "text" field.
    """
    # Imported locally so the helper stays self-contained.
    import json

    try:
        parsed = json.loads(line)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        parsed = None
        is_json = False
    else:
        is_json = True

    if is_json:
        if not isinstance(parsed, dict):
            return ""
        payload = parsed.get("text")
        # Only string payloads count as stories.
        return payload if isinstance(payload, str) else ""

    # Heuristic fallback for malformed records: slice after the key, then
    # strip the opening quote and the closing quote / brace if present.
    key = '"text":'
    if key not in line:
        return ""

    text = line[line.find(key) + len(key):].strip()

    if text.startswith('"'):
        text = text[1:]
    if text.endswith('"}'):
        text = text[:-2]
    if text.endswith('"'):
        text = text[:-1]

    return text


def clean_text(text):
    """Reduce text to single-spaced printable ASCII.

    Args:
        text: Raw story text.

    Returns:
        The text with only code points 32..126 retained, every run of
        whitespace reduced to one space, and no leading/trailing space.
    """
    def _single_spaced(value):
        # split() with no arguments discards empty fields, so joining the
        # pieces both collapses runs of whitespace and trims the ends.
        return " ".join(value.split())

    # Whitespace is normalised before filtering so that tabs/newlines become
    # spaces rather than being deleted as non-printable characters.
    printable = "".join(
        ch for ch in _single_spaced(text) if 32 <= ord(ch) <= 126
    )

    # Filtering can remove a whole token that stood between two spaces
    # (e.g. a lone em dash), so normalise once more afterwards.
    return _single_spaced(printable)


def is_good_story(text):
    """Apply the curation heuristics to one cleaned story.

    Args:
        text: Cleaned story text.

    Returns:
        True only if the story has between MIN_WORDS and MAX_WORDS words
        (inclusive), contains ".", "!" or "?", and has at least 20 distinct
        whitespace-separated words; False otherwise.
    """
    words = text.split()
    word_count = len(words)

    checks = (
        word_count >= MIN_WORDS,            # not too short
        word_count <= MAX_WORDS,            # not too long
        any(ch in ".!?" for ch in text),    # must contain punctuation
        len(set(words)) >= 20,              # basic repetition check
    )
    return all(checks)


def redact_names(text):
    """Replace person names detected by the spaCy pipeline with "[NAME]".

    Every entity labelled PERSON contributes its surface text to a single
    pattern; each whole-word occurrence of any of those names in ``text`` is
    then replaced. Matching on word boundaries keeps a short name from
    rewriting part of an unrelated word ("Sam" inside "Same"), and trying
    longer names first keeps a first name from consuming the start of a full
    name and leaving the surname behind.

    Args:
        text: Story text to redact.

    Returns:
        The text with detected person names replaced by "[NAME]"; the input
        is returned unchanged when no PERSON entity is found.
    """
    # Imported locally so the helper stays self-contained.
    import re

    doc = nlp(text)
    names = {ent.text for ent in doc.ents if ent.label_ == "PERSON" and ent.text}
    if not names:
        return text

    # Longest first (ties broken alphabetically for determinism) so that the
    # alternation prefers "Tom Smith" over "Tom" at the same position.
    ordered = sorted(names, key=lambda name: (-len(name), name))
    pattern = re.compile(
        r"(?<!\w)(?:" + "|".join(re.escape(name) for name in ordered) + r")(?!\w)"
    )
    return pattern.sub("[NAME]", text)

########################################
# CURATION PIPELINE
########################################

seen = set()
good_stories = []

# Read the input line by line, keeping each distinct story that survives
# cleaning and the is_good_story() heuristics, in first-seen order.
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    for line in f:
        text = clean_text(extract_text(line))

        # Evaluated left to right: non-empty, good quality, not a duplicate.
        if text and is_good_story(text) and text not in seen:
            seen.add(text)
            good_stories.append(text)

########################################
# SAVE
########################################

# Write one redacted story per line.
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    for story in good_stories:
        story = redact_names(story)

        # Flatten anything that could act as a line break inside a story:
        # the two-character literal backslash-n, a real newline, and a
        # carriage return.
        for line_break in ("\\n", "\n", "\r"):
            story = story.replace(line_break, " ")

        # Collapse the whitespace runs the replacements may have produced.
        story = " ".join(story.split())

        f.write(story + "\n")

print("Done!")
print("Curated stories:", len(good_stories))


Done!
Curated stories: 252
