In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
from datasets import load_dataset


# Load the TinyStories dataset by its id "roneneldan/TinyStories"
# (presumably fetched from the Hugging Face Hub on first use — confirm).
# The result is indexed by split name later in the notebook.
stories = load_dataset("roneneldan/TinyStories")


In [7]:
# Keep only the first 50,000 training stories.
# Slice the rows first and then select the 'text' column so that only those
# rows are materialised; indexing the column first reads the entire 'text'
# column into memory before the slice is applied.
raw_texts = stories['train'][:50000]['text']

In [8]:
import re
import json
def clean_text(txt):
    """Normalise a raw story string for word-level tokenisation.

    Steps, in order:
      1. Drop every character that is not an ASCII letter, a digit, one of
         ``. , ! ? '`` or whitespace.
      2. Collapse each run of whitespace into a single space.
      3. Trim leading/trailing whitespace and lowercase the result.

    Args:
        txt: The raw text to clean.

    Returns:
        The cleaned, lowercased string.
    """
    allowed_only = re.sub(r"[^a-zA-Z0-9.,!?'\s]", "", txt)
    single_spaced = re.sub(r"\s+", " ", allowed_only)
    trimmed = single_spaced.strip()
    return trimmed.lower()

In [9]:
# Apply the cleaning function to every raw story, preserving order.
texts = list(map(clean_text, raw_texts))

# Show one cleaned story as a spot check.
texts[10]

'once upon a time, there was a big car named dependable. he had a very important job. dependable would take a family to the park every day. the family had a mom, dad, and a little girl named lily. they all had a lot of love for each other. one day, when they got to the park, they saw a big sign that said, fun race today! the family was very excited. they knew that dependable was very fast and could win the race. so, they decided to join the race. the race started, and dependable went very fast. the other cars tried to catch up, but dependable was too quick. in the end, dependable won the race! the family was so happy and proud of their car. they knew that their love for each other and their trust in dependable made them win the race. and from that day on, they had even more fun at the park, knowing that they had the fastest and most dependable car around.'

In [10]:
class Tokenizer:
    """Word-level tokenizer whose vocabulary is built from a corpus.

    Words are obtained by splitting on whitespace. Each distinct word is
    assigned an integer id according to its position in the sorted word list,
    starting at 0.

    Attributes are documented in ``__init__``.
    """

    def __init__(self, text):
        """Build the vocabulary from the given corpus.

        Args:
            text: An iterable of strings (one per document), or a single
                string, which is treated as one document.
        """
        # Build from the argument itself rather than from a notebook-level
        # global, so the tokenizer works on a fresh kernel and for any corpus.
        # A bare string is wrapped so it is not iterated character by character.
        documents = [text] if isinstance(text, str) else text

        words = set()
        for document in documents:
            words.update(document.split())

        # word -> id, ids follow sorted word order so the mapping is deterministic.
        self.vocab = {word: i for i, word in enumerate(sorted(words))}
        # id -> word, the inverse of ``vocab``.
        self.inv_vocab = {i: word for word, i in self.vocab.items()}

    def encode(self, text):
        """Convert a string into a list of vocabulary ids.

        Words that are not in the vocabulary are silently skipped, so the
        result may be shorter than the number of words in ``text``.

        Args:
            text: The string to encode; it is split on whitespace.

        Returns:
            A list of integer ids, in word order.
        """
        return [self.vocab[word] for word in text.split() if word in self.vocab]

    def decode(self, ids):
        """Convert a sequence of vocabulary ids back into a string.

        Args:
            ids: An iterable of integer ids.

        Returns:
            The corresponding words joined by single spaces.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        return " ".join([self.inv_vocab[i] for i in ids])

In [11]:
# Build the vocabulary from the cleaned corpus.
tokenizer = Tokenizer(texts)

# Next-token training pairs: for every prefix length `end`, the input is the
# first `end` tokens and the target is the same window shifted right by one.
input_target_pairs = []

for story in texts:
    ids = tokenizer.encode(story)
    # Stories with fewer than two tokens cannot form an (input, target) pair.
    if len(ids) >= 2:
        input_target_pairs.extend(
            (ids[:end], ids[1:end + 1]) for end in range(1, len(ids))
        )

In [12]:
# Write the vocabulary and the first 1000 training pairs to JSON files in the
# current working directory, in that order.
for out_name, payload in (
    ('tokenizer_vocab.json', tokenizer.vocab),
    ('stories_pairs.json', input_target_pairs[:1000]),
):
    with open(out_name, 'w') as out_file:
        json.dump(payload, out_file)

In [32]:
# Spot check: encode a short lowercase phrase into vocabulary ids.
# Words missing from the vocabulary would be dropped by encode().
encoded = tokenizer.encode("one day a little car")
print(encoded)

[22222, 8405, 470, 18638, 5075]


In [33]:
# Round-trip check: map the ids from the previous cell back to words.
decoded = tokenizer.decode(encoded)
print(decoded)

one day a little car
