In [None]:
import json
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import  Dataset, DataLoader

from datasets import load_dataset, load_from_disk
from evaluate import load

from minbpe import BasicTokenizer

## Read Data

In [None]:
# Fetch the CNN/DailyMail dataset (config "3.0.0") through the `datasets` loader.
ds = load_dataset("abisee/cnn_dailymail", "3.0.0")

# Save to disk once, so the `load_from_disk` cell below also works on a fresh
# checkout; the write is skipped when the local copy already exists.
dataset_dir = Path("data/cnn_dailymail_dataset")
if not dataset_dir.exists():
    ds.save_to_disk(str(dataset_dir))

ds

In [4]:
# If you have already saved the dataset to disk (via `ds.save_to_disk(...)`),
# reload that local copy instead of fetching it again.
ds = load_from_disk("data/cnn_dailymail_dataset")
# Bare expression so the notebook displays the dataset summary.
ds

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [9]:
# Keep only the two text columns used downstream; the "id" column is dropped
# from every split.
text_columns = ["article", "highlights"]
train_data, val_data, test_data = (
    ds[split_name].select_columns(text_columns)
    for split_name in ("train", "validation", "test")
)
train_data, val_data, test_data

(Dataset({
     features: ['article', 'highlights'],
     num_rows: 287113
 }),
 Dataset({
     features: ['article', 'highlights'],
     num_rows: 13368
 }),
 Dataset({
     features: ['article', 'highlights'],
     num_rows: 11490
 }))

## Tokenize

In [14]:
# Tokenizer training corpus: the first 1,000 training articles joined by single
# spaces. Rows are sliced before the column is selected so that only those
# 1,000 examples are read, instead of materialising the entire "article"
# column and then discarding most of it.
all_articles_text = " ".join(train_data[:1_000]["article"])
len(all_articles_text)

3530528

In [15]:
# Train a fresh minbpe `BasicTokenizer` on the concatenated article sample,
# with the target vocabulary size named instead of inlined.
TOKENIZER_VOCAB_SIZE = 1_024
tokenizer = BasicTokenizer()
tokenizer.train(all_articles_text, vocab_size=TOKENIZER_VOCAB_SIZE)

In [18]:
# Encode the first training article. The row is indexed before the column so a
# single example is fetched rather than the whole "article" column.
encoded_article_0 = tokenizer.encode(train_data[0]['article'])
# Peek at the first few token ids only.
encoded_article_0[:5]

[76, 79, 78, 68, 79]

In [19]:
# Map the token ids from the previous cell back to text with the same tokenizer.
decoded_article_0 = tokenizer.decode(encoded_article_0)
# Bare expression so the notebook displays the decoded string.
decoded_article_0

'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details o

In [20]:
# Round-trip check: decoding the encoded ids must reproduce the source article
# exactly. Index the row first so only that one example is read.
decoded_article_0 == train_data[0]['article']

True

In [23]:
# Create the output folder up front so the save cannot fail on a fresh
# checkout where `model/` does not exist yet.
Path("model").mkdir(parents=True, exist_ok=True)
tokenizer.save("model/model_article_1000")

In [25]:
# Restore the tokenizer state from the saved model file.
# NOTE(review): `save` above is given a path prefix without an extension while
# `load` is given the full ".model" path; presumably `save` appends the
# extension itself; confirm against the minbpe implementation.
tokenizer.load("model/model_article_1000.model")