In [None]:
import os
import re
import json
import requests
from pathlib import Path
from urllib.parse import urlparse

In [None]:
# Read the list of text sources; each entry is later unpacked as keyword
# arguments for load().
# JSON is UTF-8 by specification, so decode it explicitly instead of relying on
# the platform's default locale encoding.
with open("./data/sources.json", "r", encoding="utf-8") as f:
    sources = json.load(f)["sources"]


In [None]:
def check(s, ignore):
    """Decide whether the string ``s`` matches any rule in ``ignore``.

    Every key of ``ignore`` is looked up as an attribute name on ``str``; keys
    that ``str`` does not have are skipped. The value decides how the method
    is invoked:

    * a ``str``  -> ``method(s, value)``
    * a ``list`` -> ``method(s, item)`` for each item
    * ``True``   -> ``method(s)`` with no extra argument

    Values of any other type (and ``False``) contribute nothing.

    Returns:
        bool: ``True`` if at least one invocation returned a truthy value.
    """
    absent = object()  # sentinel: distinguishes "no such attribute" from any real value
    outcomes = []
    for method_name, rule in ignore.items():
        method = getattr(str, method_name, absent)
        if method is absent:
            continue

        rule_type = type(rule)
        if rule_type is str:
            outcomes.append(method(s, rule))
        elif rule_type is list:
            outcomes.extend([method(s, item) for item in rule])
        elif rule_type is bool and rule:
            outcomes.append(method(s))

    # All rules are evaluated before the results are combined.
    return any(outcomes)


In [None]:
def substitue(s, replace):
    """Apply every regex substitution in ``replace`` to ``s``.

    Keys are ``re.sub`` patterns and values their replacements. They are
    applied in the mapping's iteration order, each one operating on the output
    of the previous substitution.

    Returns:
        str: The text after all substitutions.
    """
    result = s
    for pattern, replacement in replace.items():
        result = re.sub(pattern, replacement, result)
    return result


In [None]:
def load(
    title="",
    source="",
    start=0,
    end=100,
    ignore=None,
    replace=None,
    cache=".cache",
    force=False,
    timeout=30,
):
    """Fetch a text source (through a local file cache) and return cleaned sentences.

    Args:
        title: Label shown in the progress message.
        source: URL of the text. The last component of its path is used as the
            cache file name.
        start: Index of the first line to keep (slice start).
        end: Index one past the last line to keep (slice end).
        ignore: Rules handed to ``check``; lines for which it returns True are
            dropped. ``None`` means no rules.
        replace: Mapping handed to ``substitue`` for every kept line. ``None``
            means no substitutions.
        cache: Directory in which downloaded files are stored.
        force: Download again even if a cached copy exists.
        timeout: Seconds to wait for the server before the request fails.

    Returns:
        list[str]: The non-empty sentences of the selected lines, each stripped
        of surrounding whitespace and terminated with ".".

    Raises:
        ValueError: If no file name can be derived from ``source``.
        requests.HTTPError: If the download answers with an error status.
    """
    # Use fresh dicts per call rather than shared mutable default arguments.
    ignore = {} if ignore is None else ignore
    replace = {} if replace is None else replace

    print(f"Loading {title}")
    # get filename
    file = os.path.basename(urlparse(source).path)
    if not file:
        # Without a file name the cache path would be the cache directory itself.
        raise ValueError(f"Cannot derive a cache file name from source {source!r}")

    # check cache
    c = Path(cache).absolute().resolve()
    c.mkdir(parents=True, exist_ok=True)
    cfile = c / file
    if force or not cfile.exists():
        response = requests.get(source, timeout=timeout)
        # Fail before touching the cache, so an error page is never stored and
        # silently reused by later runs.
        response.raise_for_status()
        with open(cfile, "wt", encoding="utf-8") as f:
            f.write(response.text)

    # load text
    with open(cfile, "r", encoding="utf-8") as f:
        text = f.read()

    # Drop every non-ASCII character, then keep only the requested line window.
    lines = text.encode("ascii", errors="ignore").decode("ascii").split("\n")[start:end]

    # Skip empty and ignored lines; apply the substitutions to the rest.
    kept = [
        substitue(item, replace).strip()
        for item in lines
        if len(item) > 0 and not check(item, ignore)
    ]

    # cleaned sentences
    sentences = []
    for fragment in " ".join(kept).split("."):
        fragment = fragment.strip()
        # Splitting on "." leaves empty fragments (after the final period or
        # between consecutive periods); skip them instead of emitting a bare ".".
        if fragment:
            sentences.append(f"{fragment}.")
    print("Done!")
    return sentences


In [None]:
# Build the training corpus: one cleaned sentence per line, taken from the
# first two configured sources.
# The encoding is explicit so the file does not depend on the platform default.
# NOTE(review): the Hugging Face "text" loader presumably reads UTF-8 by default - confirm.
with open("./data/homer.raw.txt", "w", encoding="utf-8") as f:
    for source_id in [0, 1]:
        for sentence in load(**sources[source_id]):
            print(sentence, file=f)

## Fine-tuning the model

Based on the Hugging Face causal language modeling example script `run_clm.py`: https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm.py

In [None]:
# Locations of the local model artifacts (all below ./model) and of the corpus file.
root_dir = Path("./model").resolve()
config_path, model_path, tokenizer_path, cache_dir, output_dir = (
    root_dir.joinpath(name)
    for name in ("config", "weights", "tokenizer", ".cache", ".outputs")
)
data_path = Path("./data").resolve().joinpath("homer.raw.txt")

In [None]:
# Load the corpus file as a text dataset, then carve train/validation splits out of it.
from datasets import load_dataset

extension = "text"
data_files = {"train": str(data_path)}
dataset_args = {"keep_linebreaks": True}
raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=cache_dir, **dataset_args)

# train:val split = 80:20 (validation is the first 20% of the examples, train the rest)
validation_split_percentage = 20
split_slices = {
    "validation": f"train[:{validation_split_percentage}%]",
    "train": f"train[{validation_split_percentage}%:]",
}
for split_name, split_slice in split_slices.items():
    raw_datasets[split_name] = load_dataset(
        extension,
        data_files=data_files,
        split=split_slice,
        cache_dir=cache_dir,
        **dataset_args,
    )

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load pretrained model and tokenizer from the local model directory.
# The keyword that AutoTokenizer.from_pretrained reads to select the fast
# tokenizer is `use_fast`; `use_fast_tokenizer` is only the name of the
# command-line flag in the run_clm.py example script.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True, cache_dir=cache_dir)
# NOTE(review): `config` is passed as a path rather than a PretrainedConfig
# instance - confirm the installed transformers version honors a path here.
model = AutoModelForCausalLM.from_pretrained(model_path, config=config_path, cache_dir=cache_dir)
# Size the embedding matrix to the tokenizer's vocabulary.
model.resize_token_embeddings(len(tokenizer))

In [None]:
from transformers import TextGenerationPipeline

# generate text from prefix before fine-tuning
# Pass -1 when the model lives on the CPU, otherwise the index of its device.
device = model.device.index if model.device.type != "cpu" else -1
text_generator = TextGenerationPipeline(model=model, tokenizer=tokenizer, device=device)
for prompt in ("The war in", "The market in America"):
    print(text_generator(prompt)[0]["generated_text"])

In [None]:
# Preprocessing the datasets.
# First we tokenize all the texts.
column_names = raw_datasets["train"].column_names
# Prefer a column literally named "text"; otherwise fall back to the first column.
if "text" in column_names:
    text_column_name = "text"
else:
    text_column_name = column_names[0]


def tokenize_function(examples):
    """Run the tokenizer over the text column of a batch of examples."""
    batch_texts = examples[text_column_name]
    return tokenizer(batch_texts)


# The original columns are removed, so only the tokenizer's outputs remain.
map_options = dict(
    batched=True,
    remove_columns=column_names,
    desc="Running tokenizer on dataset",
)
tokenized_datasets = raw_datasets.map(tokenize_function, **map_options)

# Number of tokens per training chunk (alternative: tokenizer.model_max_length).
block_size = 256

In [None]:
from itertools import chain

# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples):
    """Concatenate a batch of tokenized texts and cut it into ``block_size`` chunks.

    ``block_size`` is read from the notebook's global namespace.

    Args:
        examples: Mapping from column name to a list of per-example lists.
            It must contain the key ``"input_ids"``.

    Returns:
        dict: The same keys, each mapped to a list of chunks of exactly
        ``block_size`` items, plus ``"labels"`` holding a copy of the
        ``"input_ids"`` chunk list. The trailing remainder is dropped, so a
        batch with fewer than ``block_size`` items yields no chunks at all.
    """
    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Always round down to a multiple of block_size. The remainder is dropped
    # even when the whole batch is shorter than one block; otherwise a single
    # short chunk would be emitted and the examples would no longer share one
    # length (the notebook later collates them with default_data_collator).
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

# Regroup every tokenized split into fixed-size blocks with group_texts.
grouping_description = f"Grouping texts in chunks of {block_size}"
lm_datasets = tokenized_datasets.map(group_texts, batched=True, desc=grouping_description)

# Convenience handles for the two splits handed to the Trainer.
train_dataset, eval_dataset = (lm_datasets[split] for split in ("train", "validation"))

In [None]:
def preprocess_logits_for_metrics(logits, labels):
    """Reduce model outputs to the argmax over the last dimension.

    ``labels`` is accepted but not used.
    """
    # Depending on the model and config, the output may be a tuple carrying
    # extra tensors (like past_key_values); the logits always come first.
    scores = logits[0] if isinstance(logits, tuple) else logits
    return scores.argmax(dim=-1)

# Accuracy metric object; compute_metrics calls its .compute() method.
# NOTE(review): `load_metric` was deprecated in later `datasets` releases in
# favour of the separate `evaluate` package - confirm it is available in the
# `datasets` version this notebook is pinned to.
from datasets import load_metric
metric = load_metric("accuracy")

def compute_metrics(eval_preds):
    """Score predictions against labels with the module-level ``metric``.

    ``eval_preds`` unpacks into ``(predictions, labels)``. Both are indexed
    along two axes and are aligned by shifting before being flattened:
    the first label column and the last prediction column are dropped.
    """
    predictions, references = eval_preds
    # Predictions have the same shape as the labels (the argmax was already
    # taken by preprocess_logits_for_metrics), but the labels must be shifted.
    shifted_references = references[:, 1:].reshape(-1)
    shifted_predictions = predictions[:, :-1].reshape(-1)
    return metric.compute(predictions=shifted_predictions, references=shifted_references)

In [None]:
from transformers import TrainingArguments

# Initialize the training arguments.
training_options = {
    "output_dir": str(output_dir),
    "do_train": True,
    "do_eval": True,
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    "eval_accumulation_steps": 1,
    "num_train_epochs": 20,
}
training_args = TrainingArguments(**training_options)

In [None]:
from transformers import Trainer, default_data_collator, is_torch_tpu_available

# The metric hooks are only installed when evaluating and not running on a TPU.
metrics_enabled = training_args.do_eval and not is_torch_tpu_available()

# Initialize our Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if training_args.do_train else None,
    eval_dataset=eval_dataset if training_args.do_eval else None,
    tokenizer=tokenizer,
    # Data collator will default to DataCollatorWithPadding, so we change it.
    data_collator=default_data_collator,
    compute_metrics=compute_metrics if metrics_enabled else None,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics if metrics_enabled else None,
)

In [None]:
# Fallback checkpoint for the training cell, used only when
# training_args.resume_from_checkpoint is not set. None means no fallback.
last_checkpoint = None

In [None]:
# train
if training_args.do_train:
    # An explicit resume_from_checkpoint wins; otherwise fall back to
    # last_checkpoint (which may itself be None).
    checkpoint = training_args.resume_from_checkpoint
    if checkpoint is None:
        checkpoint = last_checkpoint

    train_result = trainer.train(resume_from_checkpoint=checkpoint)
    trainer.save_model()  # Saves the tokenizer too for easy upload

    max_train_samples = len(train_dataset)
    metrics = train_result.metrics
    metrics["train_samples"] = min(max_train_samples, len(train_dataset))

    # Report the metrics, then persist them together with the trainer state.
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

In [None]:
import math

# eval
if training_args.do_eval:
    metrics = trainer.evaluate()
    max_eval_samples = len(eval_dataset)
    metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
    # Perplexity is exp(eval_loss). math.exp raises OverflowError for a very
    # large loss, so report infinity instead of aborting the cell.
    try:
        perplexity = math.exp(metrics["eval_loss"])
    except OverflowError:
        perplexity = float("inf")
    metrics["perplexity"] = perplexity

# Display the metrics as the cell output.
metrics

In [None]:
from transformers import TextGenerationPipeline

# generate text from prefix after fine-tuning
# Pass -1 when the model lives on the CPU, otherwise the index of its device.
device = model.device.index if model.device.type != "cpu" else -1
text_generator = TextGenerationPipeline(model=model, tokenizer=tokenizer, device=device)

for prompt in ("The war in", "The market in America"):
    print(text_generator(prompt)[0]["generated_text"])