In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, Trainer, TrainingArguments
import pandas as pd
from tqdm.auto import tqdm
from torch.utils.data import Dataset, random_split
import torch

In [None]:
# # load pretrained
# model_id = "roneneldan/TinyStories-1M"
# model = AutoModelForCausalLM.from_pretrained(model_id)
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# prompt = "Once upon a time there was"
# input_ids = tokenizer.encode(prompt, return_tensors="pt")
# output = model.generate(input_ids, max_length=100, num_beams=1)
# output_text = tokenizer.decode(output[0], skip_special_tokens=True)
# print(output_text)

In [None]:
# load untrained
model_id = "roneneldan/TinyStories-1M"
config = AutoConfig.from_pretrained(model_id, local_files_only=True)
model = AutoModelForCausalLM.from_config(config)
tokenizer = AutoTokenizer.from_pretrained(model_id)
prompt = "Once upon a time there was"
input_ids = tokenizer.encode(prompt, return_tensors="pt")
output = model.generate(input_ids, max_length=100, num_beams=1)
output_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(output_text)

In [None]:
def clean_carr_ret(poem):
    return poem.replace("\r", "")
poems = pd.read_csv("data/PoetryFoundationData.csv")["Poem"].apply(clean_carr_ret)
poems.head()

In [None]:
model.resize_token_embeddings(len(tokenizer))
tokenizer.pad_token = tokenizer.eos_token

In [None]:
max_length = min(max([len(tokenizer.encode(p)) for p in tqdm(poems)]), 2048)
print(f"{max_length = }")

In [None]:
class PoemDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in tqdm(txt_list):
            encodings_dict = tokenizer(
                "<|startoftext|>" + txt + "<|endoftext|>",
                truncation=True,
                max_length=max_length,
                padding="max_length",
            )
            self.input_ids.append(torch.tensor(encodings_dict["input_ids"]))
            self.attn_masks.append(torch.tensor(encodings_dict["attention_mask"]))
    def __len__(self):
        return len(self.input_ids)
    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [None]:
dataset = PoemDataset(poems, tokenizer, max_length=max_length)
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

In [None]:
for d in dataset:
    if len(d) > 10:
        print(len(d))

In [None]:
training_args = TrainingArguments(
    output_dir = "./results",
    num_train_epochs = 10,
    logging_steps=2,
    save_steps = 4,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 1,
    warmup_steps=1,
    weight_decay=0.05,
    logging_dir = "./logs",
    report_to = "none",
)

In [None]:
Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    data_collator = lambda data: {
        "input_ids": torch.stack([f[0] for f in data]),
        "attention_mask": torch.stack([f[1] for f in data]),
        "labels": torch.stack([f[0] for f in data])
    }
).train()

In [None]:
model = torch.load("./model.pth")
generated = tokenizer("<|startoftext|>", return_tensors="pt").input_ids

In [None]:
sample_outputs = model.generate(
    generated,
    do_sample=True,
    top_k=5,
    max_length=50,
    top_p=0.95,
    temperature=1,
    num_return_sequences=2000,
)

for i, sample_output in enumerate(sample_outputs):
    print(f"{i}: {tokenizer.decode(sample_output, skip_special_tokens=True)}")

In [None]:
torch.save(model, "model.pt")

In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [2]:
model_id = "Helsinki-NLP/opus-mt-en-de"

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [30]:
prompts = "Once upon a time, there was a little car named Beep. Beep loved to go fast and play in the sun. Beep was a healthy car because he always had good fuel. Good fuel made Beep happy and strong. One day, Beep was driving in the park when he saw a big tree. The tree had many leaves that were falling. Beep liked how the leaves fall and wanted to play with them. Beep drove under the tree and watched the leaves fall on him. He laughed and beeped his horn. Beep played with the falling leaves all day. When it was time to go home, Beep knew he needed more fuel. He went to the fuel place and got more healthy fuel. Now, Beep was ready to go fast and play again the next day. And Beep lived happily ever after."
# prompts = [s + "." for s in story.split(". ")][:-1]
# for p in prompts:
#     print(p)
print("")
print("encoding")
encodings = tokenizer(prompts, return_tensors="pt", padding=True)
# print(encodings)
print("translating")
trans_codes = model.generate(**encodings)
# print(trans_codes)
print("decoding")
translations = [tokenizer.decode(t, skip_special_tokens=True) for t in trans_codes]
print("")
for t in translations:
    print(t)


encoding
translating
decoding

Einmal gab es ein kleines Auto namens Beep. Beep liebte es, schnell zu gehen und in der Sonne zu spielen. Beep war ein gesundes Auto, weil er immer guten Treibstoff hatte. Guter Treibstoff machte Beep glücklich und stark. Eines Tages fuhr Beep im Park, als er einen großen Baum sah. Der Baum hatte viele Blätter, die fielen. Beep mochte, wie die Blätter fallen und mit ihnen spielen wollte. Beep fuhr unter dem Baum und beobachtete, wie die Blätter auf ihn fielen. Er lachte und piepste sein Horn. Beep spielte mit den fallenden Blättern den ganzen Tag. Als es Zeit war, nach Hause zu gehen, wusste Beep, dass er mehr Treibstoff brauchte. Er ging zum Kraftstoffplatz und bekam mehr gesunden Treibstoff. Nun war Beep bereit, schnell zu gehen und am nächsten Tag wieder zu spielen. Und Beep lebte glücklich bis ans Ende.


In [17]:
from datasets import load_dataset
from tqdm.auto import tqdm

In [18]:
dataset = load_dataset("roneneldan/TinyStories", split="train")

Repo card metadata block was not found. Setting CardData to empty.


In [17]:
translator = Translator()
story = "Hallo, das ist ein kleiner text."
translator.translate(story)

AttributeError: 'NoneType' object has no attribute 'group'

In [7]:
with open("TinyStoriesTranslate\TS_train_de_0_10000.csv", "r") as f:
    lines = f.readlines()
    for line in lines[:100]:
        for sentence in line.split(". ")[:-1]:
            print(sentence + ". ")
        print("------------------------------------------------------------")

Eines Tages fand ein kleines Mädchen namens Lily eine Nadel in ihrem Zimmer. 
Sie wusste, dass es schwierig war, damit zu spielen, weil es scharf war. 
Lily wollte die Nadel mit ihrer Mutter teilen, so dass sie einen Knopf auf ihr Hemd nähen konnte. 
Lily ging zu ihrer Mutter und sagte: "Mama, ich fand diese Nadel. 
Kannst du sie mit mir teilen und mein Hemd nähen?" Ihre Mutter lächelte und sagte: "Ja, Lily, wir können die Nadel teilen und dein Hemd fixieren." Zusammen teilten sie die Nadel und nähten die Taste auf Lilys Hemd. 
Es war nicht schwierig für sie, weil sie miteinander teilten und einander halfen. 
Nachdem sie fertig waren, dankte Lily ihrer Mutter für das Teilen der Nadel und das Fixieren ihres Hemdes. 
------------------------------------------------------------
Einmal gab es ein kleines Auto namens Beep. 
Beep liebte es, schnell zu gehen und in der Sonne zu spielen. 
Beep war ein gesundes Auto, weil er immer guten Treibstoff hatte. 
Guter Treibstoff machte Beep glücklich 

In [10]:
from tqdm.auto import tqdm

In [13]:
for i, line in enumerate(lines):
    if "nix nix nix" in line:
        print(i, line)

951 Er hat sich germant und nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix nix n



In [22]:
for i, line in enumerate(dataset[950]["text"].split(". ")):
    print(line, "            ", lines[950].split(". ")[i])

Ben and Lily were playing in the park              Ben und Lily waren gerade los und wollten nicht, dass sie im Park spielen
They liked to slide, swing and run              Sie liebten es zu rutschen, schwingen und laufen
But they also liked to search for things              Aber sie suchten auch gerne nach Dingen
They searched for bugs, flowers and rocks              Sie suchten nach Käfern, Blumen und Felsen
Sometimes they found something special, like a shiny coin or a feather.

One day, they saw a big hill              Manchmal fanden sie etwas Besonderes, wie eine glänzende Münze oder eine Feder
They wanted to climb it and see what was on the other side              Eines Tages sahen sie einen großen Hügel
They asked their mom if they could go              Sie wollten ihn klettern und sehen, was auf der anderen Seite war
She said yes, but be careful and come back soon              Sie fragten ihre Mutter, ob sie gehen könnten
Ben and Lily ran to the hill and started to climb      

IndexError: list index out of range

In [20]:
print(dataset[951]["text"])
print(lines[951])

Lily and Max were at the zoo with their mom. They wanted to see the penguins. They liked how they looked in their black and white coats. They followed the signs to the penguin house.

When they got there, they saw a big pool of water with ice and rocks. There were many penguins in the water and on the land. Some of them were swimming and diving. Some of them were standing and flapping their wings. Some of them were marching in a line.

"Look, mom, they are marching!" Lily said. "They look like soldiers!"

"Yes, they do," mom said. "They march to keep warm and to stay together. It is very cold where they live."

"Can we march like them?" Max asked.

"Sure, why not?" mom said. "Let's march around the pool and see if they notice us."

Lily and Max started to march like the penguins. They lifted their feet high and swung their arms. They made funny noises with their mouths. They had fun.

But one penguin did not like their marching. He was the leader of the penguin group. He thought they w