In [2]:
import datasets as ds
import torch as t
import transformers as tfs
import re
from sklearn.model_selection import train_test_split

# Preprocessing

In [3]:
# Load the 'conv_ai' dataset through the `datasets` library and keep its 'train' split.
conv_ai_dataset = ds.load_dataset('conv_ai')
# NOTE: the name `train_dataset` is rebound further down to the object returned by
# load_text_dataset, so after that cell it no longer refers to this split.
train_dataset = conv_ai_dataset['train']

Reusing dataset conv_ai (C:\Users\Martin\.cache\huggingface\datasets\conv_ai\conv_ai\1.0.0\ef0f1d9a027f9f8494c5fccc54e32331f4ab4db4a1aa24be00943ce77f49a905)


In [4]:
# Share of the loaded data kept for training when the test set is split off.
train_test_ratio = 0.9
# Share of the training lines kept for training when the eval set is split off
# (applied after train.txt is read back in a later cell).
train_eval_ratio = 0.8
# random_state is fixed so the split is the same on every run.
train_data, test_data = train_test_split(train_dataset, train_size=train_test_ratio, random_state=1)

def build_dataset(df, dest_path):
    """Write the 'context' texts of ``df`` to ``dest_path``, one example per line.

    Each text is converted with ``str()``, stripped of leading/trailing
    whitespace, and has every remaining whitespace character (including
    embedded newlines and tabs) replaced by a single space so that one example
    always occupies exactly one line. Lines have the form
    ``<BOS> text <EOS>``.

    Args:
        df: Any object where ``df['context']`` yields an iterable of texts.
        dest_path: Path of the UTF-8 text file to create or overwrite.

    Returns:
        None. The result is the file written at ``dest_path``.
    """
    bos_token = '<BOS>'
    eos_token = '<EOS>'

    # Build all lines before touching the file, so a failure while reading
    # ``df`` does not leave a truncated file behind.
    lines = []
    for text in df['context']:
        text = str(text).strip()
        # Each whitespace character becomes one space; runs are not collapsed.
        text = re.sub(r"\s", " ", text)
        lines.append(bos_token + ' ' + text + ' ' + eos_token + '\n')

    # The context manager guarantees the file is flushed and closed; the
    # previous version referenced ``f.close`` without calling it.
    with open(dest_path, 'w', encoding='utf-8') as f:
        f.writelines(lines)
# Write both splits to text files in the working directory; these files are
# read back and passed by path to the dataset loader in later cells.
build_dataset(train_data, "train.txt")
build_dataset(test_data, "test.txt")



In [5]:
def read_text(filepath):
    """Return the lines of the UTF-8 text file at ``filepath`` as a list.

    Line terminators are kept on each returned string; an empty file yields
    an empty list.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        # Iterating a text file yields its lines, terminators included.
        return list(handle)

# Read the one-example-per-line files back as lists of strings
# (this rebinds train_data/test_data from dataset splits to lists of lines).
train_data = read_text("train.txt")
test_data = read_text("test.txt")
# Carve an eval subset out of the training lines.
# NOTE(review): this split only changes the in-memory lists; train.txt on disk
# still contains every training line, and later cells load data from that file.
train_data, eval_data = train_test_split(train_data, train_size=train_eval_ratio, random_state=1)

# Length of the longest training line, measured in characters (not tokens).
print(max([len(data) for data in train_data]))

3398


In [6]:
# Start from the pretrained GPT-2 fast tokenizer.
tokenizer = tfs.GPT2TokenizerFast.from_pretrained("gpt2")
# Register the <BOS>/<EOS> markers that build_dataset writes around every
# example, plus a dedicated padding token.
special_tokens_dict = {'bos_token': '<BOS>', 'eos_token': '<EOS>', 'pad_token': '<PAD>'}
# NOTE(review): num_added_toks is not read again in the visible notebook.
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

In [7]:
# Tokenize each split as one batch, with truncation and padding enabled.
# NOTE(review): none of these encodings is read again in the visible notebook;
# training uses the datasets built by load_text_dataset instead.
train_encodings = tokenizer(train_data, truncation=True, padding=True)
test_encodings = tokenizer(test_data, truncation=True, padding=True)
eval_encodings = tokenizer(eval_data, truncation=True, padding=True)
# The test encodings were originally bound to the misspelled name below; keep
# it as an alias so any cell still using the old spelling continues to work.
test_encodigns = test_encodings

In [10]:
# Disabled draft of a custom torch Dataset, kept as a bare string literal so it
# never executes; load_text_dataset below is what the notebook actually uses.
# NOTE(review): as drafted, __init__ appends a single tensor holding all
# encodings to each list, so __len__ would return 1 rather than the number of
# examples. Fix that before re-enabling, or delete this block.
'''
class textDataset(t.utils.data.Dataset):
    def __init__(self, text, tokenizer):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attention_masks = []
        encodings = self.tokenizer(text, truncation=True, padding=True)
        self.input_ids.append(t.tensor(encodings['input_ids']))
        self.attention_masks.append(t.tensor(encodings['attention_mask']))
    def __getitem__(self, idx):
        return self.input_ids[idx], self.attention_masks[idx]
    def __len__(self):
        return len(self.input_ids)

train_dataset = textDataset(train_data, tokenizer)
test_dataset = textDataset(test_data, tokenizer)
print(train_dataset)
'''
def load_text_dataset(train_path, test_path, tokenizer, block_size=512):
    """Build line-by-line train/test datasets and a language-modeling collator.

    Args:
        train_path: Path of the text file holding the training examples.
        test_path: Path of the text file holding the test examples.
        tokenizer: Tokenizer passed to both datasets and to the collator.
        block_size: Value forwarded as ``block_size`` to both datasets.
            Defaults to 512, the value that used to be hard-coded.

    Returns:
        A tuple ``(train_dataset, test_dataset, data_collator)``.
    """
    def _line_dataset(file_path):
        # Single construction site so both splits always share the same settings.
        return tfs.LineByLineTextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=block_size,
        )

    train_dataset = _line_dataset(train_path)
    test_dataset = _line_dataset(test_path)
    # mlm=False turns off the masked-language-modeling mode of the collator.
    data_collator = tfs.DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )
    return train_dataset, test_dataset, data_collator
# Build the datasets from the text files written earlier. This rebinds
# `train_dataset`, which previously held the raw 'train' split of conv_ai.
train_dataset, test_dataset, data_collator = load_text_dataset("train.txt", "test.txt", tokenizer)
# Printing the dataset object only shows its default repr, not its contents.
print(train_dataset)

<transformers.data.datasets.language_modeling.LineByLineTextDataset object at 0x000002AB4BE776D0>


# Training / Fine-tuning

In [9]:
# Load pretrained GPT-2 with its language-modeling head.
model = tfs.GPT2LMHeadModel.from_pretrained("gpt2")
# Resize the token embeddings to the tokenizer's current length, which
# includes the special tokens added above.
model.resize_token_embeddings(len(tokenizer))

Embedding(50260, 768)

In [11]:
# Training configuration: a single epoch, with logging and checkpointing every
# 100 steps; checkpoints go to ./results and logs to ./logs.
training_args = tfs.TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    warmup_steps=10,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    save_steps=100, 
)
# NOTE(review): eval_dataset is the dataset built from test.txt; the separate
# eval_data split created earlier is not used here — confirm this is intended.
trainer = tfs.Trainer(
    model = model,
    args = training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
# Starts fine-tuning; trains `model` in place.
trainer.train()


  1%|          | 2/313 [05:43<14:54:30, 172.57s/it]

# Text generation

In [97]:
# Text-generation pipeline around the `model` object from the training section
# and the tokenizer that carries the added special tokens.
text_generator = tfs.pipeline("text-generation", tokenizer = tokenizer, model=model)

In [101]:
def generate_text(prefix, tokens_pr_gen=20, generator=text_generator):
    """Continue ``prefix`` with the given generator and return the generated text.

    Args:
        prefix: Prompt string to continue.
        tokens_pr_gen: Number of tokens allowed on top of the prompt's own
            token count when computing ``max_length``.
        generator: Callable pipeline used for generation; defaults to the
            notebook-level ``text_generator`` bound at definition time.

    Returns:
        The ``"generated_text"`` field of the first returned sequence.
    """
    # Measure the prompt in tokens with the notebook-level tokenizer so the
    # generation budget is expressed relative to the prompt length.
    prompt_ids = tokenizer(prefix, add_special_tokens=True, return_tensors="pt")["input_ids"][0]
    total_budget = len(prompt_ids) + tokens_pr_gen
    sequences = generator(
        prefix,
        max_length=total_budget,
        num_return_sequences=1,
        repetition_penalty=1.2,
    )
    return sequences[0]["generated_text"]

In [None]:
# Pipeline around the stock 'gpt2' checkpoint, for comparison with the
# fine-tuned generator.
# NOTE(review): this reuses the tokenizer with the added special tokens, while
# the 'gpt2' model loaded here has not had its embeddings resized — confirm
# this combination is intended.
base_generator = tfs.pipeline("text-generation", tokenizer = tokenizer, model='gpt2')

In [103]:
# Sample prompt; generate_text falls back to its default generator
# (text_generator) because none is passed.
test_prefix = "It was a beautiful morning, but then"
print("trained")
print(generate_text(test_prefix))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
<BOS>Hello
