# Generation

In this notebook, we fine-tune a pretrained GPT-2 transformer to generate speeches conditioned on the speaker's political party.

In [None]:
from datasets import Dataset
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

import sys
# Add the ../data directory (relative to the working directory) to the module
# search path so that the load_data module can be imported from it.
sys.path.append("../data")
from load_data import load_data

# Directory under which the tokenizer and the fine-tuned model are saved.
SAVE_PATH = "../saved_models"

In [None]:
# Load the speech corpus through the project's load_data helper. The cells
# below use the result as a pandas DataFrame and read its "speaker" and "text"
# columns.
data = load_data()

In [None]:
# Party labels, looked up by the integer stored in a row's "speaker" field.
parties = ["extreme gauche", "gauche", "centre", "droite", "extreme droite"]


def format_data(row):
    """Build the training text for one speech.

    The result has the form
    ``<|party|>{party}\\n<|speech|>{text}<|endoftext|>`` so that the model can
    be prompted with a party and asked to continue with a speech.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a "speaker" entry
            that indexes into ``parties`` and a "text" entry holding the speech.

    Returns:
        The formatted training string.

    Raises:
        IndexError: If ``row["speaker"]`` is outside the range of ``parties``.
    """
    party = parties[row["speaker"]]
    # Single-quoted key inside the f-string: reusing the outer double quote is
    # only valid syntax from Python 3.12 on and is a SyntaxError before that.
    return f"<|party|>{party}\n<|speech|>{row['text']}<|endoftext|>"

# Apply format_data to every row (axis="columns" passes one row at a time) and
# keep the resulting strings in a new "formatted" column.
data["formatted"] = data.apply(format_data, axis="columns")

# Only the formatted text is handed over to the Hugging Face dataset.
dataset = Dataset.from_pandas(data.loc[:, ["formatted"]])

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
special_tokens = ["<|party|>", "<|speech|>", "<|endoftext|>"]
# GPT-2 ships without a padding token. Register a dedicated one instead of
# reusing the end-of-sequence token: the causal-LM data collator used for
# training sets every label equal to pad_token_id to -100 (ignored by the
# loss), so with pad == eos the model would never be trained to emit
# "<|endoftext|>" and would not learn where a speech ends.
tokenizer.add_special_tokens({
    'pad_token': "<|pad|>",
    'additional_special_tokens': special_tokens,
})

def tokenize(example):
    """Tokenize the "formatted" text of a dataset example or batch.

    Every sequence is truncated or padded to exactly 512 tokens.

    Args:
        example: Mapping with a "formatted" entry; when used through
            ``dataset.map(..., batched=True)`` this is a batch of strings.

    Returns:
        The tokenizer output for the given text.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 512,
    }
    return tokenizer(example["formatted"], **encode_options)

# Run the tokenizer over the whole dataset, handing it batches of examples.
tokenized_dataset = dataset.map(function=tokenize, batched=True)

# Persist the tokenizer (including the added special tokens) so it can be
# reloaded together with the trained model.
tokenizer.save_pretrained(f"{SAVE_PATH}/generative_tokenizer")

In [None]:
# Start from the pretrained GPT-2 weights and resize the embedding matrix to
# the tokenizer's current vocabulary size (special tokens were added to it).
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

# Use the GPU when one is available, otherwise stay on the CPU.
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# mlm=False: the collator prepares labels for causal (next-token) language
# modeling instead of masked language modeling.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

training_args = TrainingArguments(
    # Output locations for checkpoints and logs.
    output_dir="./results",
    logging_dir="./logs",
    overwrite_output_dir=True,
    # Training schedule.
    num_train_epochs=3,
    per_device_train_batch_size=2,
    # Checkpoint every 500 steps and keep at most two checkpoints on disk.
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)
trainer.train()

# Store the fine-tuned weights under the same root directory as the tokenizer.
model.save_pretrained(f"{SAVE_PATH}/generative_model")