In [1]:
import os
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments, AutoModelWithLMHead, pipeline

### Load poetries text

In [2]:
#%% open poetries file and keep only authors with more than 5 poetries
filename = os.path.join('data_collection', 'poestries_dict.pkl')
with open(filename, 'rb') as f:
    poetries_dict = pickle.load(f)

# define authors to keep
authors_to_keep = [
    "JOHN KEATS",
    "JOHN DONNE",
    "LUIGI PIRANDELLO",
    "ALDO PALAZZESCHI",
    "ANNA ACHMATOVA",
    "GIACOMO LEOPARDI",
    "GIUSEPPE PARINI",
    "SAFFO",
    "EDMONDO DE AMICIS",
    "FRANCESCO PETRARCA",
    "WILLIAM WORDSWORTH",
    "ROBERT FROST",
    "DINO BUZZATI",
    "MARCEL PROUST",
    "VOLTAIRE",
    "GUILLAUME APOLLINAIRE",
    "EZRA POUND",
    "JAMES JOYCE",
    "GIUSEPPE UNGARETTI",
    "SALVATORE QUASIMODO",
    "WILLIAM BLAKE",
    "JORGE LUIS BORGES",
    "PRIMO LEVI",
    "GABRIELE D ANNUNZIO",
    "PAULO COELHO",
    "EMILY DICKINSON",
    "CHARLES BUKOWSKI",
    "UMBERTO SABA",
    "SAN PAOLO",
    "FRIEDRICH SCHILLER",
    "ARRIGO BOITO",
    "WILLIAM SHAKESPEARE",
    "CORRADO GOVONI",
    "WILLIAM BUTLER YEATS",
    "EDGAR ALLAN POE",
    "VICTOR HUGO",
    "ITALO CALVINO",
    "ADA NEGRI",
    "CARLO BETOCCHI",
    "CESARE PAVESE",
    "GIOVANNI PASCOLI",
    "CHARLES BAUDELAIRE",
    "JACK KEROUAC",
    "GUIDO CAVALCANTI",
    "CAIO VALERIO CATULLO",
    "FRANCESCO D ASSISI",
    "EDUARDO DE FILIPPO",
    "THOMAS STEARNS ELIOT",
    "NICCOLO UGO FOSCOLO",
    "OSCAR WILDE",
    "EUGENIO MONTALE",
    "DANTE ALIGHIERI",
    "PABLO NERUDA",
    "ARTHUR RIMBAUD",
    "ALESSANDRO MANZONI",
    "RUDYARD KIPLING",
    "ANNA FRANK",
    "ALDA MERINI",
    "PIER PAOLO PASOLINI",
    "LEWIS CARROLL",
    "GIOSUE CARDUCCI",
    "GIORGIO CAPRONI",
    "MICHELANGELO BUONARROTI"
]

poetries = []
author_list = []
for key in poetries_dict:
    author = key.replace('-', ' ').upper()
    if author in authors_to_keep:
        for p in poetries_dict[key]:
            poetries.append(p)
            author_list.append(author)

print('-------------------------')
print('Totale poesie processate: {}'.format(len(poetries)))
print('Lista autori:')
for author in set(author_list):
    print(author)
print('-------------------------')

#%% prepare dataset

table = str.maketrans('', '', '!"#$%&\'()*+-/:;<=>?@[\\]^_`{|}~»—…¹”¨«‘“¬ˆ')
for i in range(len(poetries)):
    poetries[i] = poetries[i].lower()
    poetries[i] = poetries[i].replace("\r", "")
    poetries[i] = poetries[i].replace("\n", " \n ")
    poetries[i] = poetries[i].replace("  ", " ")
    poetries[i] = poetries[i].replace("â€™", "'")
    poetries[i] = poetries[i].replace("’", " ")
    # poetries[i] = poetries[i].replace(",", " , ")
    poetries[i] = poetries[i].replace(".", ". ")
    poetries[i] = poetries[i].replace("  ", " ")
    poetries[i] = poetries[i].replace("ú", "ù")
    poetries[i] = poetries[i].replace("ã", "a")
    poetries[i] = poetries[i].replace("â", "a")
    poetries[i] = poetries[i].replace("í", "ì")
    poetries[i] = poetries[i].replace("ô", "o")
    poetries[i] = poetries[i].replace("a©", "è")
    poetries[i] = poetries[i].replace("ï", "i")
    poetries[i] = poetries[i].translate(table)
poetries = np.array(poetries)
authors = np.array(author_list)


def build_text_files(poetries, dest_path):
    data = ''
    for text in poetries:
        data = data + "<|endoftext|>" + text
    with open(dest_path, 'w', encoding='utf-8') as f:
        f.write(data)

train, test = train_test_split(poetries, test_size=0.15)

build_text_files(train, os.path.join('data_collection', 'train_dataset.txt'))
build_text_files(test, os.path.join('data_collection', 'test_dataset.txt'))

print("Train dataset length: " + str(len(train)))
print("Test dataset length: " + str(len(test)))

-------------------------
Totale poesie processate: 766
Lista autori:
CAIO VALERIO CATULLO
GIUSEPPE UNGARETTI
JACK KEROUAC
CARLO BETOCCHI
ALESSANDRO MANZONI
SALVATORE QUASIMODO
JAMES JOYCE
FRANCESCO D ASSISI
PABLO NERUDA
SAFFO
WILLIAM BUTLER YEATS
ARRIGO BOITO
EDGAR ALLAN POE
GIACOMO LEOPARDI
JORGE LUIS BORGES
NICCOLO UGO FOSCOLO
OSCAR WILDE
RUDYARD KIPLING
VICTOR HUGO
UMBERTO SABA
ANNA ACHMATOVA
VOLTAIRE
ITALO CALVINO
WILLIAM WORDSWORTH
DANTE ALIGHIERI
MICHELANGELO BUONARROTI
ANNA FRANK
EDUARDO DE FILIPPO
ARTHUR RIMBAUD
GIORGIO CAPRONI
DINO BUZZATI
MARCEL PROUST
FRIEDRICH SCHILLER
CHARLES BAUDELAIRE
ALDA MERINI
EMILY DICKINSON
EDMONDO DE AMICIS
CESARE PAVESE
SAN PAOLO
PRIMO LEVI
GIOSUE CARDUCCI
JOHN KEATS
PIER PAOLO PASOLINI
LUIGI PIRANDELLO
WILLIAM BLAKE
GUILLAUME APOLLINAIRE
ADA NEGRI
PAULO COELHO
ROBERT FROST
GIUSEPPE PARINI
JOHN DONNE
WILLIAM SHAKESPEARE
GABRIELE D ANNUNZIO
ALDO PALAZZESCHI
EZRA POUND
GUIDO CAVALCANTI
THOMAS STEARNS ELIOT
FRANCESCO PETRARCA
CORRADO GOVONI
CHARLES 

### Load model and tokenizer from pre-trained small italian GPT2 

In [3]:
model = AutoModelWithLMHead.from_pretrained("GroNLP/gpt2-small-italian")
tokenizer = AutoTokenizer.from_pretrained("GroNLP/gpt2-small-italian")

train_path = os.path.join('data_collection', 'train_dataset.txt')
test_path = os.path.join('data_collection', 'test_dataset.txt')

# train_encoded = tokenizer.encode('\n'.join([text for text in train]))
# train_decoded = tokenizer.decode(train_encoded)
# with open(os.path.join('data_collection', 'train_dataset_encoded.txt'), 'w', encoding='utf-8') as f:
#     f.write(train_decoded)

def load_dataset(train_path,test_path,tokenizer):
    train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=128)

    test_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=test_path,
          block_size=128)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset, test_dataset, data_collator

train_dataset, test_dataset, data_collator = load_dataset(train_path, test_path, tokenizer)



### Defining fine-tuning parameters and train model

In [4]:

training_args = TrainingArguments(
    output_dir=os.path.join("models", "gpt2-poetries"), #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=3, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    per_device_eval_batch_size=64,  # batch size for evaluation
    eval_steps=200, # Number of update steps between two evaluations.
    save_steps=200, # after # steps model is saved
    warmup_steps=200, # number of warmup steps for learning rate scheduler
    )

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()
trainer.save_model()

***** Running training *****
  Num examples = 1750
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 165
 52%|█████▏    | 85/165 [50:21<54:18, 40.73s/it]  

### Test model output with custom prompt

In [None]:
pipe = pipeline("text-generation", model=os.path.join("models", "gpt2-poetries"), tokenizer=tokenizer)
result = pipe("il tuo sorriso è come")[0]['generated_text']
print(result)