# 1. Requirement Installation

In [1]:
!apt install subversion
!pip install transformers

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
The following additional packages will be installed:
  libapr1 libaprutil1 libserf-1-1 libsvn1
Suggested packages:
  db5.3-util libapache2-mod-svn subversion-tools
The following NEW packages will be installed:
  libapr1 libaprutil1 libserf-1-1 libsvn1 subversion
0 upgraded, 5 newly installed, 0 to remove and 34 not upgraded.
Need to get 2,237 kB of archives.
After this operation, 9,910 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 libapr1 amd64 1.6.3-2 [90.9 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/main amd64 libaprutil1 amd64 1.6.1-2 [84.4 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libserf-1-1 amd64 1.3.9-6 [44.4 kB]
Get:4 http://archive.ubuntu.com/ubuntu bionic/universe amd6

# 2. Imports

In [2]:
import os
import re
import tarfile
import shutil
import tarfile
import numpy as np
import pandas as pd
import unicodedata as ud
import csv
from google.colab import drive
from transformers import pipeline
from transformers import GPT2Tokenizer
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, AutoModelWithLMHead, GPT2LMHeadModel
import math
import torch
from google.colab import files
from sklearn.model_selection import train_test_split
from tqdm import tqdm
device = 'cuda:0'

# 3. Data

## 3.1. Download Datasets

In [3]:
!svn checkout https://github.com/Alicia6N/movie_generator/trunk/datasets 

A    datasets/anime_clean.csv
A    datasets/merged_dataset_descriptions.csv
A    datasets/merged_dataset_titles.csv
A    datasets/movies_metadata.csv
A    datasets/netflix_titles.csv
A    datasets/tmdb_5000_movies.csv
Checked out revision 5.


## 3.2. Tokenizer, Train and Test Datasets

In [4]:
merged_dataset = pd.read_csv('datasets/merged_dataset_descriptions.csv')
merged_dataset = merged_dataset.sample(frac=1).reset_index(drop=True)

def build_text_files(data, filename):
    f = open(filename, 'w')
    data = ''

    for row in merged_dataset['overview']:
        aux = str(row).strip()
        aux = re.sub(r"\s", " ", aux)
        data += aux + "  "
    f.write(data)

train, test = train_test_split(merged_dataset, test_size=0.15)
build_text_files(train, 'train_dataset.txt')
build_text_files(test, 'test_dataset.txt')

print("Train dataset length: ", len(train))
print("Test dataset length: ", len(test))

Train dataset length:  26421
Test dataset length:  4663


In [5]:
merged_dataset.head()

Unnamed: 0,title,overview
0,Quel maledetto giorno della resa dei conti,George Benton returns from school to his broth...
1,La Bionda,A young shy man (Tommaso) runs down a blond gi...
2,The Romantic Englishwoman,What is real and what is fiction? Faced with w...
3,Big Money Rustlas,The Insane Clown Posse heads back to the Wild ...
4,My Babysitter's a Vampire,"""My Babysitter's a Vampire,"" a comedic spin on..."


In [6]:
def load_dataset(train_path, test_path, tokenizer):
    train_dataset = TextDataset(tokenizer=tokenizer,
                                file_path=train_path,
                                block_size=128)
    
    test_dataset = TextDataset(tokenizer=tokenizer,
                               file_path=test_path,
                               block_size=128)
    
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, 
                                                    mlm=False)
    
    return train_dataset, test_dataset, data_collator

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

train_path = 'train_dataset.txt'
test_path = 'test_dataset.txt'

train_dataset, test_dataset, data_collator = load_dataset(train_path, test_path, tokenizer)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355256.0, style=ProgressStyle(descript…






# 4. GPT-2 Movie Description Generator Model

## 4.1. Model Generation And Configuration

In [7]:
load_trained = True
#tar_name = 'trained_gpt_medium.tar'
#tar_gdrive_id = '1-eRePCWxcHnTt6Tf_mchxJxi8F3a2KYd'
tar_name = 'trained_gpt_medium_4rd.tar'
tar_gdrive_id = '1gy5MyYAdr7JBwcNou3pAdppKhCmns5ID'
model_path = 'gpt-model'
model = None

if load_trained:
    print('Downloading finetuned model.')
    if not os.path.isfile(tar_name):
        !gdown --id {tar_gdrive_id}

    tar = tarfile.open(tar_name, "r:")
    tar.extractall()
    tar.close()

    !rm {tar_name}

    model = GPT2LMHeadModel.from_pretrained(model_path, local_files_only=True, pad_token_id=tokenizer.eos_token_id)
    

else:
    print('Downloading default pretrained model.')
    model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
model.to(device)

Downloading finetuned model.
Downloading...
From: https://drive.google.com/uc?id=1gy5MyYAdr7JBwcNou3pAdppKhCmns5ID
To: /content/trained_gpt_medium_4rd.tar
1.44GB [00:28, 50.3MB/s]


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2):

## 4.2. Training Configuration And Training

In [8]:
def save_checkpoint_to_gdrive(copy_folder=False):
    drive.mount('/content/drive')

    checkpoint_folder = os.path.join('trained_gpt_medium_3rd')

    if copy_folder:
        shutil.copytree(checkpoint_folder, "/content/drive/My Drive/" + checkpoint_folder)
    else:
        file_path = checkpoint_folder.replace(os.path.sep, '_') + '.tar'
        print(file_path)

        with tarfile.open(file_path, 'w') as tar:
            tar.add('gpt-model/pytorch_model.bin')
            tar.add('gpt-model/config.json')

        shutil.copyfile(file_path, "/content/drive/My Drive/" + file_path)

In [9]:

training_args = TrainingArguments(
    output_dir = 'gpt-model', #The output directory
    overwrite_output_dir = True, #overwrite the content of the output directory
    num_train_epochs = 8, # number of training epochs
    per_device_train_batch_size = 8, # batch size for training
    per_device_eval_batch_size = 8,  # batch size for evaluation
    eval_steps = 500, # Number of update steps between two evaluations.
    save_steps = 10000, # after # steps model is saved
    warmup_steps = 400,
    evaluation_strategy="steps") 
# https://github.com/huggingface/notebooks/blob/master/examples/language_modeling.ipynb
trainer = Trainer(
    model = model,
    args = training_args,
    data_collator = data_collator,
    train_dataset = train_dataset,
    eval_dataset = test_dataset)


In [None]:
if not load_trained:
  trainer.train()
  trainer.save_model()
  save_checkpoint_to_gdrive()

## 4.3. Configuration Testing

In [10]:
def truncate(output):
    sentence = tokenizer.decode(output[0], skip_special_tokens=True)
    index = max(sentence.rfind(i) for i in '!?.')
    sentence = sentence[:index+1]
    return sentence

In [11]:
starters = ['After a zombie outbreak in Las Vegas',
            'It is the year 2077',
            'The human colony on Andromeda',
            'In the deep titanium mines on Alpha Centauri',
            #'At the dawn of the 31st millennium, the Imperium of Man',
            'After a series of harrowing murders',
            'When five college kids arrive at a remote forest cabin',
            #'Mega City One is a vast, violent metropolis',
            'During World War 2',
            'Humanity has mastered interplanetary spaceflight and begun to explore the galaxy.'
            ]

## HABLAR SOBRE TOP-K SAMPLING DE AQUI
#https://huggingface.co/blog/how-to-generate


tmps = [0.7, 0.8, 0.9, 1]
top_ks = [20, 50, 100, 200, 400]
top_ps = [0.85, 0.9, 0.95, 1]

def generate_movies(folder='generated_movies/', download=True):
  if os.path.exists(folder):
    !rm -rf 'generated_movies/'

  for tmp in tmps:
    path = os.path.join(folder, 'tmp_'+str(tmp))
    os.makedirs(path, exist_ok = True)

    for top_k in top_ks:
      for top_p in top_ps:
        file_name =  'topk_' + str(top_k) + '_topp_' + str(top_p) + '.txt'
        f = open(os.path.join(path, file_name), 'w')
        print('Generating new movie descriptions with config:', end='')
        print(' tmp={}, top_k={}, top_p={}'.format(tmp, top_k, top_p))
        aux = ''

        for i, starter in enumerate(starters):
          input = tokenizer.encode(starter, return_tensors='pt').to(device)
          output = model.generate(input, min_length=50, 
                          max_length=70, repetition_penalty=1,
                          temperature=tmp, do_sample=True, top_k=top_k)                
          movie_desc = truncate(output)



          aux += '{}\n'.format(movie_desc)
        f.write(aux)
        f.close()
  if download:
    print("Downloading all generations to folder '{}' in drive".format(folder))
    !zip -r /content/generated_movies.zip /content/generated_movies
    files.download("/content/generated_movies.zip")

generate_movies(folder='generated_movies/')

Generating new movie descriptions with config: tmp=0.7, top_k=20, top_p=0.85


KeyboardInterrupt: ignored

In [12]:
def evaluate_perplexity(sentence):
  # Perplexity
  # https://datascience.stackexchange.com/questions/38540/are-there-any-good-out-of-the-box-language-models-for-python
  model.eval()
  tokenize_input = tokenizer.tokenize(sentence)
  tensor_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
  loss=model(tensor_input.to(device), labels=tensor_input.to(device))
  return math.exp(loss[0])

sentence = "It is the year 2077, a decade after the events of the original Gundam series, and Earth is in the throes of a major energy crisis."
evaluate_perplexity(sentence)

24.091607573185218

In [13]:
def evaluate_test(dataset):
  result = trainer.evaluate(test_dataset)
  return math.exp(result['eval_loss'])
loss = evaluate_test(test_dataset)
print(loss)

11.815751876497622


In [None]:
def evaluate_generations(folder='generated_movies/', 
                         output_file = 'movies_evaluation.csv', download=True):
  print("Writing file '{}'...".format(output_file))
  with open(output_file, 'w', encoding='UTF8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=["Description", "PPL", "Tmp", "TopK", "TopP"])
    writer.writeheader()
    for subfolder in os.listdir(folder):
      if subfolder.startswith('tmp'):
        temperature = subfolder.split('_')[1]
        path = os.path.join(folder, subfolder)
        for file_ in os.listdir(path):
            topk = file_[:-4].split('_')[1]
            topp = file_[:-4].split('_')[3]
            file_path = os.path.join(path, file_)
            f = open(file_path, 'r')
            movies = f.readlines()
            rows  = []
            for movie_desc in movies:
              movie_desc = ' '.join(movie_desc.split())
              perplexity = evaluate_perplexity(movie_desc)
              movie_data = {'PPL': perplexity, 'Tmp': temperature, 
                            'TopK': topk, 'Description': movie_desc, 
                            'TopP': topp}
              rows.append(movie_data)
            writer.writerows(rows)
  if download:
    print("Dowloading file to drive...")
    files.download("/content/movies_evaluation.csv")
  print("Done!")
evaluate_generations(folder='generated_movies/')

Writing file 'movies_evaluation.csv'...
Dowloading file to drive...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Done!
