<a href="https://colab.research.google.com/github/Felipehonorato1/NaturalLanguageProcessing/blob/main/GPT2Finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture 
!pip install transformers
#!pip install datasets
#!unzip tripadvisor_hotel_reviews.zip

In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount = False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!nvidia-smi

Wed Mar 24 15:13:48 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.56       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   75C    P8    12W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [34]:
from transformers import GPT2Tokenizer, AutoModelWithLMHead, set_seed, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling, EarlyStoppingCallback
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
set_seed(301)

In [5]:
def build_text_file(sentences, filename):
  with open(filename, 'w') as handle:
    handle.write("<|endoftext|>".join(sentences))
  

In [6]:
df = pd.read_csv('tripadvisor_hotel_reviews.csv', sep =',')
sentences = df['Review'].values

In [7]:
train_sents, val_sents = train_test_split(sentences, test_size = 0.2, random_state = 301)

In [8]:
build_text_file(train_sents, filename = 'train.txt')
build_text_file(val_sents, filename = 'val.txt')

In [9]:
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')

In [10]:
def load_dataset(train_path, test_path, tokenizer):
  train_dataset = TextDataset(tokenizer = tokenizer, file_path= train_path, block_size = 128)
  test_dataset = TextDataset(tokenizer = tokenizer, file_path= test_path, block_size = 128)
  collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm=False)

  return train_dataset, test_dataset, collator

In [11]:
train_dataset, test_dataset, collator = load_dataset(train_path= 'train.txt', test_path='val.txt', tokenizer= tokenizer)





---


# TRAINING STEP

In [42]:
training_args = TrainingArguments(output_dir = '/content/drive/MyDrive/GPT2Out/gpt2hotelreviews',num_train_epochs = 50, 
                                  per_device_train_batch_size = 32, per_device_eval_batch_size = 64, 
                                  overwrite_output_dir = True,
                                  warmup_steps = 500, evaluation_strategy = 'epoch',
                                  save_steps=-1, load_best_model_at_end=True)

In [43]:
%%capture
model = AutoModelWithLMHead.from_pretrained('distilgpt2')

In [44]:
early_stop = EarlyStoppingCallback()

In [45]:
trainer = Trainer(train_dataset = train_dataset, eval_dataset= test_dataset, data_collator = collator ,model = model, args= training_args, callbacks=[early_stop])

In [47]:
trainer.train()

Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
1,No log,6.961914,2.0741,116.195
2,No log,6.405564,2.0067,120.1
3,No log,6.035982,2.0963,114.966
4,No log,5.785758,2.0312,118.65
5,No log,5.629687,2.0725,116.285
6,No log,5.535892,2.0412,118.07
7,No log,5.46901,2.0703,116.408
8,No log,5.414587,2.0542,117.323
9,No log,5.375424,2.0551,117.269
10,No log,5.332632,2.047,117.732


TrainOutput(global_step=648, training_loss=5.534030113691165, metrics={'train_runtime': 626.0814, 'train_samples_per_second': 2.875, 'total_flos': 1303345727668224.0, 'epoch': 18.0, 'init_mem_cpu_alloc_delta': 1865351, 'init_mem_gpu_alloc_delta': 334731264, 'init_mem_cpu_peaked_delta': 514229, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 518667, 'train_mem_gpu_alloc_delta': 1328507392, 'train_mem_cpu_peaked_delta': 154613225, 'train_mem_gpu_peaked_delta': 5607165952})

In [51]:
trainer.save_model()



---


# GENERATING 

In [52]:
from transformers import pipeline

reviewer = pipeline('text-generation', model = '/content/drive/MyDrive/GPT2Out/gpt2hotelreviews/', tokenizer = 'distilgpt2', config={'max_length':800})

In [53]:
reviewer('really enjoyed')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'really enjoyed 5-10 night stay casablanca nice hotel, wonderful staff, highly recommended.thank you casablanca perfect.we highly recommend staying,  '}]