In [None]:
import pandas as pd
import re
import numpy as np
from collections import Counter, defaultdict
from nltk.util import ngrams
import os
import tensorflow as tf
import torch
from torch.nn.utils.rnn import pad_sequence
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset

from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read and written.
# force_remount=True requests a fresh mount even if one already exists.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Switch the working directory to the Drive results folder: the cells below load
# the tokenizer and model from '.' and write the output CSV to a relative path.
cd /content/drive/MyDrive/results

/content/drive/MyDrive/results


In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer saved in the current directory and reuse the
# end-of-sequence token as the padding token.
tokenizer = GPT2Tokenizer.from_pretrained('.')
tokenizer.pad_token = tokenizer.eos_token

# Load the language-model weights from the same directory and move them to the device.
GPT2model = GPT2LMHeadModel.from_pretrained('.').to(device)

# Switch to evaluation mode; as the last expression of the cell this also
# displays the model architecture.
GPT2model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
# Prompt used to try out the tokenizer and model
sequence = "write a movie review:"

# Encode the prompt as a PyTorch tensor and place it on the selected device.
inputs = tokenizer.encode(sequence, return_tensors='pt').to(device)

# The prompt contains no padding, so the attention mask is all ones with the
# same shape as the encoded prompt.
attention_mask = torch.ones(inputs.shape, dtype=torch.long, device=device)

In [None]:
# Number of reviews to sample; used for both the loop bound and the progress line.
N_REVIEWS = 5000

gentext = []

# Inference only: disable gradient tracking for the whole sampling loop.
with torch.no_grad():
  for i in range(N_REVIEWS):
    outputs = GPT2model.generate(
        inputs,
        attention_mask=attention_mask,        # mask matching the encoded prompt
        do_sample=True,
        top_k=100,
        pad_token_id=tokenizer.pad_token_id,  # explicitly set the pad token ID
        max_new_tokens=200,
        min_new_tokens=100,
        temperature=0.7,
        top_p=0.95,
        no_repeat_ngram_size=3,
    )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Cut the text after its last '.' so the review does not end mid-sentence.
    # The result must be assigned back: str.rsplit returns a new string, and the
    # original cell discarded it, so untrimmed text was being saved.
    # If the text contains no '.', this simply appends one.
    generated_text = generated_text.rsplit('.', 1)[0] + '.'
    gentext.append(generated_text)
    if i % 100 == 0:
      print(i, '/', N_REVIEWS)

# One row per generated review; the scalar 'model' and 'prompt' values are
# broadcast to every row.
gentext_df = pd.DataFrame({'model': 'distilgpt2_finetuned', 'prompt': sequence, 'review': gentext})

gentext_df.to_csv('distilgpt_finetuned_reviews.csv')

0 / 5000
100 / 5000
200 / 5000
300 / 5000
400 / 5000
500 / 5000
600 / 5000
700 / 5000
800 / 5000
900 / 5000
1000 / 5000
1100 / 5000
1200 / 5000
1300 / 5000
1400 / 5000
1500 / 5000
1600 / 5000
1700 / 5000
1800 / 5000
1900 / 5000
2000 / 5000
2100 / 5000
2200 / 5000
2300 / 5000
2400 / 5000
2500 / 5000
2600 / 5000
2700 / 5000
2800 / 5000
2900 / 5000
3000 / 5000
3100 / 5000
3200 / 5000
3300 / 5000
3400 / 5000
3500 / 5000
3600 / 5000
3700 / 5000
3800 / 5000
3900 / 5000
4000 / 5000
4100 / 5000
4200 / 5000
4300 / 5000
4400 / 5000
4500 / 5000
4600 / 5000
4700 / 5000
4800 / 5000
4900 / 5000
