In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 4.9 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 42.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 47.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.50.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 45.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.3 MB/s 
Collecting click==8.0
  Downloading click-8.0.0-py3-none-any.whl (96 kB)
[K     |████████████████████████████████| 96 kB 5.7 MB/s 
Building

# Imports

In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer,  AutoModelWithLMHead, get_linear_schedule_with_warmup
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, DistributedSampler
from tqdm import trange, tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import json
import os
import re

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader, RandomSampler
from torch.nn import functional as F

from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
from transformers import AdamW,  get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split
import numpy as np
from tqdm.notebook import tqdm

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

# Load dataset

In [None]:
import pandas as pd 

In [None]:
data = pd.read_csv('/content/drive/My Drive/NLP_humor/jokes_dataset_MAIN_PREPROC.csv', 
                 sep=',', 
                 encoding='utf-8')

# data = pd.read_csv('/content/drive/My Drive/NLP_humor/train_val_test/just_title/jokes_df_TITLE.csv', 
#                  sep=',', 
#                  encoding='utf-8')

In [None]:
data.head()

Unnamed: 0,joke,rank
0,I hate how you cannot even say black paint any...,1
1,What is the difference between a Jew in Nazi G...,0
2,I recently went to America....,0
3,"Brian raises his hand and says, ""He is in Heav...",1
4,You hear about the University book store worke...,0


In [None]:
data.dropna(inplace=True)

In [None]:
data.isna().sum()

id      0
joke    0
rank    0
dtype: int64

In [None]:
data['Text_length'] = data.joke.str.split().str.len()

In [None]:
df = data[(data['rank'] == 4) & (data['Text_length'] >= 8) & (data['Text_length'] <=100)]

In [None]:
df.shape

(28415, 4)

In [None]:
df = df.sample(n=5000, random_state=42)

In [None]:
# with open("/content/drive/My Drive/NLP_jokes_generation/data/reddit_preproc.json", "r") as read_file:
#     jokes = json.load(read_file)

In [None]:
# jokes_5_score = []
# for joke in jokes:
#     if joke['score'] == 5:
#         jokes_5_score.append(joke)

In [None]:
class JokesDataset(Dataset):
    def __init__(self, jokes, tokenizer, max_length):

        self.jokes = jokes
        self.input_ids = []
        self.attn_masks = []

        for _,joke in jokes.iterrows():
            
            encodings_dict = tokenizer('<SOS> '+ joke['joke'] + ' <EOS>', truncation=True, max_length=max_length, padding='max_length')

            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))
        
    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx] 

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<SOS>', eos_token='<EOS>')
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
dataset = JokesDataset(df, tokenizer, max_length=100)

In [None]:
train_idx, valid_idx= train_test_split(np.arange(len(dataset)),
                                        test_size=0.2,
                                        shuffle=True, 
                                        random_state=42)

train_sampler = torch.utils.data.SubsetRandomSampler(train_idx)
valid_sampler = torch.utils.data.SubsetRandomSampler(valid_idx)

dataloaders = {'train': torch.utils.data.DataLoader(dataset, batch_size=3, sampler=train_sampler),
'val': torch.utils.data.DataLoader(dataset, batch_size=2,sampler=valid_sampler)}

dataset_sizes = {'train': len(train_idx), 'val': len(valid_idx)}

# Training

In [None]:
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)
model = GPT2LMHeadModel.from_pretrained('gpt2', config=configuration)
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)
model.train()

# optimizer = AdamW(model.parameters(), lr=5e-3, eps=1e-8)  # var4 ужасныее скоры
# optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)# var4

optimizer = AdamW(model.parameters(), lr=5e-4, eps=1e-8)# var5
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=10000, num_training_steps = -1)



### VAR 5

In [None]:
for epoch in range(4):
    epoch_loss_train = 0
    model.train()
    for i, batch in enumerate(tqdm(dataloaders['train'])):  
        input_ids = batch[0].to(device)
        labels = batch[0].to(device)
        masks = batch[1].to(device)

        outputs = model(input_ids, labels=labels, attention_mask=masks)
        
        loss = outputs[0]  

        batch_loss = loss.item()
        epoch_loss_train += batch_loss

        loss.backward()

        optimizer.step()

        scheduler.step() 
    
    model.eval()
    epoch_loss_val = 0

    for batch in tqdm(dataloaders['val']):
        input_ids = batch[0].to(device)
        labels = batch[0].to(device)
        masks = batch[1].to(device)

        with torch.no_grad():
            outputs = model(input_ids, labels=labels, attention_mask=masks)
        
            loss = outputs[0]

        batch_loss = loss.item()
        epoch_loss_val += batch_loss 

    print('Average train loss: {}'.format(epoch_loss_train/len(dataloaders['train'])))
    print('Average val loss: {}'.format(epoch_loss_val/len(dataloaders['val'])))
    torch.save(model.state_dict(), '/content/drive/My Drive/NLP_humor/models/GPT-2_jokes_v5.h5')

  0%|          | 0/1334 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

Average train loss: 4.020690818776613
Average val loss: 2.39251850938797


  0%|          | 0/1334 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

Average train loss: 2.612536357707348
Average val loss: 2.7773936430215835


  0%|          | 0/1334 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

Average train loss: 3.3992729108372908
Average val loss: 3.262973517179489


  0%|          | 0/1334 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

Average train loss: 5.723382652610138
Average val loss: 10.556561175346374


In [None]:
state_dict = torch.load('/content/drive/My Drive/NLP_humor/models/GPT-2_jokes_v5.h5')
model.load_state_dict(state_dict)

<All keys matched successfully>

In [None]:
model.eval()
generated = torch.tensor(tokenizer.encode('<SOS>')).unsqueeze(0)
generated = generated.to(device)

print(generated)

sample_outputs = model.generate(
                                generated, 
                                do_sample=True,   
                                top_k=50, 
                                max_length = 768,
                                top_p=0.95, 
                                num_return_sequences=3
                                )

for i, sample_output in enumerate(sample_outputs):
    print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True).replace('\n',' ')))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[50257]], device='cuda:0')
0:  to you you you Because you Because Because Because you you you Because you you you you you you you you you you you you you you you you Because Because you Because you you you Because Because Because Because Because Because you Because you Because Because Because Because Because Because Because you Because you you Because Because you Because Because Because Because Because Because Because you Because you Because you Because Because Because Because you you Because Because Because Because Because you you Because Because Because Because Because Because Because you you Because Because you Because Because Because Because you Because Because Because you you Because Because Because Because Because you you Because Because Because Because you Because you Because Because Because you Because you Because you Because Because Because you Because Because you Because Because you Because you Because you you Because you Because Because you Because Because you you yo

### VAR 4

In [None]:
for epoch in range(4):
    epoch_loss_train = 0
    model.train()
    for i, batch in enumerate(tqdm(dataloaders['train'])):  
        input_ids = batch[0].to(device)
        labels = batch[0].to(device)
        masks = batch[1].to(device)

        outputs = model(input_ids, labels=labels, attention_mask=masks)
        
        loss = outputs[0]  

        batch_loss = loss.item()
        epoch_loss_train += batch_loss

        loss.backward()

        optimizer.step()

        scheduler.step() 
    
    model.eval()
    epoch_loss_val = 0

    for batch in tqdm(dataloaders['val']):
        input_ids = batch[0].to(device)
        labels = batch[0].to(device)
        masks = batch[1].to(device)

        with torch.no_grad():
            outputs = model(input_ids, labels=labels, attention_mask=masks)
        
            loss = outputs[0]

        batch_loss = loss.item()
        epoch_loss_val += batch_loss 

    print('Average train loss: {}'.format(epoch_loss_train/len(dataloaders['train'])))
    print('Average val loss: {}'.format(epoch_loss_val/len(dataloaders['val'])))
    torch.save(model.state_dict(), '/content/drive/My Drive/NLP_humor/models/GPT-2_jokes_v4.h5')

  0%|          | 0/1334 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

Average train loss: 7.099418396445765
Average val loss: 2.3528895798921585


  0%|          | 0/1334 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

Average train loss: 2.3714976559603946
Average val loss: 2.3507899775505066


  0%|          | 0/1334 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

Average train loss: 2.298494251548261
Average val loss: 2.247171772956848


  0%|          | 0/1334 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

Average train loss: 2.228629337481294
Average val loss: 2.2353482877016067


In [None]:
state_dict = torch.load('/content/drive/My Drive/NLP_humor/models/GPT-2_jokes_v4.h5')
model.load_state_dict(state_dict)

<All keys matched successfully>

In [None]:
model.eval()
generated = torch.tensor(tokenizer.encode('<SOS>')).unsqueeze(0)
generated = generated.to(device)

print(generated)

sample_outputs = model.generate(
                                generated, 
                                do_sample=True,   
                                top_k=50, 
                                max_length = 768,
                                top_p=0.95, 
                                num_return_sequences=3
                                )

for i, sample_output in enumerate(sample_outputs):
    print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True).replace('\n',' ')))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[50257]], device='cuda:0')
0:  I my a other I a will the most is the new in my wife. I is, I could he all man at a man of just can is a you to beI man of it are we is it will not is a was, not " The was that is not time " ". he, the just in the new way in to you have get the is " I is she, not to to not are the difference for, to not is, the way he are this is he am to two do the are the are not know an a car is this was is your was all is the father, the have the are is are " a are see is not other is not will not was the new I is know a the have. omeromers The I A A is and A I I A was and to will all are's


1:  What difference, has the a would it do the is a you, a little one one you a I he you " " that you is the woman. " The very has just is and it you know.It is he.I had, a is " and his time just I be not that and is he have for the first is you not not do to " " ". lexics P are,. extremist isb's Security fats A was! checkout and A I is


2:  What is a I a was a am it has

### VAR 3

In [None]:
for epoch in range(3):
    epoch_loss_train = 0
    model.train()
    for i, batch in enumerate(tqdm(dataloaders['train'])):  
        input_ids = batch[0].to(device)
        labels = batch[0].to(device)
        masks = batch[1].to(device)

        outputs = model(input_ids, labels=labels, attention_mask=masks)
        
        loss = outputs[0]  

        batch_loss = loss.item()
        epoch_loss_train += batch_loss

        loss.backward()

        optimizer.step()

        scheduler.step() 
    
    model.eval()
    epoch_loss_val = 0

    for batch in tqdm(dataloaders['val']):
        input_ids = batch[0].to(device)
        labels = batch[0].to(device)
        masks = batch[1].to(device)

        with torch.no_grad():
            outputs = model(input_ids, labels=labels, attention_mask=masks)
        
            loss = outputs[0]

        batch_loss = loss.item()
        epoch_loss_val += batch_loss 

    print('Average train loss: {}'.format(epoch_loss_train/len(dataloaders['train'])))
    print('Average val loss: {}'.format(epoch_loss_val/len(dataloaders['val'])))
    torch.save(model.state_dict(), '/content/drive/My Drive/NLP_humor/models/GPT-2_jokes_v3.h5')

  0%|          | 0/1334 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

Average train loss: 0.9576396048906771
Average val loss: 0.4896374392658472


  0%|          | 0/1334 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

Average train loss: 0.6106684786592913
Average val loss: 0.5452208913862705


  0%|          | 0/1334 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

Average train loss: 0.8274211347594611
Average val loss: 0.7925609559118748


In [None]:
state_dict = torch.load('/content/drive/My Drive/NLP_humor/models/GPT-2_jokes_v3.h5')
model.load_state_dict(state_dict)

<All keys matched successfully>

In [None]:
model.eval()
generated = torch.tensor(tokenizer.encode('<SOS>')).unsqueeze(0)
generated = generated.to(device)

print(generated)

sample_outputs = model.generate(
                                generated, 
                                do_sample=True,   
                                top_k=50, 
                                max_length = 768,
                                top_p=0.95, 
                                num_return_sequences=3
                                )

for i, sample_output in enumerate(sample_outputs):
    print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True).replace('\n',' ')))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[50257]], device='cuda:0')
0:  the " you he " he " " gets " a got the the way it the it he was they the he " the my the can pe the " no was " can a he you the are was knows too it a a both next woman the both both the he both it the the the " the were some woman the the " he woman can the the the he they a you the the a me next the both can have both d he woman he the the man he the it the you " the the the the the not " both he the the woman was the the he the you woman " the the woman the the man the " the he both the no no the no the was spot you a both no the woman he bit d he can the theic the the t he a the a very " he the can he the me it both knows you the you pe very t no are very way the he can the he a gets he the way the can woman can it the the can the was some woman spot the the " it he the it the you woman the a the man he the spot the gets he gets we three way the the he no he both not the woman he method spot " both d " man d woman can he the way you the can th

### VAR 2

In [None]:
for epoch in range(2):
    epoch_loss_train = 0
    model.train()
    for i, batch in enumerate(tqdm(dataloaders['train'])):  
        input_ids = batch[0].to(device)
        labels = batch[0].to(device)
        masks = batch[1].to(device)

        outputs = model(input_ids, labels=labels, attention_mask=masks)
        
        loss = outputs[0]  

        batch_loss = loss.item()
        epoch_loss_train += batch_loss

        loss.backward()

        optimizer.step()

        scheduler.step() 
    
    model.eval()
    epoch_loss_val = 0

    for batch in tqdm(dataloaders['val']):
        input_ids = batch[0].to(device)
        labels = batch[0].to(device)
        masks = batch[1].to(device)

        with torch.no_grad():
            outputs = model(input_ids, labels=labels, attention_mask=masks)
        
            loss = outputs[0]

        batch_loss = loss.item()
        epoch_loss_val += batch_loss 

    print('Average train loss: {}'.format(epoch_loss_train/len(dataloaders['train'])))
    print('Average val loss: {}'.format(epoch_loss_val/len(dataloaders['val'])))
    torch.save(model.state_dict(), '/content/drive/My Drive/NLP_humor/models/GPT-2_jokes_v2.h5')

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

Average train loss: 0.8849961449988186
Average val loss: 0.4959643574357033


  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

Average train loss: 0.7763160339184105
Average val loss: 0.709457398712635


In [None]:
state_dict = torch.load('/content/drive/My Drive/NLP_humor/models/GPT-2_jokes_v2.h5')
model.load_state_dict(state_dict)

<All keys matched successfully>

In [None]:
model.eval()
generated = torch.tensor(tokenizer.encode('today')).unsqueeze(0)
generated = generated.to(device)

print(generated)

sample_outputs = model.generate(
                                generated, 
                                do_sample=True,   
                                top_k=50, 
                                max_length = 768,
                                top_p=0.95, 
                                num_return_sequences=3
                                )

for i, sample_output in enumerate(sample_outputs):
    print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True).replace('\n',' ')))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[40838]], device='cuda:0')
0: today. the I on The, for I the. have the. I. for with, with my, have.. have, The I I for,,,.,. have, man., show for and the.,,, for,.  have both, have both,,, the have have for,. with I,, have., an., show for for, The with with, 


1: today 


2: today section. for,. with with with  both.,,,,, I man. just, for for have. the both The. The. for I have with,,. with for The the The,, an both.. I, show made with with. an for my. for for,,, with., for, for, have I. I,, for have, the just. for, with I,. an,. and the, I for for have. on, the The.. for The..,.UR




In [None]:
model.eval()
generated = torch.tensor(tokenizer.encode('<SOS>')).unsqueeze(0)
generated = generated.to(device)

print(generated)

sample_outputs = model.generate(
                                generated, 
                                do_sample=True,   
                                top_k=50, 
                                max_length = 768,
                                top_p=0.95, 
                                num_return_sequences=3
                                )

for i, sample_output in enumerate(sample_outputs):
    print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True).replace('\n',' ')))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[50257]], device='cuda:0')
0:  a for the the., The the., the a. for just for,, they with, for,. I,.,, on not on I just I in., for I for., the the with The for the, to the for for The, for I for The for,. for.,., The I, the have for The, the,.,,,, have. The for,. the, I with. for., my for, the., in The. on with have the for for. have,,, for for with for, the with,, the the, The.. for.UR The with,. for for the. an I, I. for., for The with the. I for for., I, for for, I,, The. an., for.,,,, for an. the for with,, with for..,,, for The with an. long. both for for The for I for., The, The for,,,. forUR. The for.... The for The.. the have, on,, have for. the, for.. The,, for the with.. in, the for a. to, an I the,, for with., for the, the I, for for,,, for for I 


1:  What the I I., for,, with for, with with. The for for, The. have. an,, with The have The, for,, have,. have,, The for not not my. have the,. with The for the have., The for The have for for. for on, I.,, have. with for