In [1]:
import json
from pathlib import Path
from tqdm import tqdm_notebook
import psutil

import random
import numpy as np
import pandas as pd

import os
import time
import datetime


import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup

In [2]:
bos_token = '<|start|>'
ans_token = ' [ANSWER] '
eos_token = '<|end|>'
pad_token = '<|pad|>'

def generate_text(subs, comments):
    texts = []
    for sub, comment in zip(subs, comments):
        text = bos_token + sub + ans_token + comment + eos_token
        texts.append(text)
    return texts

In [3]:
%%time
df = pd.read_csv(os.path.join('data', 'joined_data_AskReddit.csv.gz'), compression = 'gzip', sep = ';',)

CPU times: user 37.3 s, sys: 1.73 s, total: 39 s
Wall time: 39 s


In [4]:
df.shape

(9423894, 14)

In [5]:
df.author_comment.value_counts()

AutoModerator         81947
throwawayohyesitis    16033
KatyLiedTheBitch       4597
Gundam336B             4362
Flashy_Gardener        4172
                      ...  
DwindlingGravitas         1
Slut4GhostStories         1
Pinfield357               1
IMainJannaxxx             1
DrJuli                    1
Name: author_comment, Length: 1233063, dtype: int64

In [6]:
df[df.author_comment == 'AutoModerator'].head()

Unnamed: 0,link_id,id_comment,author_comment,total_awards_received_comment,text_comment,permalink,score_comment,subreddit,author_submission,text_submission,id_submission,total_awards_received_submission,score_submission,num_comments
65156,t3_a1xult,eatlgg5,AutoModerator,,**Attention! [Serious] Tag Notice**\n\n* Jokes...,/r/AskReddit/comments/a1xult/serious_whats_a_g...,1.0,AskReddit,Brainiac03,[Serious] What's a good name for a travel blog?,a1xult,,1,4
66618,t3_a1xa9a,eatgqq0,AutoModerator,,**Attention! [Serious] Tag Notice**\n\n* Jokes...,/r/AskReddit/comments/a1xa9a/serious_redditors...,1.0,AskReddit,SMRNS2017,[Serious] Redditors who have been cheated on b...,a1xa9a,,3,8
70416,t3_a1xxsa,eatm68d,AutoModerator,,Hi there! Your post was removed because it use...,/r/AskReddit/comments/a1xxsa/have_yall_every_b...,1.0,AskReddit,damperfoot,"Have y'all every been friendzoned, be honest",a1xxsa,,1,1
109957,t3_a1xy7v,eatma1b,AutoModerator,,**PLEASE READ THIS MESSAGE IN ITS ENTIRETY BEF...,/r/AskReddit/comments/a1xy7v/worst_christmas_y...,2.0,AskReddit,Uatu_The_Watcher07,Worst Christmas you’ve ever had?,a1xy7v,,1,1
129503,t3_a1xysw,eatmexc,AutoModerator,,Hi there! Your post was removed because it use...,/r/AskReddit/comments/a1xysw/what_are_some_of_...,1.0,AskReddit,manutorrente,What are some of the funniest stories of your ...,a1xysw,,1,1


In [7]:
df = df[df.author_comment != 'AutoModerator']

In [8]:
df['score_rank'] = df.groupby('id_submission').score_comment.rank(ascending = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['score_rank'] = df.groupby('id_submission').score_comment.rank(ascending = False)


In [9]:
df[df.id_submission == 'a1x406']

Unnamed: 0,link_id,id_comment,author_comment,total_awards_received_comment,text_comment,permalink,score_comment,subreddit,author_submission,text_submission,id_submission,total_awards_received_submission,score_submission,num_comments,score_rank
0,t3_a1x406,eatm38u,DomIstKrieg,,Same. I would have missed some cool stuff.,/r/AskReddit/comments/a1x406/redditors_who_hav...,2.0,AskReddit,dorydoop,"Redditors who have come close to suicide, what...",a1x406,,5,41,19.5
1,t3_a1x406,eatm4v8,dorydoop,,Wow. That incredible. Did you get to know the ...,/r/AskReddit/comments/a1x406/redditors_who_hav...,2.0,AskReddit,dorydoop,"Redditors who have come close to suicide, what...",a1x406,,5,41,19.5
2,t3_a1x406,eatm5z0,dorydoop,,"So, are you doing better now?",/r/AskReddit/comments/a1x406/redditors_who_hav...,1.0,AskReddit,dorydoop,"Redditors who have come close to suicide, what...",a1x406,,5,41,36.5
3,t3_a1x406,eatmdu8,TastyAssBiscuit,,"Oh gosh, not a straightforward or easy questio...",/r/AskReddit/comments/a1x406/redditors_who_hav...,2.0,AskReddit,dorydoop,"Redditors who have come close to suicide, what...",a1x406,,5,41,19.5
4,t3_a1x406,eatmgip,rollerderbydyke,,Failed first attempt. Told myself I’d try agai...,/r/AskReddit/comments/a1x406/redditors_who_hav...,2.0,AskReddit,dorydoop,"Redditors who have come close to suicide, what...",a1x406,,5,41,19.5
5,t3_a1x406,eatmi3t,DomIstKrieg,,"Everytime i get a suicidal thought i think ""We...",/r/AskReddit/comments/a1x406/redditors_who_hav...,2.0,AskReddit,dorydoop,"Redditors who have come close to suicide, what...",a1x406,,5,41,19.5
6,t3_a1x406,eatmqmo,jimmyablow09,,When I was 23 I came home to find my wife with...,/r/AskReddit/comments/a1x406/redditors_who_hav...,3.0,AskReddit,dorydoop,"Redditors who have come close to suicide, what...",a1x406,,5,41,4.5
7,t3_a1x406,eatmtg9,dorydoop,,The power of a pinky promise. I'm sure your fr...,/r/AskReddit/comments/a1x406/redditors_who_hav...,2.0,AskReddit,dorydoop,"Redditors who have come close to suicide, what...",a1x406,,5,41,19.5
8,t3_a1x406,eatmwt9,dorydoop,,"So grateful for love at first sight, amirigt?",/r/AskReddit/comments/a1x406/redditors_who_hav...,2.0,AskReddit,dorydoop,"Redditors who have come close to suicide, what...",a1x406,,5,41,19.5
9,t3_a1x406,eatmz54,SleepNowMyThrowaway,,I did - we ended up dating :),/r/AskReddit/comments/a1x406/redditors_who_hav...,3.0,AskReddit,dorydoop,"Redditors who have come close to suicide, what...",a1x406,,5,41,4.5


In [13]:
df = pd.merge(df, 
              df.groupby('id_submission').score_rank.min().reset_index().rename(
                  columns = {'score_rank': 'min_rank'}),
              how = 'left', on = 'id_submission')

In [15]:
df.head(2).T

Unnamed: 0,0,1
link_id,t3_a1x406,t3_a1x406
id_comment,eatm38u,eatm4v8
author_comment,DomIstKrieg,dorydoop
total_awards_received_comment,,
text_comment,Same. I would have missed some cool stuff.,Wow. That incredible. Did you get to know the ...
permalink,/r/AskReddit/comments/a1x406/redditors_who_hav...,/r/AskReddit/comments/a1x406/redditors_who_hav...
score_comment,2,2
subreddit,AskReddit,AskReddit
author_submission,dorydoop,dorydoop
text_submission,"Redditors who have come close to suicide, what...","Redditors who have come close to suicide, what..."


In [18]:
df = pd.merge(df.drop('min_rank', axis = 1), 
              df[df.score_rank > df.min_rank].groupby('id_submission').score_rank.min().reset_index().rename(
                  columns = {'score_rank': 'min_rank'}),
              how = 'left', on = 'id_submission')

In [20]:
tmp = df[df.score_rank <= df.min_rank]

In [21]:
tmp.shape

(1096538, 16)

In [22]:
texts = generate_text(tmp.text_submission, tmp.text_comment)

In [23]:
train_size = int(0.8 * len(texts))
test_size = len(texts) - train_size
train_dataset, test_dataset = random_split(texts, [train_size, test_size])

In [24]:
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2', 
                                          bos_token=bos_token, 
                                          eos_token=eos_token, 
                                          pad_token=pad_token,)

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


In [25]:
class GPT2Dataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length=768):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []
        for txt in tqdm_notebook(txt_list):
            encodings_dict = tokenizer(txt, 
                                       truncation=True, 
                                       max_length=max_length, 
                                       padding="max_length")

            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))
            if psutil.virtual_memory().percent > 95:
                break
    
    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx] 

In [26]:
%%time
dataset = GPT2Dataset(train_dataset, tokenizer, max_length=1024)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for txt in tqdm_notebook(txt_list):


HBox(children=(FloatProgress(value=0.0, max=877230.0), HTML(value='')))


CPU times: user 4min 57s, sys: 13.5 s, total: 5min 11s
Wall time: 5min 10s


In [27]:
%%time
val_dataset = GPT2Dataset(test_dataset, tokenizer, max_length=1024)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for txt in tqdm_notebook(txt_list):


HBox(children=(FloatProgress(value=0.0, max=219308.0), HTML(value='')))


CPU times: user 1min 12s, sys: 3.3 s, total: 1min 15s
Wall time: 1min 15s


In [28]:
batch_size = 2

train_dataloader = DataLoader(
            dataset,
            sampler = RandomSampler(dataset),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_dataset,
            sampler = SequentialSampler(val_dataset),
            batch_size = batch_size
        )

In [29]:
model = GPT2LMHeadModel.from_pretrained("distilgpt2",)

In [30]:
model.resize_token_embeddings(len(tokenizer))

Embedding(50260, 768)

In [31]:
device = torch.device("cuda")
model.to(device)

seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [32]:
epochs = 3
learning_rate = 5e-4
warmup_steps = 1e2
epsilon = 1e-8

sample_every = 20000

optimizer = AdamW(model.parameters(),
                  lr = learning_rate,
                  eps = epsilon
                )

In [33]:
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = warmup_steps, 
                                            num_training_steps = total_steps)

def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))

In [None]:
total_t0 = time.time()

training_stats = []


for epoch_i in range(0, epochs):
    #               Training
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in tqdm_notebook(enumerate(train_dataloader)):
        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        model.zero_grad()        

        outputs = model(b_input_ids, 
                        labels = b_labels, 
                        attention_mask = b_masks, 
                        token_type_ids = None)

        loss = outputs[0]  

        batch_loss = loss.item()
        total_train_loss += batch_loss

        # Get sample every x batches.
        if step % sample_every == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(
                step, len(train_dataloader), batch_loss, elapsed))

            model.eval()

            sample_outputs = model.generate(
                bos_token_id=random.randint(1,30000),
                do_sample=True,   
                top_k=50, 
                max_length = 200,
                top_p=0.95, 
                num_return_sequences=1
            )
            for i, sample_output in enumerate(sample_outputs):
                  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
            
            model.train()

        loss.backward()
        optimizer.step()
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)       
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
        
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in tqdm_notebook(validation_dataloader):
        
        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)
        
        with torch.no_grad():        

            outputs  = model(b_input_ids, 
#                            token_type_ids=None, 
                             attention_mask = b_masks,
                            labels=b_labels)
          
            loss = outputs[0]  
            
        batch_loss = loss.item()
        total_eval_loss += batch_loss        

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    validation_time = format_time(time.time() - t0)    

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
    torch.save(model.state_dict(), 'distilgpt2_mln1train_ep' + str(epoch_i + 1) + '.ph')

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for step, batch in tqdm_notebook(enumerate(train_dataloader)):


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


  Batch 20,000  of  438,615. Loss: 0.1842157244682312.   Elapsed: 1:21:47.
0:  bipartisanWhat are some of the best way to make money? [ANSWER] Make a difference and take a loan for it 


Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


  Batch 40,000  of  438,615. Loss: 0.09464314579963684.   Elapsed: 2:43:23.
0:  increasingWhats your favorite food? [ANSWER] Pussy. You can try the same thing for yourself, and the general idea is too hard to get it.


Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


  Batch 60,000  of  438,615. Loss: 0.15913140773773193.   Elapsed: 4:04:42.
0: dayWhat is one thing you would wish your parents would do differently? [ANSWER] Ask someone why people don't like that and then I wish that I took them and all of my time.


Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


  Batch 80,000  of  438,615. Loss: 0.23847395181655884.   Elapsed: 5:26:30.
0:  HangWhat is your favorite Christmas gift? [ANSWER] A small house and a set of shoes with a bottle of paper, a small ring


Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


  Batch 100,000  of  438,615. Loss: 0.09185270965099335.   Elapsed: 6:48:37.
0:  foods[Serious] What are some things that you regret NOT doing in public everyday everyday life? [ANSWER] My last four years of college in a different state. My parents didn't work well because I was a kid. I was so busy with my kids who were doing my own job. While I had kids on call and a manager I thought I could turn me into a employee and tell me I was really interested in my job because I knew what I wanted to do. I was in this position when I was 13 and got a break up where we had other friends and my family decided to take my kids so I don't want to get a different job. I realized that I didn't have to choose to do that stuff and there's nothing going on with anyone in my life, but there's nothing wrong and I don't want to be doing anything. They're probably just like people.


Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


  Batch 120,000  of  438,615. Loss: 0.1633497029542923.   Elapsed: 8:11:01.
0:  trailWhat's the dumbest thing you've ever seen on youtube that you've seen? [ANSWER] The [Loo -](https://en.youtube.com/r/dN2cYzYcQ-M-Bq-Tz-mBw2-Btw���������������������������������������������������������������������������


Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


  Batch 140,000  of  438,615. Loss: 0.182831808924675.   Elapsed: 9:33:30.
0: intendHow did your favourite professor react when your teacher had failed an exam, but you had to tell them how to do? [ANSWER] If the teacher wanted to do something, it wouldn't hurt the teacher.


Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


  Batch 160,000  of  438,615. Loss: 0.5286455750465393.   Elapsed: 10:56:02.
0:  surroundWhat movie do you just love and appreciate despite being a failure in the end? [ANSWER] The ending. The one that got me really really into it.


Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


  Batch 180,000  of  438,615. Loss: 0.13171571493148804.   Elapsed: 12:18:34.
0:  reflexWhat is your favorite TV show and why? [ANSWER] Breaking Bad is great, its worth watching!


Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


  Batch 200,000  of  438,615. Loss: 0.15762880444526672.   Elapsed: 13:41:06.
0:  displayHave you ever been on reddit like this? What did it look like? [ANSWER] It was a post history. The comments were both political views.

https://en.m.wikipedia.org/wiki/Jokes


Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


  Batch 220,000  of  438,615. Loss: 0.12110526114702225.   Elapsed: 15:03:40.
0:  pastorTo those of you who suffered more than 5-10 weeks ago, how did you get over it and how did you find the change? [ANSWER] I worked with a manager, we had his way to study at college, so he worked there! He got through the course of my exam, and told me to just study his homework before we could find his place again.

We didn't realize he was working for 4 years, he just asked us where to put some papers from so our project could go out, he had to give us the test and he didn't want us to work so the last 5 years you can get there he still tells us all about it, but only that after you have it, he says he has no idea what he was thinking and was thinking, like the one who knew had a job I was using. So there's him asking how he could go back to class. Then after I had to sit, he tells us he wanted to


Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


  Batch 240,000  of  438,615. Loss: 0.1776961386203766.   Elapsed: 16:26:15.
0:  illicitPeople who have been to the theatre, what was it like? [ANSWER] You didn't get me wrong. I was on a seat in the theatre a couple years back in the early 1800s. 


Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


  Batch 260,000  of  438,615. Loss: 0.22357314825057983.   Elapsed: 17:48:47.
0:  LiberationFor those who went to school with their teachers, how was it like? Did you survive? How did it go? [ANSWER] My dad's parents and my grandmother had to give away the house and get out on the corner of my apartment for over 2 years. I was at a church camp and had a friend with a group of a guy who's a good friend with a girl who doesn't remember anything since they went home with his kids. The only issue is that he was the one and wasn't allowed to do anything and never wanted to try to make any more again. I've never had a friend with my mother anymore since. Never said I wanted to see him again. 

I was probably 15 and was so tired I couldn't imagine it in another room, but it wasn't a dream. I didn't know that her friend was dead because I had never heard the story, but I saw her, who was still alive. That was the dream.


Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


  Batch 280,000  of  438,615. Loss: 0.08846564590930939.   Elapsed: 19:11:11.
0:  NamWho's the next celebrity that you wish you could date? [ANSWER] Danny Devito for sure 


Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


  Batch 300,000  of  438,615. Loss: 0.13374905288219452.   Elapsed: 20:32:46.
0: IONWhat fictional characters would be actually really boring? [ANSWER] Oh yeah. Are you joking?


Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


  Batch 320,000  of  438,615. Loss: 0.1083112359046936.   Elapsed: 21:54:58.
0:  glimpse[Serious] What’s a story that you have a hidden meaning for? [ANSWER] It’s not the first time, just the one time, but the last minute in the last year I was in love with my girlfriend, and she was at her friend’s house at the time. I was like, “hey man, you’re doing fine?” I say. She starts talking my girlfriend and then talking to me, just a couple months later. She comes to me before she is in a relationship. She’s also the opposite of me, and also the opposite of her brother/ sister. She’s also the perfect best at the same time, so she has a very special place. I’m just never close to anyone or anything like that. She’s a very big fan of my country but is always on their own.


In [231]:
sample = 'Which song gives you goosebumps?'
sample = bos_token + sample + ans_token
encodings_dict = tokenizer(sample, truncation=True, max_length=1024, )
input_ids = torch.tensor(encodings_dict['input_ids']).to(device)
greedy_output = model.generate(input_ids.unsqueeze(0),                             
                               max_length = 140,
                               num_beams = 1,
                               no_repeat_ngram_size = 2, 
                               num_return_sequences = 1,
                               early_stopping=True)

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


In [232]:
for i, beam_output in enumerate(greedy_output):
    print("{}: {}".format(i, tokenizer.decode(beam_output, skip_special_tokens=True)))
    print()

0: Which song gives you goosebumps? [ANSWER] 

I'm not sure if this counts but I've heard it a few times. I'm sure it's a good song.] The song "The Night Before" by The Night before the end of the night. ] I think it was a really good album. It's just a beautiful song, but it has a lot of songs that I can't really explain.
 fortune is a very good one. The lyrics are a bit too much. But it is so beautiful. And it makes me feel like I have a great time. So I feel so good about it.





In [229]:
sample = "What was a dumb belief you had as a kid?"
sample = bos_token + sample + ans_token
encodings_dict = tokenizer(sample, truncation=True, max_length=1024, )
input_ids = torch.tensor(encodings_dict['input_ids']).to(device)
greedy_output = model.generate(input_ids.unsqueeze(0),                             
                               max_length = 60,
                               num_beams = 10,
                               no_repeat_ngram_size = 2, 
                               num_return_sequences = 3,
                               early_stopping=True)

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


In [230]:
for i, beam_output in enumerate(greedy_output):
    print("{}: {}".format(i, tokenizer.decode(beam_output, skip_special_tokens=True)))
    print()

0: What was a dumb belief you had as a kid? [ANSWER]  “I’m an adult.” 

I used to believe that when I was younger, I thought it would be a good idea to have a baby.  



1: What was a dumb belief you had as a kid? [ANSWER]  “I’m an adult.” 

I used to believe that when I was younger, I thought it would be a good idea to have a child.  



2: What was a dumb belief you had as a kid? [ANSWER]  “I’m an adult.” 

I used to believe that when I was younger, I thought it would be a good idea to have a child.  


It was

