In [1]:
import os
import json
import random
from glob import glob
import re
import numpy as np
import pandas as pd
import transformers

# Shared roots so that every dataset path is derived from one place.
_data_dir = os.path.join('..', 'data')
_prep_dir = os.path.join(_data_dir, 'prep')

# Raw inputs.
qa_jokes_filepath = os.path.join(_data_dir, 'qa_jokes.csv')
short_jokes_filepath = os.path.join(_data_dir, 'short_jokes.csv')
transcripts_path = os.path.join(_data_dir, 'transcripts')

# Preprocessed plain-text corpora written by the preprocessing cells.
qa_jokes_prep_outpath = os.path.join(_prep_dir, 'qa_jokes_gpt2.txt')
short_jokes_prep_outpath = os.path.join(_prep_dir, 'short_jokes_gpt2.txt')
transcripts_prep_outpath = os.path.join(_prep_dir, 'transcripts_gpt2.txt')

## Preprocess data

In [7]:
def fix_encoding(s):
    """Round-trip ``s`` through Latin-1 bytes and UTF-8 text, dropping failures.

    The string is first encoded as Latin-1; characters that have no Latin-1
    representation are discarded. The resulting bytes are then decoded as
    UTF-8; byte sequences that are not valid UTF-8 are discarded.

    Consequences: plain ASCII passes through unchanged, UTF-8 text that was
    mis-read as Latin-1 is restored, and a lone non-ASCII Latin-1 character
    is removed because its single byte is not valid UTF-8.
    """
    latin1_bytes = s.encode('latin1', errors='ignore')
    return latin1_bytes.decode('utf8', errors='ignore')

def write_to_file(file_path, text):
    """Write ``text`` to ``file_path`` as UTF-8, creating parent directories.

    Args:
        file_path: Destination path. Missing parent directories are created.
            A bare file name (no directory part) is written to the current
            working directory.
        text: String to write. An existing file is overwritten.
    """
    parent_dir = os.path.dirname(file_path)
    # os.makedirs('') raises FileNotFoundError, so skip it for bare file names.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # Be explicit about the encoding: the platform default (for example cp1252
    # on Windows) can raise UnicodeEncodeError on non-ASCII text or produce a
    # file that is not valid UTF-8.
    with open(file_path, 'w', encoding='utf8') as out_file:
        out_file.write(text)

# Marker placed in front of every document; it is empty, so nothing is prepended.
START_DOC_TOKEN = ''
# Marker placed after every document in the generated corpora.
END_DOC_TOKEN = '<|endoftext|>'

### QA Jokes
Taken from [here](https://www.kaggle.com/jiriroz/qa-jokes).

Cleaned of noisy/non-representable data (notes, already-inserted "Q"/"A" tags).

In [3]:
# Load the QA jokes CSV and print a summary of its columns, dtypes and size.
qa_jokes = pd.read_csv(qa_jokes_filepath)
qa_jokes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38234 entries, 0 to 38233
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ID        38234 non-null  int64 
 1   Question  38234 non-null  object
 2   Answer    38234 non-null  object
dtypes: int64(1), object(2)
memory usage: 896.2+ KB


In [4]:
# Render each row (ID, Question, Answer) as one tagged document, clean its
# encoding, and concatenate all documents into a single training corpus.
qa_corpus = ''.join(
    fix_encoding(f'{START_DOC_TOKEN}[QUESTION] {question}\n[ANSWER] {answer}\n{END_DOC_TOKEN}\n')
    for _, question, answer in qa_jokes.values
)

write_to_file(qa_jokes_prep_outpath, qa_corpus)

### Transcripts
Scraped dataset of stand-up transcripts from [scrapsfromtheloft.com](https://scrapsfromtheloft.com).

In [5]:
# Load every transcript and wrap it in the document delimiter tokens.
# glob() returns paths in arbitrary order, so the list is sorted to make the
# resulting corpus identical between runs and machines.
transcript_docs = []
for file_path in sorted(glob(os.path.join(transcripts_path, '*'))):
    with open(file_path, 'r', encoding='utf8') as in_file:
        transcript_docs.append(START_DOC_TOKEN + in_file.read() + END_DOC_TOKEN + '\n')

# NOTE(review): fix_encoding is deliberately kept applied twice, as before;
# presumably this undoes text that was mis-encoded twice - confirm.
transcript_corpus = fix_encoding(fix_encoding(''.join(transcript_docs)))

# Save all transcripts together as one dataset file.
write_to_file(transcripts_prep_outpath, transcript_corpus)

### Short Jokes
Dataset taken from [here](https://www.kaggle.com/abhinavmoudgil95/short-jokes).

Also cleaned up (Twitter tags, words like f@ck/@sshole, samples containing links).

In [6]:
# Load the short jokes CSV and print a summary of its columns, dtypes and size.
short_jokes = pd.read_csv(short_jokes_filepath)
short_jokes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230975 entries, 0 to 230974
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   ID      230975 non-null  int64 
 1   Joke    230975 non-null  object
dtypes: int64(1), object(1)
memory usage: 3.5+ MB


In [7]:
# Find QA jokes in short jokes: collect the row positions of jokes that look
# like a question followed by a short answer.
from nltk.tokenize import sent_tokenize


def _is_qa_joke(joke):
    """Return True for a 2-3 sentence joke whose first sentence ends with '?'."""
    sentences = sent_tokenize(joke.strip())
    return 1 < len(sentences) < 4 and sentences[0][-1] == '?'


qa_jokes_in_short_jokes = [
    position
    for position, (_, joke) in enumerate(short_jokes.values)
    if _is_qa_joke(joke)
]

In [8]:
# Show one randomly chosen joke from the detected QA-style jokes.
ind = np.random.randint(len(qa_jokes_in_short_jokes))
sample_position = qa_jokes_in_short_jokes[ind]
short_jokes.iloc[sample_position]['Joke']

'What do you call a black man walking down the street? A pedestrian.'

In [9]:
# Add the detected QA-style short jokes to the QA corpus in the tagged format.
# NOTE: qa_corpus is extended in place, so running this cell twice appends
# the same jokes twice.
def _as_qa_document(joke):
    """Tag the first sentence as the question and the remaining ones as the answer."""
    sentences = sent_tokenize(joke.strip())
    question = sentences[0]
    answer = ' '.join(sentences[1:])
    return fix_encoding(f'{START_DOC_TOKEN}[QUESTION] {question}\n[ANSWER] {answer}\n{END_DOC_TOKEN}\n')


qa_corpus += ''.join(
    _as_qa_document(joke)
    for _, joke in short_jokes.iloc[qa_jokes_in_short_jokes].values
)

write_to_file(qa_jokes_prep_outpath, qa_corpus)

In [10]:
# Remove the jokes that were moved to the QA corpus. The collected values are
# row positions; they are used as index labels here, which matches only while
# short_jokes still has the default 0..n-1 index from read_csv.
short_jokes = short_jokes.drop(index=qa_jokes_in_short_jokes)

In [11]:
# One document per remaining short joke: stripped text, newline, end token.
short_jokes_corpus = ''.join(
    fix_encoding(f'{START_DOC_TOKEN}{joke.strip()}\n{END_DOC_TOKEN}\n')
    for _, joke in short_jokes.values
)

write_to_file(short_jokes_prep_outpath, short_jokes_corpus)

## Train

In [28]:
def create_cmd_command(python_path, script, kwargs, flags):
    """Build a command line that runs ``script`` with ``python_path``.

    Args:
        python_path: Path of the Python interpreter to invoke.
        script: Script to run.
        kwargs: Mapping rendered as ``--key=value`` options, in iteration order.
        flags: Iterable of names rendered as bare ``--flag`` switches.

    Returns:
        The command as a single string with parts separated by exactly one
        space; no stray spaces are produced when ``kwargs`` or ``flags`` is
        empty. Values are inserted verbatim (no shell quoting is applied), so
        callers must quote values that contain spaces themselves.
    """
    parts = [str(python_path), str(script)]
    parts.extend(f'--{key}={value}' for key, value in kwargs.items())
    parts.extend(f'--{flag}' for flag in flags)
    return ' '.join(parts)

# NOTE: machine-specific interpreter path; adjust it for your environment.
python_path = r'C:\Users\Alex\Anaconda3\envs\pytorch\python.exe'
script = r'run_language_modeling.py'

# Options rendered as --key=value on the training command line.
train_kwargs = dict(
    model_type='gpt2',  # gpt2, ctrl, openai-gpt, xlnet, transfo-xl, xlm
    model_name_or_path='gpt2',
    output_dir='output',
    block_size=512,
    learning_rate=1e-6,
    num_train_epochs=3,
    per_gpu_train_batch_size=2,
    gradient_accumulation_steps=4,
    save_steps=1000,
    # max_steps=20000,
)

# set CUDA_VISIBLE_DEVICES=1

# Model names / output directories, annotated with
# dataset, learning rate, epochs of the run that produced them.
train_outputs = [
    'gpt2',      # initial value of model_name_or_path
    'output',    # Transcripts 1e-5, 3
    'output_1',  # Transcripts 1e-6, 1
    'output_2',  # short_jokes 1e-5, 2
    'output_3',  # short_jokes 1e-6, 2
    'output_4',  # qa_jokes    1e-6, 3
    'output_5',  # qa_jokes    1e-5, 2
    'output_6',  # qa_jokes    1e-6, 4
    'output_7',  # qa_jokes    1e-6, 3 grad_acc 4
]

# Bare --flag switches; disabled options: overwrite_output_dir, fp16.
train_flags = ['do_train']

In [29]:
# Select the corpus to fine-tune on; the commented lines are the alternatives.
# train_kwargs['train_data_file'] = transcripts_prep_outpath
# train_kwargs['train_data_file'] = short_jokes_prep_outpath
train_kwargs['train_data_file'] = qa_jokes_prep_outpath

In [30]:
# Start from the entry at index 7 of train_outputs ('output_6') and write the
# new run to the entry at index 8 ('output_7').
train_kwargs['model_name_or_path'] = train_outputs[7]
train_kwargs['output_dir'] = train_outputs[8]

In [31]:
# Assemble the training command line from the settings above and print it.
cmd_command = create_cmd_command(python_path, script, train_kwargs, train_flags)
print(cmd_command)

C:\Users\Alex\Anaconda3\envs\pytorch\python.exe run_language_modeling.py --model_type=gpt2 --model_name_or_path=output_6 --output_dir=output_7 --block_size=512 --learning_rate=1e-06 --num_train_epochs=3 --per_gpu_train_batch_size=2 --gradient_accumulation_steps=4 --save_steps=1000 --train_data_file=..\data\prep\qa_jokes_gpt2.txt --do_train


### Generate

In [44]:
gen_script = r'run_generation.py'

# Options rendered as --key=value on the generation command line. The model
# type and location are taken from the training settings.
generate_kwargs = dict(
    model_type=train_kwargs['model_type'],
    model_name_or_path=train_kwargs['output_dir'],
    prompt=rf'"{START_DOC_TOKEN}[QUESTION]"',
    # prompt='"The reddit enters a bar"',
    length=100,
    stop_token=f'"{END_DOC_TOKEN}"',
    temperature=0.9,  # temperature of 1.0 has no effect, lower tend toward greedy sampling
    # repetition_penalty=1.1,  # primarily useful for CTRL model; in that case, use 1.2
    k=50,
    p=0.95,
    # padding_text='',  # Padding text for Transfo-XL and XLNet.
    num_return_sequences=20,
)
gen_flags = []

# Assemble the generation command line and print it.
cmd_command = create_cmd_command(python_path, gen_script, generate_kwargs, gen_flags)
print(cmd_command)

C:\Users\Alex\Anaconda3\envs\pytorch\python.exe run_generation.py --model_type=gpt2 --model_name_or_path=output_7 --prompt="[QUESTION]" --length=100 --stop_token="<|endoftext|>" --temperature=0.9 --k=50 --p=0.95 --num_return_sequences=20 
