In [5]:
import os
import json
import random
from glob import glob
import re
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import transformers

# Directory layout shared by every dataset: raw files sit in ../data,
# GPT-2 ready text corpora are written to ../data/prep.
DATA_DIR = os.path.join('..', 'data')
PREP_DIR = os.path.join(DATA_DIR, 'prep')

# English datasets: raw inputs first, prepared outputs second.
qa_jokes_filepath = os.path.join(DATA_DIR, 'qa_jokes.csv')
short_jokes_filepath = os.path.join(DATA_DIR, 'short_jokes.csv')
transcripts_path = os.path.join(DATA_DIR, 'transcripts')

qa_jokes_prep_outpath = os.path.join(PREP_DIR, 'qa_jokes_gpt2.txt')
short_jokes_prep_outpath = os.path.join(PREP_DIR, 'short_jokes_gpt2.txt')
transcripts_prep_outpath = os.path.join(PREP_DIR, 'transcripts_gpt2.txt')

# Russian datasets: raw inputs first, prepared outputs second.
rus_qa_jokes_filepath = os.path.join(DATA_DIR, 'rus_qa_jokes.csv')
rus_jokes_filepath = os.path.join(DATA_DIR, 'rus_jokes.csv')
rus_stories_filepath = os.path.join(DATA_DIR, 'anekdot_stories.csv')

rus_qa_jokes_prep_outpath = os.path.join(PREP_DIR, 'rus_qa_jokes_gpt2.txt')
rus_jokes_prep_outpath = os.path.join(PREP_DIR, 'rus_jokes_gpt2.txt')
rus_stories_prep_outpath = os.path.join(PREP_DIR, 'rus_stories_gpt2.txt')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Preprocess data

In [6]:
def fix_encoding(s):
    """Return `s` without the characters that UTF-8 cannot represent.

    The string is round-tripped through UTF-8 with errors ignored, so any
    character that fails to encode (or bytes that fail to decode) is dropped
    instead of raising.
    """
    utf8_bytes = s.encode('utf-8', 'ignore')
    return utf8_bytes.decode('utf8', 'ignore')

# TODO: Also handle leftover HTML entities: &nbsp;  &gt;  &lt;
# Ordered (compiled pattern, replacement) pairs; fix_text applies them one
# after another, so the order below matters.
regexps = [
    # Regexp for the special chars
    (re.compile('♦'), '*'),
    # Replace multiple newlines (optionally separated by space-only lines)
    # with one. The "spaces + newline" part has to repeat: a plain '\n *\n'
    # only merges newlines pairwise, so three in a row would still leave two.
    (re.compile(r'\n(?: *\n)+'), '\n'),
    # Replace multiple spaces with one
    (re.compile(r' {2,}'), ' '),
]

def fix_text(s):
    """Clean one text sample.

    Applies every (pattern, replacement) pair from `regexps` in order, trims
    surrounding whitespace, and finally passes the result through
    `fix_encoding`.
    """
    cleaned = s
    for pattern, replacement in regexps:
        cleaned = pattern.sub(replacement, cleaned)
    cleaned = cleaned.strip()
    return fix_encoding(cleaned)

def write_to_file(file_path, text, encoding=None):
    """Write `text` to `file_path`, creating missing parent directories.

    Args:
        file_path: Destination path. An existing file is overwritten.
        text: String content to write.
        encoding: Text encoding passed to `open`. `None` means the platform
            default, so pass 'utf-8' explicitly for non-ASCII corpora.
    """
    parent_dir = os.path.dirname(file_path)
    # A bare file name has an empty directory part, and os.makedirs('')
    # raises FileNotFoundError, so only create directories when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, 'w', encoding=encoding) as out_file:
        out_file.write(text)

# Delimiters wrapped around every sample in the prepared corpora.
START_DOC_TOKEN = ''  # empty string: nothing is prepended to a sample
END_DOC_TOKEN = '<|endoftext|>'  # appended to every sample; also passed as the generation stop token

### QA Jokes
Taken from [here](https://www.kaggle.com/jiriroz/qa-jokes).

Cleaned of noisy/non-representable data (notes, already-inserted "Q"/"A" tags).

In [3]:
# Load the English QA jokes CSV and print its column / dtype summary.
qa_jokes = pd.read_csv(qa_jokes_filepath)
qa_jokes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38232 entries, 0 to 38231
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ID        38232 non-null  int64 
 1   Question  38232 non-null  object
 2   Answer    38232 non-null  object
dtypes: int64(1), object(2)
memory usage: 896.2+ KB


In [63]:
# One document per joke: tagged question, tagged answer, end-of-document token.
qa_docs = [
    f'{START_DOC_TOKEN}[QUESTION] {question}\n[ANSWER] {answer}\n{END_DOC_TOKEN}'
    for _, question, answer in qa_jokes.values
]

# Clean every document, then put the documents on consecutive lines.
qa_corpus = '\n'.join(fix_text(doc) for doc in qa_docs)

write_to_file(qa_jokes_prep_outpath, qa_corpus, encoding='utf-8')

### Transcripts
Scraped dataset of stand-up transcripts from [scrapsfromtheloft.com](https://scrapsfromtheloft.com).

In [65]:
transcript_corpus = []
# Load transcripts. Paths are sorted so the corpus order is the same on every
# run (glob itself gives no ordering guarantee).
for file_path in sorted(glob(os.path.join(transcripts_path, '*'))):
    with open(file_path, 'r', encoding='utf8') as in_file:
        # Read the file once; the text is needed both for the check below
        # and for the corpus entry.
        text = in_file.read()
    # Report transcripts that still contain links or media file names.
    if re.search(r'html|http|jpe?g|png|mp4', text):
        print('Has http|html in it:', file_path)
    transcript_corpus.append(START_DOC_TOKEN + text + END_DOC_TOKEN)

# Clean each transcript and join them, one document after another.
transcript_corpus = '\n'.join(map(fix_text, transcript_corpus))

# Save all them as dataset.
write_to_file(transcripts_prep_outpath, transcript_corpus, encoding='utf-8')

### Short Jokes
Dataset taken from [here](https://www.kaggle.com/abhinavmoudgil95/short-jokes).

Also cleaned up (Twitter tags, f@ck/@sshole words, samples containing links).

In [4]:
# Load the short jokes CSV and print its column / dtype summary.
short_jokes = pd.read_csv(short_jokes_filepath)
short_jokes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230974 entries, 0 to 230973
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   ID      230974 non-null  int64 
 1   Joke    230974 non-null  object
dtypes: int64(1), object(1)
memory usage: 3.5+ MB


In [5]:
# Find QA jokes in short jokes
from nltk.tokenize import sent_tokenize

# Pair every row position with the sentences of its joke.
tokenized_jokes = (
    (position, sent_tokenize(joke.strip()))
    for position, (_, joke) in enumerate(short_jokes.values)
)

# Keep positions of jokes with 2-3 sentences whose first sentence is a question.
qa_jokes_in_short_jokes = [
    position
    for position, sentences in tokenized_jokes
    if 1 < len(sentences) < 4 and sentences[0][-1] == '?'
]

In [6]:
# Pick one of the detected QA-style jokes at random and display its text.
ind = np.random.randint(len(qa_jokes_in_short_jokes))
random_row_position = qa_jokes_in_short_jokes[ind]
short_jokes.iloc[random_row_position].Joke

'What do you call a Muslim organization that rejects Muhammed? A non-prophet'

In [None]:
# Split every detected QA-style short joke into its question (first sentence)
# and its answer (the remaining sentences), collected column-wise.
jokes = {'Question': [], 'Answer': []}
# Iterate over ALL detected rows (a list of positions), not a single row.
for _, joke in short_jokes.iloc[qa_jokes_in_short_jokes].values:
    sentences = sent_tokenize(joke.strip())
    question, answer = sentences[0], ' '.join(sentences[1:])
    jokes['Question'].append(question)
    jokes['Answer'].append(answer)

In [71]:
# Add them to qa jokes
# Must run before the detected rows are dropped from short_jokes, because
# qa_jokes_in_short_jokes holds row positions of the undropped frame.
qa_docs_from_short_jokes = []
for _, joke in short_jokes.iloc[qa_jokes_in_short_jokes].values:
    sentences = sent_tokenize(joke.strip())
    question, answer = sentences[0], ' '.join(sentences[1:])
    # Same document layout and cleaning (fix_text) as the rest of the QA corpus.
    qa_docs_from_short_jokes.append(
        fix_text(f'{START_DOC_TOKEN}[QUESTION] {question}\n[ANSWER] {answer}\n{END_DOC_TOKEN}')
    )

# Build a new string instead of appending to qa_corpus in place, so re-running
# this cell does not duplicate the added jokes. Joining with '\n' also puts a
# separator between the last original document and the first added one.
qa_corpus_extended = '\n'.join([qa_corpus] + qa_docs_from_short_jokes)

# Write as UTF-8 explicitly: the same file was first written as UTF-8, and
# without `encoding` open() falls back to the platform default.
write_to_file(qa_jokes_prep_outpath, qa_corpus_extended, encoding='utf-8')

In [72]:
# Delete found qa in short
# qa_jokes_in_short_jokes holds row positions (from enumerate) while drop()
# expects index labels; translate positions to labels so the right rows are
# removed even when the index is not 0..n-1.
short_jokes = short_jokes.drop(short_jokes.index[qa_jokes_in_short_jokes])

In [73]:
# One document per remaining joke: stripped text, newline, end-of-document
# token, newline. Joined in a single pass instead of repeated string `+=`.
short_jokes_corpus = ''.join(
    fix_encoding(f'{START_DOC_TOKEN}{joke.strip()}\n{END_DOC_TOKEN}\n')
    for _, joke in short_jokes.values
)

# Write as UTF-8 explicitly; without `encoding` open() falls back to the
# platform default, which may fail on or re-encode non-ASCII characters.
write_to_file(short_jokes_prep_outpath, short_jokes_corpus, encoding='utf-8')

## Russian Stories

In [149]:
# Load the Russian stories CSV (first column is the index) and print its summary.
rus_stories = pd.read_csv(rus_stories_filepath, index_col=0)
rus_stories.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110529 entries, 0 to 110528
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Text    110529 non-null  object
dtypes: object(1)
memory usage: 1.7+ MB


In [155]:
# Clean each story (first value of every row), wrap it in the document tokens,
# and join the documents with newlines.
rus_story_docs = (
    START_DOC_TOKEN + fix_text(row[0]) + END_DOC_TOKEN
    for row in rus_stories.values
)
rus_stories_corpus = '\n'.join(rus_story_docs)

# Persist the whole corpus as one UTF-8 text file.
write_to_file(rus_stories_prep_outpath, rus_stories_corpus, encoding='utf-8')

## Russian Jokes

In [151]:
# Load the Russian jokes CSV (first column is the index) and print its summary.
rus_jokes = pd.read_csv(rus_jokes_filepath, index_col=0)
rus_jokes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 439057 entries, 0 to 439056
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Text    439057 non-null  object
dtypes: object(1)
memory usage: 6.7+ MB


In [152]:
# Display the frame; pandas renders a truncated preview of the rows.
rus_jokes

Unnamed: 0,Text
0,Один хороший анекдот — это дополнительные 15 м...
1,Старик и старуха в суде. Судья: — Почему разво...
2,"Х♦♦ — железо, пока горячо."
3,"— Сколько нужно вагонов, чтобы вывезти всех де..."
4,Женщина: в 20 лет — лепестки розы в 30 лет — с...
...,...
439052,"Вот было бы так: поел, завалился на диван, сп..."
439053,Мишустин доездился без пропуска - вот и заболе...
439054,Напоследок Мишустин из больницы пообещал сдела...
439055,План следующего выступления: 1.Ситуация с коро...


In [156]:
# Clean each joke (first value of every row), wrap it in the document tokens,
# and join the documents with newlines.
rus_joke_docs = (
    START_DOC_TOKEN + fix_text(row[0]) + END_DOC_TOKEN
    for row in rus_jokes.values
)
rus_jokes_corpus = '\n'.join(rus_joke_docs)

# Persist the whole corpus as one UTF-8 text file.
write_to_file(rus_jokes_prep_outpath, rus_jokes_corpus, encoding='utf-8')

### Russian QA Jokes

In [135]:
# Load the Russian QA jokes CSV and print its column / dtype summary.
rus_qa_jokes = pd.read_csv(rus_qa_jokes_filepath)
rus_qa_jokes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67563 entries, 0 to 67562
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  67563 non-null  int64 
 1   Question    67563 non-null  object
 2   Answer      67563 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.5+ MB


In [136]:
# One cleaned document per joke (tagged question, tagged answer,
# end-of-document token), joined with newlines.
rus_qa_corpus = '\n'.join(
    fix_text(f'{START_DOC_TOKEN}[ ВОПРОС] {question}\n[ ОТВЕТ] {answer}\n{END_DOC_TOKEN}')
    for _, question, answer in rus_qa_jokes.values
)

write_to_file(rus_qa_jokes_prep_outpath, rus_qa_corpus, encoding='utf-8')

## Train

In [1]:
def create_cmd_command(python_path, script, kwargs, flags):
    """Build the command line that runs `script` with the given options.

    Args:
        python_path: Path of the Python interpreter to invoke.
        script: Path of the script to run.
        kwargs: Mapping of option name to value; each entry is rendered as
            `--name=value`. Values are inserted verbatim, so a value that
            needs shell quoting must already contain its quotes.
        flags: Iterable of option names rendered as bare `--name` switches.

    Returns:
        The command as a single string with parts separated by one space,
        without stray spaces when `kwargs` or `flags` is empty.
    """
    parts = [str(python_path), str(script)]
    parts.extend(f'--{k}={v}' for k, v in kwargs.items())
    parts.extend(f'--{f}' for f in flags)
    return ' '.join(parts)

In [17]:
# Interpreter and script used to launch fine-tuning.
python_path = r'C:\Users\Alex\Anaconda3\envs\pytorch\python.exe'
script = r'run_lm_finetuning.py'

# Options rendered as --name=value on the command line.
train_kwargs = dict(
    model_type='gpt2',  # gpt2, ctrl, openai-gpt, xlnet, transfo-xl, xlm
    model_name_or_path='gpt2',
    output_dir='output',
    block_size=512,
    learning_rate=1e-6,
    num_train_epochs=3,
    per_gpu_train_batch_size=2,
    gradient_accumulation_steps=8,
    save_steps=1000,
    # max_steps=20000,
)

# set CUDA_VISIBLE_DEVICES=1

# Model names / run directories; the trailing notes record each run's settings.
train_outputs = [
    'gpt2',
    'output',    # Transcripts 1e-6, 5 grad_acc 4
    'output_1',  # Transcripts 1e-5, 2 grad_acc 8
    'output_2',  # short_jokes 1e-5, 2 grad_acc 8
    'output_3',  # short_jokes 1e-6, 5 grad_acc 8
    'output_4',  # short_jokes 1e-7, 2 grad_acc 2
    'output_5',  # qa_jokes    1e-5, 3 grad_acc 8 - most funny yet
    'output_6',  # qa_jokes    1e-5, 3 grad_acc 4
    'output_7',  # qa_jokes    1e-6, 2 grad_acc 2
    'output_8',  # qa_jokes    1e-6, 10 grad_acc 8
]

# Options rendered as bare --name switches ('fp16' is currently disabled).
train_flags = ['do_train', 'overwrite_output_dir']

In [18]:
# Choose which prepared corpus to fine-tune on; the alternatives stay commented out.
# train_kwargs['train_data_file'] = transcripts_prep_outpath
# train_kwargs['train_data_file'] = short_jokes_prep_outpath
train_kwargs['train_data_file'] = qa_jokes_prep_outpath

In [19]:
# Start from the first entry of train_outputs and save under models/<second entry>.
source_model, target_run = train_outputs[0], train_outputs[1]
train_kwargs['model_name_or_path'] = source_model
# To resume from a local run instead:
# train_kwargs['model_name_or_path'] = os.path.join('models', train_outputs[0])
train_kwargs['output_dir'] = os.path.join('models', target_run)
print('From:', train_kwargs['model_name_or_path'], '\nTo:', train_kwargs['output_dir'])

From: gpt2 
To: models\output


In [20]:
# Assemble the fine-tuning command line and print it.
cmd_command = create_cmd_command(python_path, script, train_kwargs, train_flags)
print(cmd_command)

C:\Users\Alex\Anaconda3\envs\pytorch\python.exe ru_transformers-master\run_lm_finetuning.py --model_type=gpt2 --model_name_or_path=gpt2 --output_dir=models\output --block_size=512 --learning_rate=1e-06 --num_train_epochs=3 --per_gpu_train_batch_size=2 --gradient_accumulation_steps=8 --save_steps=1000 --train_data_file=..\data\prep\qa_jokes_gpt2.txt --do_train --overwrite_output_dir


### Generate

In [132]:
gen_script = r'run_generation.py'

# Generation options; the model type and directory come from the training config.
generate_kwargs = dict(
    model_type=train_kwargs['model_type'],
    model_name_or_path=train_kwargs['output_dir'],
    prompt=f'"{START_DOC_TOKEN}[QUESTION]"',
    # prompt='"The reddit enters a bar"',
    length=100,
    stop_token=f'"{END_DOC_TOKEN}"',
    temperature=0.9,  # temperature of 1.0 has no effect, lower tend toward greedy sampling
    repetition_penalty=1.05,  # primarily useful for CTRL model; in that case, use 1.2
    k=50,
    p=0.95,
    # padding_text='',  # Padding text for Transfo-XL and XLNet.
    num_return_sequences=40,
)
gen_flags = []

# Assemble the generation command line and print it.
cmd_command = create_cmd_command(python_path, gen_script, generate_kwargs, gen_flags)
print(cmd_command)

C:\Users\Alex\Anaconda3\envs\pytorch\python.exe run_generation.py --model_type=gpt2 --model_name_or_path=models\rus_2 --prompt="[QUESTION]" --length=100 --stop_token="<|endoftext|>" --temperature=0.9 --repetition_penalty=1.05 --k=50 --p=0.95 --num_return_sequences=40 


## ru_transformers
https://github.com/mgrankin/ru_transformers

In [177]:
# Interpreter and script used to launch fine-tuning.
python_path = r'C:\Users\Alex\Anaconda3\envs\pytorch\python.exe'
script = r'run_lm_finetuning.py'

# Options rendered as --name=value on the command line.
train_kwargs = dict(
    model_type='gpt2-yttm',  # gpt2, ctrl, openai-gpt, xlnet, transfo-xl, xlm
    model_name_or_path='gpt2',
    output_dir='output',
    block_size=512,
    learning_rate=5e-7,
    num_train_epochs=5,
    per_gpu_train_batch_size=2,
    gradient_accumulation_steps=16,
    save_steps=1000,
    save_total_limit=3,
    logging_steps=10,
    warmup_samples=500,
    unfreeze_level=-1,
    # max_steps=20000,
)

# Checkpoint / run directories; the trailing notes record each run's settings.
train_outputs = [
    r'ru_gpt2/s_checkpoint-1900000',  # 'ru_gpt2\m_checkpoint-3364613'
    'rus_test_1',  # rus_jokes 1 1e-4 16 unfreeze 0
    'rus_test_2',  # rus_jokes 1 5e-5 16 unfreeze 1
    'rus_test_3',  # rus_jokes 1 5e-5 16 unfreeze 2
    'rus_test_4',  # rus_jokes 1 1e-4 16 unfreeze 7
    'rus_test_5',  # rus_jokes 2 5e-6 16 unfreeze -1
    'rus_test_6',  # rus_qa_jokes
    'rus_test_7',  # rus_qa_jokes
]

# Options rendered as bare --name switches ('fp16' is currently disabled).
train_flags = ['do_train', 'overwrite_output_dir', 'lr_decay']

In [178]:
# Choose which prepared Russian corpus to fine-tune on; the alternatives stay commented out.
# train_kwargs['train_data_file'] = rus_stories_prep_outpath
# train_kwargs['train_data_file'] = rus_jokes_prep_outpath
train_kwargs['train_data_file'] = rus_qa_jokes_prep_outpath
print(train_kwargs['train_data_file'])

..\data\prep\rus_qa_jokes_gpt2.txt


In [179]:
# Continue from an earlier run's directory under models/ and save into the next run's directory.
# train_kwargs['model_name_or_path'] = train_outputs[0]
train_kwargs['model_name_or_path'] = os.path.join('models', train_outputs[6])

# train_kwargs['tokenizer_name'] = train_kwargs['tokenizer_name'].format(train_kwargs['model_name_or_path'])
train_kwargs['output_dir'] = os.path.join('models', train_outputs[7])
# Print the source and destination so the run can be checked before launching.
print('From:', train_kwargs['model_name_or_path'], '\nTo:', train_kwargs['output_dir'])

From: models\rus_test_6 
To: models\rus_test_7


In [180]:
# Assemble the fine-tuning command line and print it.
cmd_command = create_cmd_command(python_path, script, train_kwargs, train_flags)
print(cmd_command)

C:\Users\Alex\Anaconda3\envs\pytorch\python.exe ru_transformers-master\run_lm_finetuning.py --model_type=gpt2-yttm --model_name_or_path=models\rus_test_6 --output_dir=models\rus_test_7 --block_size=512 --learning_rate=5e-07 --num_train_epochs=5 --per_gpu_train_batch_size=2 --gradient_accumulation_steps=16 --save_steps=1000 --save_total_limit=3 --logging_steps=10 --warmup_samples=500 --unfreeze_level=-1 --train_data_file=..\data\prep\rus_qa_jokes_gpt2.txt --do_train --overwrite_output_dir --lr_decay


In [None]:
    # Extra evaluation options for the fine-tuning script, kept for reference.
    # These are raw command-line flags, not Python, so they are commented out
    # to keep this cell from raising a SyntaxError when the notebook is run:
    #   --do_eval \
    #   --evaluate_during_training \
    #   --eval_steps 1000 \
    #   --eval_data_file=./data/classic/valid \

#### Generate

In [24]:
gen_script = r'run_generation.py'

# Generation options; the model directory comes from the training config.
generate_kwargs = dict(
    model_type='gpt2-yttm',
    model_name_or_path=train_kwargs['output_dir'],
    length=200,
    temperature=0.9,  # temperature of 1.0 has no effect, lower tend toward greedy sampling
    stop_token=f'"{END_DOC_TOKEN}"',
    top_k=50,
    top_p=0.95,
    num_return_sequences=20,
)
gen_flags = []

# Assemble the generation command line and print it.
cmd_command = create_cmd_command(python_path, gen_script, generate_kwargs, gen_flags)
print(cmd_command)

C:\Users\Alex\Anaconda3\envs\pytorch\python.exe ru_transformers-master\run_generation.py --model_type=gpt2-yttm --model_name_or_path=models\output --length=200 --temperature=0.9 --stop_token="<|endoftext|>" --top_k=50 --top_p=0.95 --num_return_sequences=20 
