In [1]:
import torch
from transformers import pipeline, set_seed, GPT2Tokenizer,GPT2LMHeadModel, TextDataset, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import pandas as pd



#### Finetuning on extract-from-how-google-works book

In [2]:
import shutil

source_file_path = '/kaggle/input/extract-from-how-google-works-eric-schmidt/How Google Works.txt'
destination_file_path = '/kaggle/working/How Google Works.txt'

# Copy the file
shutil.copy(source_file_path, destination_file_path)

print(f'The file has been copied from {source_file_path} to {destination_file_path}.')


The file has been copied from /kaggle/input/extract-from-how-google-works-eric-schmidt/How Google Works.txt to /kaggle/working/How Google Works.txt.


In [3]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [4]:
pds_data = TextDataset(
            tokenizer=tokenizer,
            file_path='/kaggle/working/How Google Works.txt',
            block_size=32
            )



In [5]:
pds_data[0],pds_data[0].shape

(tensor([44140, 11725,   198, 10962,   286, 26714,   198,  9980, 15653,   198,
         15269,  7873,   198,   818, 10213,   351,   262,   471,    13,    50,
            13, 15069,  2191,   286, 15408,    11,   262, 21976,    11, 33794,
            11,   290]),
 torch.Size([32]))

In [6]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,mlm=False)

In [19]:
model = GPT2LMHeadModel.from_pretrained('gpt2')

pretrianed_generator = pipeline(
    'text-generation',model=model,tokenizer='gpt2',
     config={'max_length':200,'do_sample':True,'top_p':0.9,'temperature':0.7,'top_k':10}   
)

In [20]:
# Intial resposne of model without any context of our book
for gen_seq in pretrianed_generator("How does data-driven insights contribute to the Google's success ",num_return_sequences=3):
    print(gen_seq['generated_text'])
    print("===========")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


How does data-driven insights contribute to the Google's success " asked Google's co- alleged co-founder Larry Page III.

Page's response went as follows: "(I know) I am not the only one who's thought this
How does data-driven insights contribute to the Google's success " asks Brian Sibbie, Google Analytics Head of Marketing. "We're looking at the ways people respond to what we know is going on. In those circumstances, we'll look
How does data-driven insights contribute to the Google's success "

Yes, Google's search engine was great. But now, with more and more people looking for that thing, it's more and more difficult for Google to do anything more


In [9]:
training_args = TrainingArguments(
                output_dir="./gpt2_pds",
                overwrite_output_dir = True,
                num_train_epochs=7,
                per_device_train_batch_size=32,
                per_device_eval_batch_size=32,
                warmup_steps=len(pds_data.examples),
                logging_steps=50,
                load_best_model_at_end=True,
                evaluation_strategy='epoch',
                save_strategy='epoch'                    
                )

trainer = Trainer(
            model=model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=pds_data.examples[:int(len(pds_data.examples)*.8)], #first 80% of our data
            eval_dataset=pds_data.examples[int(len(pds_data.examples)*.8):]
            )

In [10]:
trainer.evaluate()



[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


{'eval_loss': 4.215001583099365,
 'eval_runtime': 7.1546,
 'eval_samples_per_second': 99.377,
 'eval_steps_per_second': 1.677}

In [11]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,4.199103
2,4.411400,4.157639
3,4.333600,4.108166
4,4.215400,4.059489
5,4.120200,4.032833
6,4.042500,4.0125
7,3.957000,3.997054


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=315, training_loss=4.169738103472997, metrics={'train_runtime': 162.9273, 'train_samples_per_second': 122.104, 'train_steps_per_second': 1.933, 'total_flos': 324883980288000.0, 'train_loss': 4.169738103472997, 'epoch': 7.0})

In [12]:
trainer.evaluate()



{'eval_loss': 3.997053861618042,
 'eval_runtime': 1.7957,
 'eval_samples_per_second': 395.95,
 'eval_steps_per_second': 6.683,
 'epoch': 7.0}

In [17]:
fine_tuned_generator = pipeline(
    'text-generation',model=model,tokenizer=tokenizer,device=0,
     config={'max_length':300,'do_sample':True,'top_p':0.9,'temperature':0.7,'top_k':10}   
)

for gen_seq in fine_tuned_generator("How does data-driven insights contribute to the Google's success "",num_return_sequences=3):
    print(gen_seq['generated_text'])
    print("===========")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


How does data-driven insights contribute to the company's success " asked the CEO.
 "Let us start with the basic idea," said Tim. "It's like creating a new world, right in your living room. There are sensors and
How does data-driven insights contribute to the company's success  ? What is the process for this? The answer is in the organization.  I think that the best managers want managers who have a strong personal stake in their performance   
How does data-driven insights contribute to the company's success  or how does the team decide who gets to hire who? The answer is a combination of how important the insights are to the decision making process, but also an understanding of how people


#### Finetuning on LaTex data
A fine-tuned GPT-2 model that take in the description of an equation in English and output the LaTeX to render that equation.

In [24]:
from datasets import Dataset
import pandas as pd

data = pd.read_csv('/kaggle/input/english-latex-50/equations_dataset.csv')

data.head(2)

Unnamed: 0,English,Latex
0,Integral from 0 to 1 of x,"\int_{0}^{1} x\,dx"
1,Sum from i equals 1 to n of i,\sum_{i=1}^{n} i


In [25]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

CONVERSION_PROMPT = 'Latex Conversion Task\n'

CONVERSION_TOKEN = 'LaTeX:'

In [26]:
training_examples = f'{CONVERSION_PROMPT}English: ' + data['English'] + '\n' + CONVERSION_TOKEN + ' ' + data['Latex'].astype(str)

print(training_examples[0])

Latex Conversion Task
English: Integral from 0 to 1 of x
LaTeX:  \int_{0}^{1} x\,dx


## \int_{0}^{1} x\,dx -> $\int_{0}^{1} x\,dx$

In [27]:
task_df = pd.DataFrame({'text':training_examples})

task_df.head(2)

Unnamed: 0,text
0,Latex Conversion Task\nEnglish: Integral from ...
1,Latex Conversion Task\nEnglish: Sum from i equ...


In [28]:
latex_data = Dataset.from_pandas(task_df)

def preprocess(data):
        return tokenizer(data['text'], truncation=True)
    
latex_data = latex_data.map(preprocess,batched=True)    

latex_data = latex_data.train_test_split(train_size=.8)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [29]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,mlm=False)

In [30]:
latex_gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')

training_args = TrainingArguments(
                output_dir="./latex_gpt2",
                overwrite_output_dir = True,
                num_train_epochs=10,
                per_device_train_batch_size=2,
                per_device_eval_batch_size=20,                
                logging_steps=5,
                log_level='info',
                load_best_model_at_end=True,
                evaluation_strategy='epoch',
                save_strategy='epoch'                    
                )

trainer = Trainer(
            model=latex_gpt2,
            args=training_args,
            data_collator=data_collator,
            train_dataset=latex_data['train'],
            eval_dataset=latex_data['test']
            )

trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 11
  Batch size = 40


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


{'eval_loss': 4.243175029754639,
 'eval_runtime': 0.1162,
 'eval_samples_per_second': 94.644,
 'eval_steps_per_second': 8.604}

In [31]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 41
  Num Epochs = 10
  Instantaneous batch size per device = 2
  Training with DataParallel so batch size has been adjusted to: 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 110
  Number of trainable parameters = 124,439,808


Epoch,Training Loss,Validation Loss
1,2.8978,1.836686
2,1.9864,1.606321
3,1.6595,1.519372
4,1.4087,1.594503
5,1.2239,1.512287
6,1.2203,1.587681
7,0.9181,1.586797
8,1.0311,1.581843
9,0.8961,1.598251
10,0.8696,1.619587


The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 11
  Batch size = 40
Saving model checkpoint to ./latex_gpt2/tmp-checkpoint-11
Configuration saved in ./latex_gpt2/tmp-checkpoint-11/config.json
Configuration saved in ./latex_gpt2/tmp-checkpoint-11/generation_config.json
Model weights saved in ./latex_gpt2/tmp-checkpoint-11/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 11
  Batch size = 40
Saving model checkpoint to ./latex_gpt2/tmp-checkpoint-22
Configuration saved in ./latex_gpt2/tmp-checkpoint-22/config.json
Confi

TrainOutput(global_step=110, training_loss=1.4559958046132868, metrics={'train_runtime': 54.7292, 'train_samples_per_second': 7.491, 'train_steps_per_second': 2.01, 'total_flos': 10668063744000.0, 'train_loss': 1.4559958046132868, 'epoch': 10.0})

In [None]:
trainer.evaluate()

In [41]:
test_example = 'integral from O to pi of x to the fourth power'

conversion_test_ex = f'{CONVERSION_PROMPT}Eng1ish: {test_example}\n{CONVERSION_TOKEN}'

print(conversion_test_ex)

Latex Conversion Task
Eng1ish: integral from O to pi of x to the fourth power
LaTeX:


In [42]:
latex_generator = pipeline(
    'text-generation',model=latex_gpt2,tokenizer=tokenizer,device=0     
)

In [43]:
print(latex_generator(conversion_test_ex,num_beams=5,early_stopping=True,temperature=0.7,max_length=len(tokenizer.encode(conversion_test_ex)) + 20)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Latex Conversion Task
Eng1ish: integral from O to pi of x to the fourth power
LaTeX:  \int_{-\infty}^{\infty}^{-\infty}
