In [None]:
from transformers import GPT2Tokenizer,TextDataset,DataCollatorForLanguageModeling,GPT2LMHeadModel, pipeline, Trainer, TrainingArguments

In [None]:
!pip install datasets

In [None]:
from datasets import Dataset
import pandas as pd

In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem; this prompts for authorization.
drive.mount('/content/drive')

# After mounting, Drive files are accessible under the path "/content/drive/My Drive/".


In [None]:
# Load the English -> LaTeX pairs from Drive, then show the frame's
# dimensions and its first two rows.
data = pd.read_csv('/content/drive/My Drive/english_to_latex.csv')
print(data.shape)
data.head(n=2)

In [None]:
# Load the 'gpt2' tokenizer and reuse its end-of-sequence token as the padding token.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Fixed markers used in every prompt: a task prefix line (ending in a newline)
# and the tag that introduces the LaTeX answer.
CONVERSION_PROMPT, CONVERSION_TOKEN = 'LCT\n', 'LaTeX:'

In [None]:
# Training prompt that we want GPT-2 to recognize and learn. Each element has the form:
#   LCT\nEnglish: <English text>\nLaTeX: <LaTeX text>
training_examples = (
    f'{CONVERSION_PROMPT}English: '
    + data['English']
    + f'\n{CONVERSION_TOKEN} '
    + data['LaTeX']
)

In [None]:
# Sanity check: show the first formatted training example.
print(training_examples[0])

In [None]:
# Wrap the formatted examples in a single-column ('text') frame and preview two rows.
task_df = pd.DataFrame(data={'text': training_examples})
task_df.head(n=2)

In [None]:
SPLIT_SEED = 42  # fixed so the train/test partition is the same on every run


def preprocess(examples):
    """Tokenize a batch of rows.

    Args:
        examples: batch supplied by ``Dataset.map(batched=True)``; its
            ``'text'`` entry is what gets tokenized.

    Returns:
        The tokenizer output for ``examples['text']`` with truncation enabled.
    """
    return tokenizer(examples['text'], truncation=True)


latex_data = Dataset.from_pandas(task_df)  # turn the pandas DataFrame into a Dataset
latex_data = latex_data.map(preprocess, batched=True)
# 80/20 train/test split. Seeded: without a seed the split is shuffled
# differently on each run, so evaluation numbers are not comparable across runs.
latex_data = latex_data.train_test_split(train_size=0.8, seed=SPLIT_SEED)


In [None]:
# Batch collator for language modelling with masked-LM turned off (mlm=False).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [None]:
# Load the pretrained 'gpt2' checkpoint as a GPT2LMHeadModel; this is the model
# passed to the first Trainer below.
latex_gpt2=GPT2LMHeadModel.from_pretrained('gpt2')

In [None]:
pip install accelerate -U

In [None]:
# Training configuration for the first run on the English -> LaTeX examples.
training_args = TrainingArguments(
    # Output location (allowed to overwrite existing contents).
    output_dir='./gpt2_latex',
    overwrite_output_dir=True,
    # Schedule and batch sizes.
    num_train_epochs=10,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=20,
    # Evaluate and checkpoint once per epoch; reload the best checkpoint at the end.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    # Logging. (warmup_steps is not set here.)
    log_level='info',
    logging_steps=5,
)

In [None]:
# Trainer for the first run: `latex_gpt2` on the LaTeX train/test split.
trainer = Trainer(
    args=training_args,
    model=latex_gpt2,
    train_dataset=latex_data['train'],
    eval_dataset=latex_data['test'],
    data_collator=data_collator,
)

In [None]:
# Evaluate on the held-out LaTeX split before the training cell that follows,
# giving a pre-training reference point for the metrics.
trainer.evaluate()

In [None]:
# Fine-tune `latex_gpt2` on the LaTeX examples with the trainer configured above.
trainer.train()

In [None]:
# Calculus text file from Drive, tokenized with block_size=32.
calculus_data = TextDataset(
    tokenizer=tokenizer,
    file_path='/content/drive/My Drive/calculus made easy.txt',
    block_size=32,
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Reload pretrained GPT-2 under the name the next Trainer cell reads (`latex_gpt2`).
# This used to be assigned to the misspelled `latext_gpt2`, which left `latex_gpt2`
# pointing at the model already fine-tuned on the LaTeX data, so the calculus run
# did not start from the fresh checkpoint loaded here.
latex_gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')

# Training configuration for the calculus-text run; outputs go to Drive.
training_args = TrainingArguments(
    output_dir='/content/drive/My Drive/gpt2_calculus',
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=50,
    log_level='info',
    load_best_model_at_end=True,
    evaluation_strategy='steps',
    save_strategy='steps',
)

In [None]:
# The first 80% of the calculus examples are used for training and the
# remaining 20% for evaluation.
calculus_examples = calculus_data.examples
split_at = int(len(calculus_examples) * 0.8)

trainer = Trainer(
    args=training_args,
    model=latex_gpt2,
    train_dataset=calculus_examples[:split_at],
    eval_dataset=calculus_examples[split_at:],
    data_collator=data_collator,
)

In [None]:
# Evaluate on the held-out calculus examples before the training cell that follows.
trainer.evaluate()

In [None]:
# Train on the calculus text with the trainer configured in the previous cells.
trainer.train()

In [None]:
# Evaluate again after training, for comparison with the pre-training evaluation above.
trainer.evaluate()

In [None]:
# Save the trained model. No path is passed, so it is expected to be written to
# the `output_dir` of `training_args` (the Drive folder that the next cell loads from).
trainer.save_model()

In [None]:
# Reload the saved calculus model from Drive and wrap it, together with the
# tokenizer, in a text-generation pipeline.
loaded_model = GPT2LMHeadModel.from_pretrained('/content/drive/My Drive/gpt2_calculus')
latex_generator = pipeline(
    'text-generation',
    tokenizer=tokenizer,
    model=loaded_model,
)

In [None]:
# Build an inference prompt in the same layout as the training examples,
# ending right after the LaTeX tag.
text_sample = 'f of x equals integral from 0 to pi of x to the fourth power'
conversion_text_sample = (
    f'{CONVERSION_PROMPT}English: {text_sample}\n'
    f'{CONVERSION_TOKEN}'
)
print(conversion_text_sample)

In [None]:
# Generate with beam search from the calculus-only model, allowing at most
# 20 tokens beyond the length of the encoded prompt.
prompt_token_count = len(tokenizer.encode(conversion_text_sample))
calculus_only_outputs = latex_generator(
    conversion_text_sample,
    num_beams=5,
    early_stopping=True,
    # NOTE(review): temperature is passed without do_sample; it presumably only
    # matters for sampling-based decoding — confirm whether sampling was intended.
    temperature=0.7,
    max_length=prompt_token_count + 20,
)
print(calculus_only_outputs[0]['generated_text'])

In [None]:
# Second stage: fine-tune the reloaded calculus model (`loaded_model`) on the
# English -> LaTeX split.
training_args = TrainingArguments(
    # Output location (allowed to overwrite existing contents).
    output_dir='./gpt2_latex_calculus',
    overwrite_output_dir=True,
    # Schedule and batch sizes.
    num_train_epochs=10,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=20,
    # Evaluate and checkpoint once per epoch; reload the best checkpoint at the end.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    # Logging. (warmup_steps is not set here.)
    log_level='info',
    logging_steps=5,
)

trainer = Trainer(
    args=training_args,
    model=loaded_model,
    train_dataset=latex_data['train'],
    eval_dataset=latex_data['test'],
    data_collator=data_collator,
)

In [None]:
# Evaluate `loaded_model` on the held-out LaTeX split before the training cell that follows.
trainer.evaluate()

In [None]:
# Fine-tune `loaded_model` on the LaTeX examples with the trainer configured above.
trainer.train()

In [None]:
# Save the fine-tuned model. No path is passed, so it is expected to be written to
# the `output_dir` of `training_args` ('./gpt2_latex_calculus', loaded in the next cell).
trainer.save_model()

In [None]:
# Reload the final saved model from './gpt2_latex_calculus' and build a
# text-generation pipeline around it.
loaded_model = GPT2LMHeadModel.from_pretrained('./gpt2_latex_calculus')
latex_generator_f = pipeline(
    'text-generation',
    tokenizer=tokenizer,
    model=loaded_model,
)

In [None]:
# Same prompt and generation settings as the earlier generation cell, now
# using the pipeline built on the final model.
final_prompt_token_count = len(tokenizer.encode(conversion_text_sample))
final_outputs = latex_generator_f(
    conversion_text_sample,
    num_beams=5,
    early_stopping=True,
    temperature=0.7,
    max_length=final_prompt_token_count + 20,
)
print(final_outputs[0]['generated_text'])