<a href="https://colab.research.google.com/github/BTejas001/MWPS_V2-using-GPT2-model/blob/main/gpt2_model_fine_tuning_own_data_try2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import torch

# Load and prepare data (same as before)
data = pd.read_csv('/content/mathwp.csv')

def prepare_data(row):
    return f"Problem: {row['Problems']} Equation: {row['Equations']} Answer: {row['Answers']} <|endoftext|>"

data['text'] = data.apply(prepare_data, axis=1)
train_texts, val_texts = train_test_split(data['text'].tolist(), test_size=0.1)

with open('train.txt', 'w') as f:
    f.write('\n'.join(train_texts))
with open('val.txt', 'w') as f:
    f.write('\n'.join(val_texts))

# Load pre-trained model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Add padding token to tokenizer
tokenizer.pad_token = tokenizer.eos_token

# Create datasets and data collator (same as before)
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="train.txt",
    block_size=128)

val_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="val.txt",
    block_size=128)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-arithmetic",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_steps=200,
    save_steps=400,
    warmup_steps=1000,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()

# Save the model and tokenizer
model_save_path = "./gpt2-arithmetic-final"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Model and tokenizer saved to {model_save_path}")

# Function to load the model and tokenizer
def load_model_and_tokenizer(model_path):
    loaded_model = GPT2LMHeadModel.from_pretrained(model_path)
    loaded_tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    loaded_tokenizer.pad_token = loaded_tokenizer.eos_token
    return loaded_model, loaded_tokenizer



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



Step,Training Loss


Model and tokenizer saved to ./gpt2-arithmetic-final


In [2]:
# Function to generate equations for new problems
def generate_equation(problem, model, tokenizer):
    input_text = f"Problem: {problem} Equation:"
    input_ids = tokenizer.encode(input_text, return_tensors='pt')

    output = model.generate(input_ids,
                            max_length=50,
                            num_return_sequences=1,
                            no_repeat_ngram_size=2,
                            do_sample=True,
                            top_k=50,
                            top_p=0.95,
                            temperature=0.7,
                            early_stopping=True)

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    equation = generated_text.split("Equation:")[1].strip()
    return equation

# Example usage after training
print("Testing the freshly trained model:")
new_problem = "Sam has 3 pants. he buys 7 more pants. how many pants does sam have now?"
generated_equation = generate_equation(new_problem, model, tokenizer)
print(f"Problem: {new_problem}")
print(f"Generated Equation: {generated_equation}")

# Example of loading and using the saved model
print("\nTesting the loaded model:")
loaded_model, loaded_tokenizer = load_model_and_tokenizer(model_save_path)
loaded_generated_equation = generate_equation(new_problem, loaded_model, loaded_tokenizer)
print(f"Problem: {new_problem}")
print(f"Generated Equation: {loaded_generated_equation}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Testing the freshly trained model:
Problem: Sam has 3 pants. he buys 7 more pants. how many pants does sam have now?
Generated Equation: 3 + 7 Answer: 8 __________________________________________________________________________________________________________________________________________ _______________ ______________________________________________ _____________________ __________________________ | 5

Testing the loaded model:


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Problem: Sam has 3 pants. he buys 7 more pants. how many pants does sam have now?
Generated Equation: 3 + 7 Answer: 10 --------------------------- - 8 --------------- Answer : 11 ------------ -------------- ------------- _______________________________________________________________________________


In [None]:
# def clean_equation(equation):
#     # Remove any text after the first period
#     equation = equation.split('.')[0]
#     # Remove any non-mathematical characters
#     equation = ''.join(char for char in equation if char.isdigit() or char in '+-*/=().')
#     return equation

In [None]:
# def evaluate_model(model, tokenizer, test_data):
#     correct = 0
#     total = len(test_data)
#     for problem, true_equation in test_data:
#         generated_eq = generate_equation(problem, model, tokenizer)
#         generated_eq = clean_equation(generated_eq)
#         if generated_eq == true_equation:
#             correct += 1
#     accuracy = correct / total
#     print(f"Model accuracy: {accuracy:.2f}")