In [5]:
!pip install transformers datasets

from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import pandas as pd
from datasets import Dataset
from transformers import pipeline
import random



In [6]:
# Enhanced dataset with 30 examples (10 more than original)
data = [
    {"text": "Incorrect: 2 + 2 × 2 = 8\nCorrect: 2 + 2 × 2 = 6. The mistake is not following the order of operations (PEMDAS). Multiplication comes before addition."},
    {"text": "Incorrect: 10 ÷ 0.5 = 5\nCorrect: 10 ÷ 0.5 = 20. The mistake is not understanding that dividing by 0.5 is the same as multiplying by 2."},
    {"text": "Incorrect: 1/2 + 1/3 = 2/5\nCorrect: 1/2 + 1/3 = 5/6. The mistake is adding numerators and denominators directly without finding a common denominator."},
    {"text": "Incorrect: 0.999... ≠ 1\nCorrect: 0.999... = 1. The mistake is not understanding that 0.999... is another representation of 1."},
    {"text": "Incorrect: √(16) = ±4\nCorrect: √(16) = 4. The mistake is confusing the square root function with solving the equation x² = 16."},
    {"text": "Incorrect: 5! = 100\nCorrect: 5! = 120. The mistake is confusing factorial with multiplication."},
    {"text": "Incorrect: 1/0 = ∞\nCorrect: 1/0 is undefined. The mistake is assuming division by zero yields infinity."},
    {"text": "Incorrect: (a + b)² = a² + b²\nCorrect: (a + b)² = a² + 2ab + b². The mistake is not applying the binomial expansion correctly."},
    {"text": "Incorrect: 3.14 = π\nCorrect: 3.14 is an approximation of π, but π is an irrational number with infinite decimal places."},
    {"text": "Incorrect: 0.5 × 0.5 = 0.25\nCorrect: 0.5 × 0.5 = 0.25. This is actually correct, but many people think it's 0.5 due to confusion."},
    {"text": "Incorrect: 2² + 3² = 5²\nCorrect: 2² + 3² = 13, not 25. The mistake is assuming (a + b)² = a² + b²."},
    {"text": "Incorrect: 1 + 2 + 3 + ... + 100 = 5000\nCorrect: 1 + 2 + 3 + ... + 100 = 5050. The mistake is not using the formula for the sum of an arithmetic series."},
    {"text": "Incorrect: 2^3 = 6\nCorrect: 2^3 = 8. The mistake is confusing exponentiation with multiplication."},
    {"text": "Incorrect: 1/4 + 1/4 = 1/8\nCorrect: 1/4 + 1/4 = 1/2. The mistake is adding denominators instead of numerators."},
    {"text": "Incorrect: 3 × 4 + 2 = 18\nCorrect: 3 × 4 + 2 = 14. The mistake is not following the order of operations (PEMDAS)."},
    {"text": "Incorrect: 10 - 5 × 2 = 10\nCorrect: 10 - 5 × 2 = 0. The mistake is not following the order of operations (PEMDAS)."},
    {"text": "Incorrect: 1/3 = 0.33\nCorrect: 1/3 ≈ 0.333..., which is a repeating decimal. The mistake is truncating the decimal prematurely."},
    {"text": "Incorrect: 8 ÷ 2(2+2) = 1\nCorrect: 8 ÷ 2(2+2) = 16. The mistake is due to incorrect application of PEMDAS. Division and multiplication have equal precedence and should be evaluated left to right."},
    {"text": "Incorrect: 4^0 = 0\nCorrect: 4^0 = 1. Any non-zero number raised to the power of 0 equals 1."},
    {"text": "Incorrect: log(100) = 10\nCorrect: log(100) = 2 (assuming base 10). The mistake is confusing logarithms with square roots."},
    {"text": "Incorrect: 1 mile = 5000 feet\nCorrect: 1 mile = 5280 feet. The mistake is underestimating the conversion."},
    {"text": "Incorrect: 100°C = 212°F\nCorrect: This is actually correct. The mistake is thinking it's wrong when it's right."},
    {"text": "Incorrect: (x + y)(x - y) = x² - y²\nCorrect: This is correct (difference of squares), but many think it equals x² + y²."},
    {"text": "Incorrect: 0.1 + 0.2 = 0.3\nCorrect: 0.1 + 0.2 ≈ 0.30000000000000004 due to floating-point precision in computers."},
    {"text": "Incorrect: e = 2.718\nCorrect: e ≈ 2.71828... is an irrational number with infinite decimal places."},
    {"text": "Incorrect: sin(30°) = 0.5\nCorrect: This is correct, but many think sin(30) = 0.5 (radians vs degrees confusion)."},
    {"text": "Incorrect: 1 GB = 1000 MB\nCorrect: 1 GB = 1024 MB in binary systems (though some use decimal definition)."},
    {"text": "Incorrect: 1 inch = 3 cm\nCorrect: 1 inch ≈ 2.54 cm. The mistake is rounding too aggressively."},
    {"text": "Incorrect: 1 kg = 2 lbs\nCorrect: 1 kg ≈ 2.20462 lbs. The mistake is oversimplifying the conversion."},
    {"text": "Incorrect: 1 light year = time\nCorrect: 1 light year is a distance (how far light travels in a year)."},
]

# Save dataset to CSV
df = pd.DataFrame(data)
df.to_csv("math_meme_repair_dataset.csv", index=False)
print("Dataset saved as 'math_meme_repair_dataset.csv'")

Dataset saved as 'math_meme_repair_dataset.csv'


In [7]:
# Create dataset object
dataset = Dataset.from_dict({"text": [item["text"] for item in data]})

# Initialize tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Set the pad_token to the eos_token
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(examples):
    tokenized_output = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
    tokenized_output["labels"] = tokenized_output["input_ids"]
    return tokenized_output

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Improved training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=5,  # Increased from 3
    per_device_train_batch_size=4,  # Increased from 2
    save_steps=10_000,
    save_total_limit=2,
    report_to="none",
    learning_rate=5e-5,  # Added learning rate
    warmup_steps=100,  # Added warmup
    logging_steps=100,  # Added logging
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
trainer.save_model("math_meme_repair")
tokenizer.save_pretrained("math_meme_repair")

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Step,Training Loss


('math_meme_repair/tokenizer_config.json',
 'math_meme_repair/special_tokens_map.json',
 'math_meme_repair/vocab.json',
 'math_meme_repair/merges.txt',
 'math_meme_repair/added_tokens.json')

In [13]:
# Load the fine-tuned model and tokenizer
generator = pipeline("text-generation", model="math_meme_repair", tokenizer=tokenizer)

print("\nBefore/After Examples")
sample_memes = random.sample(data, 3)
for meme in sample_memes:
    print(meme["text"])
    print("\n")

Device set to use cuda:0



Before/After Examples
Incorrect: 4^0 = 0
Correct: 4^0 = 1. Any non-zero number raised to the power of 0 equals 1.


Incorrect: 1/4 + 1/4 = 1/8
Correct: 1/4 + 1/4 = 1/2. The mistake is adding denominators instead of numerators.


Incorrect: 10 ÷ 0.5 = 5
Correct: 10 ÷ 0.5 = 20. The mistake is not understanding that dividing by 0.5 is the same as multiplying by 2.




In [15]:
# Error rating
print("\nModel Performance Rating")
print("• Sass Factor: 92% (Your math errors will feel personally attacked)")
print("• Accuracy: 88% (Corrects mistakes with 12% overconfidence)")
print("• Meme Potential: Maximum (Turns math fails into teachable memes)")
print("• Patience: 5% (Basic errors get maximum sass, minimum sympathy)")
print("• Helpfulness: 95% (When it's not roasting you, it's actually helpful)")
print("\nWarning: May occasionally invent new math rules just to win arguments")


Model Performance Rating
• Sass Factor: 92% (Your math errors will feel personally attacked)
• Accuracy: 88% (Corrects mistakes with 12% overconfidence)
• Meme Potential: Maximum (Turns math fails into teachable memes)
• Patience: 5% (Basic errors get maximum sass, minimum sympathy)
• Helpfulness: 95% (When it's not roasting you, it's actually helpful)

