In [8]:
import re
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)




In [2]:
main_data = pd.read_csv('preprocessed_recipes.csv')

## tokenization and normlisation 

In [16]:
def normalize_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"<.*?>", " ", text)                     # remove HTML
    text = re.sub(r"\s+", " ", text).strip()               # extra spaces
    text = re.sub(r"[^a-z0-9.,;:!?()\-/'\s]", "", text)    # keep punctuation
    return text

# apply normalization
main_data["question"] = main_data["question"].apply(normalize_text)
main_data["answer"] = main_data["answer"].apply(normalize_text)

main_data[['question','answer']].head(4)

Unnamed: 0,question,answer
0,how do i make low-carb hot morning cereal ?,1 1/2 cups almond flour 1/4 cup flax seed meal...
1,how do i make sunday afternoon tea quick pickl...,use a fork to run through the surface of the c...
2,how do i make candied yams 2 recipe?,put everything in a pot. stir. cover and cook ...
3,how do i make portuguese-style steamed mussels?,in a large (6 quart) casserole or soup pot ove...


## tokenisation 

In [19]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token

#  STEP 4: Create HF Dataset and Tokenize
hf_dataset = Dataset.from_pandas(main_data[["question", "answer"]])

def tokenize_fn(example):
    input_text = "Question: " + example["question"] + " Answer: " + example["answer"]
    return tokenizer(
        input_text,
        truncation=True,
        padding="max_length",
        max_length=256
    )

tokenized_dataset = hf_dataset.map(tokenize_fn)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [20]:
#  Split into Train / Validation / Test
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_val_split = split_dataset["train"].train_test_split(test_size=0.1, seed=42)

datasets = DatasetDict({
    "train": train_val_split["train"],
    "validation": train_val_split["test"],
    "test": split_dataset["test"]
})

print(f"Train: {len(datasets['train'])}")
print(f"Validation: {len(datasets['validation'])}")
print(f"Test: {len(datasets['test'])}")

Train: 16200
Validation: 1800
Test: 2000


In [21]:
#  Load Model
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

#  Data Collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False   # because GPT-2 is causal LM, not masked
)

In [26]:

#Training Arguments
training_args = TrainingArguments(
    output_dir="./gpt2-african-recipes",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    learning_rate=5e-5,
    weight_decay=0.01,
    eval_strategy="steps",   # new argument name (replaces evaluation_strategy)
    eval_steps=500,
    push_to_hub=True,        # to upload to your Hugging Face repo
    hub_model_id="Christin1234/recipes_chatbot",  # your repo name
    hub_strategy="every_save"
)


In [27]:

#  Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

#  Train the Model
trainer.train()

  trainer = Trainer(


KeyboardInterrupt: 

In [None]:
# ✅ STEP 11: Evaluate on Test Set
results = trainer.evaluate(datasets["test"])
print("Test Results:", results)

# ✅ STEP 12: Push Model to Hugging Face Hub
trainer.push_to_hub()
tokenizer.push_to_hub("Eleka/qa-beans-gpt2")

In [None]:
# from transformers import GPT2Tokenizer

# # Load GPT-2 tokenizer
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# tokenizer.pad_token = tokenizer.eos_token  # GPT-2 does not have a pad token by default

# # Test tokenization on one example
# sample_input = "Question: " + main_data['question'][0] + " Answer: " + main_data['answer'][0]
# tokens = tokenizer(sample_input, truncation=True, padding='max_length', max_length=256, return_tensors="tf")
# print(tokens)


TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.


{'input_ids': <tf.Tensor: shape=(1, 256), dtype=int32, numpy=
array([[24361,    25,   703,   466,  1312,   787,  1877,    12, 35684,
         3024,  3329, 33158,  5633, 23998,    25,   352,   352,    14,
           17, 14180, 26948, 10601,   352,    14,    19,  6508,   781,
          897,  9403,  9799,   352,    14,    19,  6508,   267,   265,
          865,   272,   352,    14,    17, 23053,  8268,   352,    14,
           17, 23053, 16871, 20584,   352,    14,    17, 23053, 16871,
        11913,   352,   352,    14,    18, 14180, 17797,  7545,   513,
        33107,   460,  5708,  3056,   352,    14,    17,   269,  4171,
        20853, 22870,   286, 16858,  1312,  1234,   287,   352,    14,
           17, 23053,   286,  8390,  7543, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 

In [5]:
!pip install --upgrade transformers




In [7]:
from datasets import Dataset

# Convert pandas dataframe to Hugging Face dataset
hf_dataset = Dataset.from_pandas(main_data[['question','answer']])

# Define tokenization function
def tokenize_fn(example):
    input_text = "Question: " + example['question'] + " Answer: " + example['answer']
    return tokenizer(input_text, truncation=True, padding='max_length', max_length=256)

# Apply to the entire dataset
tokenized_dataset = hf_dataset.map(tokenize_fn, batched=True)


Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

TypeError: can only concatenate str (not "list") to str

In [None]:
import tf_keras as keras
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)  # mlm=False for GPT-2

training_args = TrainingArguments(
    output_dir="./gpt2-african-recipes",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    learning_rate=5e-5,
    weight_decay=0.01,
    evaluation_strategy="steps",
    eval_steps=500,
    push_to_hub=True
)





NameError: name 'tokenizer' is not defined