In [1]:
pip install datasets transformers==4.43.3

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting transformers==4.43.3
  Downloading transformers-4.43.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading transformers-4.43.3-py3-none-any.whl (9.4 MB)
[

In [2]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, EarlyStoppingCallback
import torch
from sklearn.model_selection import train_test_split

#load dataset
df = pd.read_json("hf://datasets/chibbss/fitness-chat-prompt-completion-dataset/fitness-chat-prompt-completion-dataset.json")

# Prepare the dataset
df['text'] = df['instruction'] + " " + df['output']
train_df, test_df = train_test_split(df[['text']], test_size=0.2, random_state=42)
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
df['text'].head()

Unnamed: 0,text
0,What are some practical steps I can take to im...
1,What is a balanced diet and how can I ensure I...
2,What are some effective strategies for incorpo...
3,How can I manage stress and maintain a healthy...
4,What are some tips for getting better quality ...


In [7]:
# # Define the path where you want to save the file in Google Drive
file_path = '/content/drive/My Drive/fitnessqa.csv'

# # Save DataFrame to CSV file in Google Drive
df.to_csv(file_path, index=False)

In [8]:
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Add a padding token
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the data
def tokenize_function(examples):
    encodings = tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)

    encodings['labels'] = encodings['input_ids'].copy()
    return encodings

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Ensuring the datasets are formatted correctly
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results1",
    num_train_epochs=150,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=200,
    save_total_limit=5,
    learning_rate=3e-5,
    evaluation_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    weight_decay=0.01,
    load_best_model_at_end=True,
    fp16=True,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # Early stopping if no improvement for 3 evals
)

# Fine-tune the model
trainer.train()


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Map:   0%|          | 0/196 [00:00<?, ? examples/s]

Map:   0%|          | 0/49 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss
200,1.069,1.298482
400,0.5808,1.417269
600,0.4044,1.621534
800,0.314,1.788438


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=800, training_loss=0.7659016448259354, metrics={'train_runtime': 88.7141, 'train_samples_per_second': 331.402, 'train_steps_per_second': 82.85, 'total_flos': 209033625600000.0, 'train_loss': 0.7659016448259354, 'epoch': 16.3265306122449})

In [9]:
# Load the best model checkpoint manually
best_checkpoint = "./results1/checkpoint-800"
model = AutoModelForCausalLM.from_pretrained(best_checkpoint)

# Save this best model
model.save_pretrained("./fine-tuned-gpt2-best")
tokenizer.save_pretrained("./fine-tuned-gpt2-best")

('./fine-tuned-gpt2-best/tokenizer_config.json',
 './fine-tuned-gpt2-best/special_tokens_map.json',
 './fine-tuned-gpt2-best/vocab.json',
 './fine-tuned-gpt2-best/merges.txt',
 './fine-tuned-gpt2-best/added_tokens.json',
 './fine-tuned-gpt2-best/tokenizer.json')

In [10]:

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the fine-tuned model and tokenizer
model_path = "./fine-tuned-gpt2-best"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Function to generate a response
def generate_response(prompt, max_length=150, num_beams=5, early_stopping=True):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        num_return_sequences=1,
        num_beams=num_beams,  # Beam search for better quality
        early_stopping=early_stopping,
        pad_token_id=tokenizer.eos_token_id  # Set pad token id
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Test the chatbot with some example questions
example_questions = [
    "What is a healthy diet?"
]

# Generate and print responses for the example questions
for question in example_questions:
    print(f"Question: {question}")
    print(f"Response: {generate_response(question)}")
    print("-" * 50)

Question: What is a healthy diet?
Response: What is a healthy diet? A balanced diet is one that includes all the essential nutrients that your body needs to function properly. It should include fresh fruits, vegetables, whole grains, lean proteins, and healthy fats. It should also include a variety of whole grains, lean proteins, and healthy fats. It should also include a variety of whole fruits, vegetables, lean proteins, and healthy fats. It should also include a variety of whole grains, lean proteins, and healthy fats. It should also include a variety of whole fruits, vegetables, lean proteins, and healthy fats. It should also include a variety of whole fruits, vegetables, lean proteins, and healthy fats. It should also include a variety of whole grains, lean proteins, and healthy fats
--------------------------------------------------


In [11]:
# Test the chatbot with some example questions
example_questions = [
    "How to loose 10kg in a month?"
]

# Generate and print responses for the example questions
for question in example_questions:
    print(f"Question: {question}")
    print(f"Response: {generate_response(question)}")
    print("-" * 50)

Question: How to loose 10kg in a month?
Response: How to loose 10kg in a month? Breaking a weight loss plateau can be challenging, but it's manageable. Set achievable goals, track progress, and celebrate small victories along the way. Remember how great you feel after a tough workout. Remember how great you feel after.
--------------------------------------------------


In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the tokenizer
model_id = "/content/drive/MyDrive/Colab Notebooks/fine-tuned-gpt2-best"  # Replace with your model directory

tokenizer = GPT2Tokenizer.from_pretrained(model_id)

# Load the model
model = GPT2LMHeadModel.from_pretrained(
    model_id,
    torch_dtype=torch.float16,  # or torch.float32, depending on your hardware support
    low_cpu_mem_usage=True,
)

# Optionally, move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to generate text using the fine-tuned model
def generate_text(prompt, max_length=100):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(inputs.input_ids, max_length=max_length, num_return_sequences=1)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage
prompt = "what is balanced diet?"
generated_text = generate_text(prompt)
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


what is balanced diet? Balanced diet is one that includes all the essential nutrients that your body needs to function properly. It includes a variety of whole grains, fruits, vegetables, lean proteins, and healthy fats. It also includes a variety of whole grains, lean proteins, and healthy fats. You can choose from a variety of healthy fats, carbohydrates, and water sources to support your overall health and well-being.
