Installing Required Libraries

In [None]:
!pip install datasets
!pip install evaluate
!pip install gradio
!pip install rouge_score

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

Preprocessing Data, Training GPT-2 and BERT

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import torch

# Step 1: Load the DistilBERT tokenizer for classification (Supplements vs Exercises).
# The classification model itself is instantiated later in this cell, right before it
# is fine-tuned; a copy loaded here would be overwritten without ever being used.
tokenizer_bert = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Step 2: Load GPT-2 tokenizer and model for answer generation
tokenizer_gpt2 = GPT2Tokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token so batches can be padded.
tokenizer_gpt2.pad_token = tokenizer_gpt2.eos_token
model_gpt2 = GPT2LMHeadModel.from_pretrained("gpt2")

# Load datasets (paths are the Colab upload locations)
exercises_df = pd.read_csv('/content/fitness_exercises - fitness_exercises.csv')
supplements_df = pd.read_csv('/content/supplements.csv')

# Preprocess data for GPT-2 fine-tuning
def _clean_description(value):
    """Return `value`, or the shared placeholder when it is missing.

    A description counts as missing when it is null/NaN or the literal
    'No Description'. Without the null check a NaN cell would be formatted
    into the training text as the string "nan".
    """
    if pd.isnull(value) or value == 'No Description':
        return 'No description available'
    return value


# One "Q: ... A: ..." line per exercise.
exercise_qna = [
    f"Q: What is the exercise {exercise_name}? A: {_clean_description(description)}"
    for exercise_name, description in zip(exercises_df['Exercise Name'], exercises_df['Description'])
]

# One "Q: ... A: ..." line per supplement.
supplement_qna = [
    f"Q: What is {product_title}? A: {_clean_description(description)}"
    for product_title, description in zip(supplements_df['Product Title'], supplements_df['Product Description'])
]

# Combine the Q&A pairs into a single list (exercises first, then supplements)
qna_data = exercise_qna + supplement_qna

# Convert to Hugging Face Dataset
gpt2_dataset = Dataset.from_dict({"text": qna_data})

# Tokenize the dataset for GPT-2
def tokenize_gpt2(examples):
    """Tokenize a batch of Q&A strings and attach causal-LM labels.

    Args:
        examples: batch dict from `Dataset.map(batched=True)`; `examples['text']`
            is the list of strings to encode.

    Returns:
        The tokenizer output (`input_ids`, `attention_mask`) with an added
        `labels` entry of the same shape as `input_ids`.

    Notes:
        * Labels are an independent, unshifted copy of `input_ids`. The language
          model head shifts labels internally, so shifting here would be wrong,
          and the previous in-place shift on a shallow copy also rewrote
          `input_ids` itself.
        * Padding positions are set to -100 so they are ignored by the loss,
          except the first padding token after the text. The pad token is the
          EOS token, so keeping that one target still teaches the model where an
          answer ends. This assumes right-side padding; with left padding the
          extra assignment is a harmless no-op on a real token.
    """
    encodings = tokenizer_gpt2(examples['text'], truncation=True, padding=True, max_length=512)

    labels = []
    for ids, mask in zip(encodings['input_ids'], encodings['attention_mask']):
        # Real tokens keep their id; padding is masked out of the loss.
        row = [token if keep else -100 for token, keep in zip(ids, mask)]
        real_len = sum(mask)
        if real_len < len(row):
            # First pad position == EOS: keep it as the "stop here" target.
            row[real_len] = ids[real_len]
        labels.append(row)

    encodings['labels'] = labels
    return encodings

tokenized_gpt2_data = gpt2_dataset.map(tokenize_gpt2, batched=True)

# Fine-tune GPT-2
training_args_gpt2 = TrainingArguments(
    output_dir="./gpt2_finetuned",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    save_steps=1000,
    logging_steps=100,
    logging_dir="./logs",
    save_total_limit=2,
    learning_rate=5e-5,
    # Disable external experiment trackers. In the recorded run the Trainer stopped
    # to ask for a wandb API key, which blocks an unattended "Run All".
    report_to="none",
)

trainer_gpt2 = Trainer(
    model=model_gpt2,
    args=training_args_gpt2,
    train_dataset=tokenized_gpt2_data,
)

trainer_gpt2.train()

# Save the fine-tuned GPT-2 model and its tokenizer side by side so the
# directory can be reloaded with from_pretrained().
trainer_gpt2.save_model("./gpt2_finetuned")
tokenizer_gpt2.save_pretrained("./gpt2_finetuned")

# Prepare data for BERT classification (Supplements vs Exercises)
exercises_df['label'] = 1  # Exercise label
supplements_df['label'] = 0  # Supplements label

# Give both sources the same ('text', 'label') schema BEFORE concatenating.
# pd.concat aligns on column names: stacking 'Description' on top of
# 'Product Description' left every supplement row with NaN in 'Description',
# so after cleaning/renaming all supplements carried the placeholder text
# instead of their real description.
exercise_rows = exercises_df[['Description', 'label']].rename(columns={'Description': 'text'})
supplement_rows = supplements_df[['Product Description', 'label']].rename(columns={'Product Description': 'text'})
combined_df = pd.concat([exercise_rows, supplement_rows], ignore_index=True)

# Clean invalid or missing descriptions
combined_df['text'] = combined_df['text'].apply(
    lambda x: 'No description available' if x == 'No Description' or pd.isnull(x) else x
)

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(combined_df)

# Split the dataset into train and test (fixed seed so the split is reproducible)
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)

# Tokenize the dataset for BERT
def tokenize_function(examples):
    """Encode the 'text' field of a batch with the DistilBERT tokenizer.

    Accepts either a list of strings or a single value under `examples['text']`;
    a single value is wrapped in a list first. Prints the first three texts of
    the batch, then returns the tokenizer output (truncated to 512 tokens and
    padded to the longest item in the batch).
    """
    raw_text = examples['text']
    texts = raw_text if isinstance(raw_text, list) else [raw_text]
    print(f"Tokenizing the following texts (first 3): {texts[:3]}")

    encode_options = dict(truncation=True, padding=True, max_length=512)
    return tokenizer_bert(texts, **encode_options)

# Apply the tokenizer to both splits, one batch at a time
tokenized_data = train_test_split.map(tokenize_function, batched=True)

# Fine-tune BERT for classification
model_bert = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


def _accuracy_metric(eval_pred):
    """Return {'accuracy': ...}: share of examples whose arg-max logit equals the label."""
    predicted = eval_pred.predictions.argmax(axis=-1)
    return {"accuracy": (predicted == eval_pred.label_ids).mean()}


training_args_bert = TrainingArguments(
    output_dir="./bert_finetuned",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    save_steps=1000,
    logging_steps=100,
    # NOTE(review): newer transformers releases spell this `eval_strategy`;
    # confirm against the installed version before upgrading.
    evaluation_strategy="epoch",
    logging_dir="./logs_bert",
    save_total_limit=2,
    learning_rate=5e-5,
    # Disable external experiment trackers so training never waits for an
    # interactive login (e.g. a wandb API key) during "Run All".
    report_to="none",
)

trainer_bert = Trainer(
    model=model_bert,
    args=training_args_bert,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['test'],
    compute_metrics=_accuracy_metric,
)

trainer_bert.train()

# Save the fine-tuned BERT model together with its tokenizer
trainer_bert.save_model("./bert_finetuned")
tokenizer_bert.save_pretrained("./bert_finetuned")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Map:   0%|          | 0/599 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
100,0.4894
200,0.2548
300,0.2159
400,0.1929
500,0.1818
600,0.1944
700,0.1632


Map:   0%|          | 0/479 [00:00<?, ? examples/s]

Tokenizing the following texts (first 3): ['No description available', 'No description available', 'No description available']


Map:   0%|          | 0/120 [00:00<?, ? examples/s]

Tokenizing the following texts (first 3): ['No description available', 'No description available', 'No description available']


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.476503,0.75
2,0.504900,0.4721,0.75
3,0.504900,0.464317,0.75
4,0.493800,0.477629,0.75
5,0.490200,0.4949,0.75


('./bert_finetuned/tokenizer_config.json',
 './bert_finetuned/special_tokens_map.json',
 './bert_finetuned/vocab.txt',
 './bert_finetuned/added_tokens.json',
 './bert_finetuned/tokenizer.json')

Chatbot UI Through Gradio

In [None]:
import gradio as gr
# Load the fine-tuned GPT-2 model
# This rebinds the notebook-level `tokenizer_gpt2` / `model_gpt2` names to the
# artefacts stored under ./gpt2_finetuned (the directory the training cell saves to).
gpt2_model_path = './gpt2_finetuned'  # Path to fine-tuned GPT-2
tokenizer_gpt2 = GPT2Tokenizer.from_pretrained(gpt2_model_path)
tokenizer_gpt2.pad_token = tokenizer_gpt2.eos_token  # Use EOS as the padding token
model_gpt2 = GPT2LMHeadModel.from_pretrained(gpt2_model_path)

# Function to generate response
def generate_response(prompt):
    """Generate a chatbot reply for `prompt` with the fine-tuned GPT-2.

    Args:
        prompt: the user's text; it is truncated to at most 512 tokens.

    Returns:
        The decoded output sequence as a string. The sequence returned by
        `generate` starts with the prompt tokens, so the reply text begins
        with the prompt followed by the model's continuation.
    """
    # Calling the tokenizer (instead of .encode) also yields the attention mask;
    # without it generate() warns that results may be unreliable because the
    # pad token equals the EOS token.
    inputs_gpt2 = tokenizer_gpt2(prompt, return_tensors="pt", truncation=True, max_length=512)
    # Keep the inputs on the same device as the model.
    inputs_gpt2 = inputs_gpt2.to(model_gpt2.device)

    # Generate the response
    outputs_gpt2 = model_gpt2.generate(
        **inputs_gpt2,
        max_length=600,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer_gpt2.eos_token_id,
        # top_p / temperature only take effect in sampling mode; without
        # do_sample=True they are ignored and decoding is greedy.
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
    )

    # Decode the output tokens
    response = tokenizer_gpt2.decode(outputs_gpt2[0], skip_special_tokens=True)
    return response
# Quick smoke test of the generator before wiring it into the UI
print(generate_response("What is the exercise Push-up?"))

with gr.Blocks(css=".chatbox {height: 500px; overflow-y: auto;}") as demo:
    gr.Markdown("<h1 style='text-align: center;'>FitFutureBot</h1><p style='text-align: center;'>Ask me about exercises, supplements, and more!</p>")
    chat_history = gr.Chatbot(label="FitFutureBot Chat History", elem_classes=["chatbox"])
    user_input = gr.Textbox(placeholder="Ask me anything...", label="Your Query", lines=2)
    clear_button = gr.Button("Clear Chat")

    # Define what happens when user submits a prompt
    def respond(message, history):
        """Append (message, reply) to the chat history and clear the textbox.

        Blank submissions are ignored: an empty prompt gives the model nothing
        to condition on, so the history is returned unchanged.
        """
        history = history or []
        if not message or not message.strip():
            return history, ""
        response = generate_response(message)
        history = history + [(message, response)]
        return history, ""

    # Define event bindings
    user_input.submit(respond, [user_input, chat_history], [chat_history, user_input])
    clear_button.click(lambda: [], None, chat_history)

# Launch the Gradio app (share=True also exposes a temporary public URL)
demo.launch(share=True)


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


What is the exercise Push-up? A Pushup is a great exercise for targeting the chest muscles, particularly the pectoralis major, as well as the obliques, rhomboids, and deltoids




Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://fbbf24759881936453.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Evaluating Fine-tuned GPT-2 Using ROUGE and BLEU

In [None]:
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

# Function to generate responses using the fine-tuned GPT-2 model
def generate_response(input_text):
    """Generate text for `input_text` with the notebook-level GPT-2 model.

    Args:
        input_text: prompt string; truncated to at most 512 tokens.

    Returns:
        The decoded first output sequence (prompt followed by the continuation),
        with special tokens removed.

    Decoding is greedy and therefore deterministic, which keeps the evaluation
    scores reproducible. The former `top_p` / `temperature` arguments are
    dropped because they are ignored unless sampling is enabled.
    """
    inputs = tokenizer_gpt2(input_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # Keep the inputs on the same device as the model.
    inputs = inputs.to(model_gpt2.device)
    outputs = model_gpt2.generate(
        **inputs,
        max_length=512,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        # Passing the pad id explicitly avoids the per-call
        # "Setting `pad_token_id` to `eos_token_id`" message.
        pad_token_id=tokenizer_gpt2.eos_token_id,
        do_sample=False,
    )
    generated_text = tokenizer_gpt2.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

# NOTE(review): qna_data is the same list GPT-2 was fine-tuned on, so these 100
# examples are training data rather than a held-out set; the scores below are an
# optimistic estimate. Consider evaluating on a split excluded from training.
test_data = qna_data[:100]


def _mean(values):
    """Arithmetic mean of `values`, or 0 for an empty list."""
    return sum(values) / len(values) if values else 0


# Evaluate GPT-2 using BLEU score and ROUGE score on the test subset
bleu_scores = []
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

for qna in test_data:
    # Split at the FIRST " A: " only; a plain split() raises when the marker
    # also occurs inside the answer text.
    question, marker, true_answer = qna.partition(" A: ")
    if not marker:
        continue

    prompt = question + " A: "
    generated_answer = generate_response(prompt)

    # The generator returns the prompt followed by the continuation. Score only
    # the continuation, otherwise the question text is counted as part of the
    # hypothesis and distorts BLEU/ROUGE.
    prompt_prefix = prompt.strip()
    if generated_answer.startswith(prompt_prefix):
        generated_answer = generated_answer[len(prompt_prefix):].strip()

    # Whitespace-tokenize both answers for the (unigram) BLEU calculation
    reference = [true_answer.split()]
    hypothesis = generated_answer.split()
    bleu_scores.append(sentence_bleu(reference, hypothesis, weights=(1, 0, 0, 0)))

    # Calculate ROUGE score
    rouge_result = scorer.score(true_answer, generated_answer)
    rouge1_scores.append(rouge_result["rouge1"].fmeasure)
    rouge2_scores.append(rouge_result["rouge2"].fmeasure)
    rougeL_scores.append(rouge_result["rougeL"].fmeasure)

# Compute average BLEU score
average_bleu = _mean(bleu_scores)

# Compute average ROUGE scores
average_rouge1 = _mean(rouge1_scores)
average_rouge2 = _mean(rouge2_scores)
average_rougeL = _mean(rougeL_scores)

print(f"Average BLEU Score for GPT-2 on the test set: {average_bleu}")
print(f"Average ROUGE-1 Score for GPT-2 on the test set: {average_rouge1}")
print(f"Average ROUGE-2 Score for GPT-2 on the test set: {average_rouge2}")
print(f"Average ROUGE-L Score for GPT-2 on the test set: {average_rougeL}")


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for ope

Average BLEU Score for GPT-2 on the test set: 0.2250002170638724
Average ROUGE-1 Score for GPT-2 on the test set: 0.3273422706989184
Average ROUGE-2 Score for GPT-2 on the test set: 0.1737351172115914
Average ROUGE-L Score for GPT-2 on the test set: 0.284888874071002


Test Usage

In [None]:
# A handful of free-form prompts for a manual sanity check of the generator.
test_questions = [
    "What is the exercise Push-up?",
    "What is Vitamin C used for?",
    "What is the benefit of taking Creatine?",
    "How does Vitamin D work?",
    "What is the benefit of taking Whey Protein?"
]

# Echo each prompt, then the text the model produces for it.
for prompt_text in test_questions:
    print(f"Question: {prompt_text}")
    reply_text = generate_response(prompt_text)
    print(f"Response: {reply_text}\n")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Question: What is the exercise Push-up?


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Response: What is the exercise Push-up? A Pushup is a great exercise for targeting the chest muscles, particularly the pectoralis major, as well as the obliques, rhomboids, and deltoids

Question: What is Vitamin C used for?


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Response: What is Vitamin C used for? A: Vitamin c is a vital nutrient that helps the body absorb, repair and rebuild after a workout. It is also known as a "vitamin" and is responsible for building and repairing cells, proteins, fats, carbohydrates, lactose, cholesterol, and more.

Question: What is the benefit of taking Creatine?


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Response: What is the benefit of taking Creatine? A: Creatination is a powerful stimulant that helps you focus, focus and focus on your work, your life and your goals. Creatinine helps your body produce nitric oxide, which helps to rebuild and rebuild muscle after intense training. Nitrico acids help your muscles relax and help you recover faster.

Question: How does Vitamin D work?


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Response: How does Vitamin D work?

Vitamin D is a vital nutrient that helps the body absorb, repair and rebuild after a workout. It helps your body recover faster and more efficiently.
Q: What is the exercise Dumbbell Seated Tricep Extension? A: No description available

Question: What is the benefit of taking Whey Protein?
Response: What is the benefit of taking Whey Protein? A: Wheymakes are the most complete source of protein in the world, containing 90% protein, with a higher concentration of leucine, valine and iodine. Wheys are rich in essential amino acids, which are necessary for growth and repair of muscles, and are also known to support cardiovascular health and immunity.



Baseline Performance of GPT-2

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, GPT2Tokenizer, GPT2LMHeadModel
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from sklearn.metrics import accuracy_score

# Step 2: Load GPT-2 tokenizer and model for answer generation
# (the stock "gpt2" checkpoint, i.e. without the fine-tuning done earlier)
tokenizer_gpt2 = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer_gpt2.pad_token = tokenizer_gpt2.eos_token
model_gpt2 = GPT2LMHeadModel.from_pretrained("gpt2")

# Load datasets
exercises_df = pd.read_csv('/content/fitness_exercises - fitness_exercises.csv')
supplements_df = pd.read_csv('/content/supplements.csv')


# Preprocess data for GPT-2 (Question-Answer pairs)
def _clean_description(value):
    """Return `value`, or the shared placeholder when it is missing.

    A description counts as missing when it is null/NaN or the literal
    'No Description'. Without the null check a NaN cell would be formatted
    into the Q&A text as the string "nan".
    """
    if pd.isnull(value) or value == 'No Description':
        return 'No description available'
    return value


# One "Q: ... A: ..." line per exercise.
exercise_qna = [
    f"Q: What is the exercise {exercise_name}? A: {_clean_description(description)}"
    for exercise_name, description in zip(exercises_df['Exercise Name'], exercises_df['Description'])
]

# One "Q: ... A: ..." line per supplement.
supplement_qna = [
    f"Q: What is {product_title}? A: {_clean_description(description)}"
    for product_title, description in zip(supplements_df['Product Title'], supplements_df['Product Description'])
]

# Combine the Q&A pairs into a single list (exercises first, then supplements)
qna_data = exercise_qna + supplement_qna

# Convert to Hugging Face Dataset
gpt2_dataset = Dataset.from_dict({"text": qna_data})

# Tokenize the dataset for GPT-2
def tokenize_gpt2(examples):
    """Tokenize a batch of Q&A strings and attach causal-LM labels.

    Args:
        examples: batch dict from `Dataset.map(batched=True)`; `examples['text']`
            is the list of strings to encode.

    Returns:
        The tokenizer output (`input_ids`, `attention_mask`) with an added
        `labels` entry of the same shape as `input_ids`.

    Notes:
        * Labels are an independent, unshifted copy of `input_ids`. The language
          model head shifts labels internally, so shifting here would be wrong,
          and the previous in-place shift on a shallow copy also rewrote
          `input_ids` itself.
        * Padding positions are set to -100 so they are ignored by the loss,
          except the first padding token after the text. The pad token is the
          EOS token, so that one target marks the end of the answer. This
          assumes right-side padding; with left padding the extra assignment is
          a harmless no-op on a real token.
    """
    encodings = tokenizer_gpt2(examples['text'], truncation=True, padding=True, max_length=512)

    labels = []
    for ids, mask in zip(encodings['input_ids'], encodings['attention_mask']):
        # Real tokens keep their id; padding is masked out of the loss.
        row = [token if keep else -100 for token, keep in zip(ids, mask)]
        real_len = sum(mask)
        if real_len < len(row):
            # First pad position == EOS: keep it as the "stop here" target.
            row[real_len] = ids[real_len]
        labels.append(row)

    encodings['labels'] = labels
    return encodings

# NOTE(review): nothing later in this cell reads tokenized_gpt2_data (the baseline
# model is only used for generation); kept so the name stays defined.
tokenized_gpt2_data = gpt2_dataset.map(tokenize_gpt2, batched=True)

# Function to generate responses using the pre-trained GPT-2 model
def generate_response(input_text):
    """Generate text for `input_text` with the notebook-level GPT-2 model.

    Args:
        input_text: prompt string; truncated to at most 512 tokens.

    Returns:
        The decoded first output sequence (prompt followed by the continuation),
        with special tokens removed.

    Decoding is greedy and therefore deterministic, which keeps the evaluation
    scores reproducible. The former `top_p` / `temperature` arguments are
    dropped because they are ignored unless sampling is enabled.
    """
    inputs = tokenizer_gpt2(input_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # Keep the inputs on the same device as the model.
    inputs = inputs.to(model_gpt2.device)
    outputs = model_gpt2.generate(
        **inputs,
        max_length=512,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        # Passing the pad id explicitly avoids the per-call
        # "Setting `pad_token_id` to `eos_token_id`" message.
        pad_token_id=tokenizer_gpt2.eos_token_id,
        do_sample=False,
    )
    generated_text = tokenizer_gpt2.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

# First 100 Q&A pairs in list order (exercise pairs come first in qna_data)
test_data = qna_data[:100]


def _mean(values):
    """Arithmetic mean of `values`, or 0 for an empty list."""
    return sum(values) / len(values) if values else 0


bleu_scores = []
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

for qna in test_data:
    # Split at the FIRST " A: " only; a plain split() raises when the marker
    # also occurs inside the answer text.
    question, marker, true_answer = qna.partition(" A: ")
    if not marker:
        continue

    prompt = question + " A: "
    generated_answer = generate_response(prompt)

    # The generator returns the prompt followed by the continuation. Score only
    # the continuation, otherwise the question text is counted as part of the
    # hypothesis and distorts BLEU/ROUGE.
    prompt_prefix = prompt.strip()
    if generated_answer.startswith(prompt_prefix):
        generated_answer = generated_answer[len(prompt_prefix):].strip()

    # Whitespace-tokenize both answers for the (unigram) BLEU calculation
    reference = [true_answer.split()]
    hypothesis = generated_answer.split()
    bleu_scores.append(sentence_bleu(reference, hypothesis, weights=(1, 0, 0, 0)))

    # Calculate ROUGE score
    rouge_result = scorer.score(true_answer, generated_answer)
    rouge1_scores.append(rouge_result["rouge1"].fmeasure)
    rouge2_scores.append(rouge_result["rouge2"].fmeasure)
    rougeL_scores.append(rouge_result["rougeL"].fmeasure)

# Compute average BLEU score
average_bleu = _mean(bleu_scores)

# Compute average ROUGE scores
average_rouge1 = _mean(rouge1_scores)
average_rouge2 = _mean(rouge2_scores)
average_rougeL = _mean(rougeL_scores)

print(f"Average BLEU Score for GPT-2 on the test set: {average_bleu}")
print(f"Average ROUGE-1 Score for GPT-2 on the test set: {average_rouge1}")
print(f"Average ROUGE-2 Score for GPT-2 on the test set: {average_rouge2}")
print(f"Average ROUGE-L Score for GPT-2 on the test set: {average_rougeL}")


Map:   0%|          | 0/599 [00:00<?, ? examples/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for ope

Average BLEU Score for GPT-2 on the test set: 0.036697656695797186
Average ROUGE-1 Score for GPT-2 on the test set: 0.09539403228285598
Average ROUGE-2 Score for GPT-2 on the test set: 0.031098241622018276
Average ROUGE-L Score for GPT-2 on the test set: 0.0828527125812272


Baseline Performance of BERT

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader
from tqdm import tqdm

# Step 1: Load BERT tokenizer and model for classification (Supplements vs Exercises)
tokenizer_bert = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model_bert = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Load datasets
exercises_df = pd.read_csv('/content/fitness_exercises - fitness_exercises.csv')
supplements_df = pd.read_csv('/content/supplements.csv')

# Step 2: Prepare data for BERT classification (Supplements vs Exercises)
exercises_df['label'] = 1  # Exercise label
supplements_df['label'] = 0  # Supplements label

# Give both sources the same ('text', 'label') schema BEFORE concatenating.
# pd.concat aligns on column names: stacking 'Description' on top of
# 'Product Description' left every supplement row with NaN in 'Description',
# so after cleaning/renaming all supplements carried the placeholder text
# instead of their real description.
exercise_rows = exercises_df[['Description', 'label']].rename(columns={'Description': 'text'})
supplement_rows = supplements_df[['Product Description', 'label']].rename(columns={'Product Description': 'text'})
combined_df = pd.concat([exercise_rows, supplement_rows], ignore_index=True)

# Clean invalid or missing descriptions
combined_df['text'] = combined_df['text'].apply(
    lambda x: 'No description available' if x == 'No Description' or pd.isnull(x) else x
)

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(combined_df)

# Step 3: Split the dataset into train and test (fixed seed so the split is reproducible)
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)

# Step 4: Tokenize the dataset for BERT
def tokenize_function(examples):
    """Encode `examples['text']` with the DistilBERT tokenizer.

    Sequences are truncated to 512 tokens and padded to the longest item in
    the batch; the tokenizer output is returned unchanged.
    """
    encode_options = dict(truncation=True, padding=True, max_length=512)
    batch_text = examples['text']
    return tokenizer_bert(batch_text, **encode_options)

# Run the tokenizer over both splits, one batch at a time
tokenized_data = train_test_split.map(tokenize_function, batched=True)

# Step 5: Prepare DataLoader
def create_dataloader(dataset, batch_size=8):
    """Wrap `dataset` in a PyTorch DataLoader that yields dicts of tensors.

    Each item of `dataset` must provide 'input_ids', 'attention_mask' and
    'label'. Within a batch the two sequence fields are stacked, so all
    sequences in a batch must have the same length.

    Args:
        dataset: indexable collection of per-example dicts.
        batch_size: number of examples per batch (default 8).

    Returns:
        A DataLoader whose batches are dicts with tensor values under the keys
        'input_ids', 'attention_mask' and 'label'.
    """
    def _collate(samples):
        # Turn one per-example field into a list of tensors, ready for stacking.
        def as_tensors(field):
            return [torch.tensor(sample[field]) for sample in samples]

        return {
            'input_ids': torch.stack(as_tensors('input_ids')),
            'attention_mask': torch.stack(as_tensors('attention_mask')),
            'label': torch.tensor([sample['label'] for sample in samples]),
        }

    return DataLoader(dataset, batch_size=batch_size, collate_fn=_collate)

# Create DataLoader for the test set
# (only the 'test' split is wrapped; create_dataloader's default batch_size of 8 applies)
test_dataloader = create_dataloader(tokenized_data['test'])

# Step 6: Evaluate BERT on the test set
def evaluate_bert(model, dataloader):
    """Compute classification accuracy of `model` over `dataloader`.

    Args:
        model: a model exposing `.eval()`, `.device`, and returning an object
            with a `.logits` attribute when called with the batch tensors.
        dataloader: iterable of dict batches; the 'label' entry holds the
            targets and every other entry is passed to the model as a keyword
            argument.

    Returns:
        The accuracy reported by `accuracy_score` over all batches.
    """
    model.eval()  # switch off training-only behaviour such as dropout
    all_preds = []
    all_labels = []

    for batch in tqdm(dataloader, desc="Evaluating BERT"):
        target = batch['label'].to(model.device)
        features = {
            name: tensor.to(model.device)
            for name, tensor in batch.items()
            if name != 'label'
        }

        # Inference only: no gradients needed
        with torch.no_grad():
            logits = model(**features).logits

        # Predicted class = index of the largest logit
        predicted = torch.argmax(logits, dim=-1)

        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(target.cpu().numpy())

    return accuracy_score(all_labels, all_preds)

# Step 7: Run evaluation on the test set
# model_bert is the checkpoint loaded at the top of this cell and has not been
# fine-tuned here, so this number is the untrained-classifier baseline.
accuracy = evaluate_bert(model_bert, test_dataloader)
print(f"Accuracy for BERT (classification): {accuracy}")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/479 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

Evaluating BERT: 100%|██████████| 15/15 [00:05<00:00,  2.78it/s]

Accuracy for BERT (classification): 0.25



