In [1]:
import random
import pandas as pd

# Nutritional ranges
calories_range = {
    "very_low": (10, 100),
    "low": (100, 250),
    "moderate": (250, 400),
    "high": (400, 600)
}

total_fat_range = {
    "very_low": (0, 5),
    "low": (5, 10),
    "moderate": (10, 20),
    "high": (20, 50)
}

carbohydrates_range = {
    "very_low": (5, 20),
    "low": (20, 40),
    "moderate": (40, 70),
    "high": (70, 120)
}

fiber_range = {
    "very_low": (0, 2),
    "low": (2, 5),
    "moderate": (5, 10),
    "high": (10, 15)
}

sugar_range = {
    "very_low": (0, 3),
    "low": (3, 10),
    "moderate": (10, 20),
    "high": (20, 50)
}

protein_range = {
    "very_low": (0, 3),
    "low": (3, 10),
    "moderate": (10, 20),
    "high": (20, 35)
}

# Disease conditions
disease_conditions = {
    "diabetes": {
        "sugar_sensitivity": "high",
        "carb_sensitivity": "high"
    },
    "hypertension": {
        "fat_sensitivity": "high"
    },
    "heart_disease": {
        "fat_sensitivity": "high"
    },
    "obesity": {
        "calorie_sensitivity": "high",
        "sugar_sensitivity": "high",
        "fat_sensitivity": "high"
    }
}

# Enhanced templates for questions
question_templates = [
    "Is this product nutritionally healthy for general consumption?",
    "Would this be suitable for someone looking to maintain a healthy lifestyle?",
    "Analyze the impact of this food item on overall health.",
    "How does this food compare to recommended dietary guidelines?",
    "Is this food item balanced for someone on a controlled diet?",
    "Could this food support long-term health goals?",
    "Should this be consumed regularly for maintaining a healthy diet?"
]

disease_question_templates = [
    "Is this safe for someone with {disease}?",
    "Can someone with {disease} include this in their diet?",
    "Would you recommend this product to someone managing {disease}?",
    "Evaluate the suitability of this product for people with {disease}.",
    "Is this food appropriate for a {disease} diet?",
    "Should someone with {disease} consume this regularly?"
]

# More varied disease management templates
disease_manageable_templates = {
    "diabetes": [
        "This product can be included in a diabetic diet in moderation. The {sugar}g sugar and {carbohydrates}g carbohydrates are manageable.",
        "The {sugar}g sugar and {carbohydrates}g carbohydrates content make this manageable for diabetics if portion sizes are controlled.",
        "With {sugar}g sugar and {carbohydrates}g carbs, this can be a controlled part of a diabetes meal plan."
    ],
    "hypertension": [
        "The {fat}g fat content is moderate and fits well into a blood pressure management plan.",
        "With {fat}g fat, this product can be included in a hypertension-friendly diet if portions are monitored.",
        "This product's {fat}g fat content is manageable for those with hypertension when eaten mindfully."
    ],
    "heart_disease": [
        "The {fat}g fat content is moderate enough for heart patients when consumed in moderation.",
        "With {fat}g fat, this can be part of a heart-healthy diet if portions are controlled.",
        "The {fat}g fat level makes this suitable for heart patients if not consumed in excess."
    ],
    "obesity": [
        "At {calories} calories and {sugar}g sugar, this product can fit into a weight management plan with portion control.",
        "The {calories} calorie content is within manageable limits for a weight-conscious diet, provided portion sizes are controlled.",
        "With {sugar}g sugar and {calories} calories, this could be incorporated into a weight management plan if portions are carefully controlled."
    ]
}

# More diverse healthy/unhealthy assessment templates
explanation_templates = {
    "healthy": [
        "This product offers a well-rounded nutritional profile with {sugar}g sugar, {fat}g fat, and {protein}g protein, making it a good choice for a balanced diet.",
        "Containing {sugar}g sugar, {fat}g fat, and {protein}g protein, this product supports a healthy and balanced lifestyle.",
        "With {sugar}g sugar, {fat}g fat, and {protein}g protein, this is a nutritious option that fits into a healthy meal plan."
    ],
    "unhealthy": [
        "This product’s {sugar}g sugar and {fat}g fat content raises concerns for a balanced diet.",
        "Caution is advised due to the high {sugar}g sugar and {fat}g fat levels, which make this an unsuitable choice for a healthy diet.",
        "The combination of {sugar}g sugar and {fat}g fat makes this product less ideal for someone aiming for a healthy lifestyle."
    ]
}

def generate_product(health_status="moderate", disease=None):
    # Generate nutritional parameters with more diversity (adding wider ranges for variety)
    nutrients = {
        'calories': random.randint(*calories_range[health_status]),
        'fat': random.randint(*total_fat_range[health_status]),  # Add more variation to fat range
        'carbohydrates': random.randint(*carbohydrates_range[health_status]),
        'fiber': random.randint(*fiber_range[health_status]),
        'sugar': random.randint(*sugar_range[health_status]),
        'protein': random.randint(*protein_range[health_status])
    }

    # Generate input text based on whether the question is disease-specific or not
    if disease:
        question = random.choice(disease_question_templates).format(disease=disease)
    else:
        question = random.choice(question_templates)

    input_text = (f"{question} The product contains {nutrients['calories']} calories, "
                  f"{nutrients['fat']}g of fat, {nutrients['carbohydrates']}g of carbohydrates, "
                  f"{nutrients['fiber']}g of fiber, {nutrients['sugar']}g of sugar, "
                  f"and {nutrients['protein']}g of protein.")

    # Determine output based on disease or general health
    if disease:
        sensitivity = disease_conditions[disease]
        is_problematic = False

        for condition, level in sensitivity.items():
            if condition == "sugar_sensitivity" and sugar_range[level][1] < nutrients['sugar']:
                is_problematic = True
            elif condition == "fat_sensitivity" and total_fat_range[level][1] < nutrients['fat']:
                is_problematic = True
            elif condition == "calorie_sensitivity" and calories_range[level][1] < nutrients['calories']:
                is_problematic = True
            if is_problematic:
                output = random.choice(disease_problematic_templates[disease]).format(**nutrients)
                break
        else:
            output = random.choice(disease_manageable_templates[disease]).format(**nutrients)
    else:
        # Use a broader range of general health conditions
        health_status = "healthy" if random.random() < 0.7 else "unhealthy"
        output = random.choice(explanation_templates[health_status]).format(**nutrients)

    return input_text, output

# Generate a more balanced dataset
num_samples = 500000
data = []

for _ in range(num_samples):
    if random.random() < 0.6:  # 60% general health
        health_status = random.choice(["very_low", "low", "moderate", "high"])
        input_text, output_text = generate_product(health_status)
    else:  # 40% disease-specific
        disease = random.choice(list(disease_conditions.keys()))
        input_text, output_text = generate_product(disease=disease)

    data.append({"input": input_text, "output": output_text})

# Convert to DataFrame and save
df = pd.DataFrame(data)
df.to_csv("balanced_nutrition_finetuning_data.csv", index=False)

# Display statistics
print(f"Total samples generated: {len(df)}")
print("\nSample rows:")
print(df.head())
print("\nOutput distribution:")
print(df['output'].value_counts().head(50))


Total samples generated: 500000

Sample rows:
                                               input  \
0  Analyze the impact of this food item on overal...   
1  Could this food support long-term health goals...   
2  Is this safe for someone with diabetes? The pr...   
3  Analyze the impact of this food item on overal...   
4  Should this be consumed regularly for maintain...   

                                              output  
0  Caution is advised due to the high 14g sugar a...  
1  This product offers a well-rounded nutritional...  
2  The 19g sugar and 42g carbohydrates content ma...  
3  This product’s 0g sugar and 4g fat content rai...  
4  This product offers a well-rounded nutritional...  

Output distribution:
output
With 17g fat, this product can be included in a hypertension-friendly diet if portions are monitored.    1593
The 16g fat content is moderate enough for heart patients when consumed in moderation.                   1586
The 14g fat content is moderate enough

In [None]:
# !pip install tensorflow transformers datasets
# !pip install tensorflow-text


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [6]:
import pandas as pd
from datasets import Dataset
from transformers import T5Tokenizer
import random

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv("comprehensive_nutrition_dataset.csv")

# Initialize the T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")

def clean_input(text):
    """
    Clean the input text by removing the question template and keeping only the nutritional information.
    Returns both the cleaned input and the original question.
    """
    # Split at the first occurrence of "The product contains"
    parts = text.split("The product contains", 1)

    if len(parts) == 2:
        question = parts[0].strip()
        nutrition_info = "The product contains" + parts[1].strip()
        return question, nutrition_info
    return text, ""

def preprocess_data(examples):
    # Separate questions and nutritional information
    cleaned_inputs = [clean_input(text) for text in examples["input"]]
    questions = [item[0] for item in cleaned_inputs]
    nutrition_info = [item[1] for item in cleaned_inputs]

    # Create formatted inputs with various question templates
    formatted_inputs = []
    for q, info in zip(questions, nutrition_info):
        # Check if it's a disease-specific question
        if any(disease in q.lower() for disease in ["diabetes", "hypertension", "heart disease", "obesity"]):
            formatted_input = f"nutrition_analysis: {q} {info}"
        else:
            formatted_input = f"health_assessment: {info}"
        formatted_inputs.append(formatted_input)

    # Tokenize the inputs and outputs
    model_inputs = tokenizer(
        formatted_inputs,
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )

    # Tokenize the outputs
    labels = tokenizer(
        examples["output"],
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )

    # Set the labels
    model_inputs["labels"] = labels["input_ids"]

    # Add additional features that might be useful for training
    model_inputs["question_type"] = ["disease" if any(disease in q.lower() for disease in ["diabetes", "hypertension", "heart disease", "obesity"]) else "general" for q in questions]

    return model_inputs

# Convert the pandas DataFrame to Hugging Face's Dataset format
dataset = Dataset.from_pandas(df)

# Apply the preprocessing function to the dataset
processed_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset.column_names)

# Split the dataset into training and validation sets (80-20 split)
split_dataset = processed_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
val_dataset = split_dataset["test"]

# Print some statistics about the dataset
print("\nDataset Statistics:")
print(f"Total examples: {len(dataset)}")
print(f"Training examples: {len(train_dataset)}")
print(f"Validation examples: {len(val_dataset)}")

# Display a sample of processed data
print("\nSample Processed Input:")
sample_idx = random.randint(0, len(train_dataset)-1)
sample = train_dataset[sample_idx]
decoded_input = tokenizer.decode(sample["input_ids"], skip_special_tokens=True)
decoded_output = tokenizer.decode(sample["labels"], skip_special_tokens=True)
print(f"Input: {decoded_input}")
print(f"Output: {decoded_output}")

ImportError: 
T5Tokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


In [4]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

# Load the pretrained T5 model
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",          # Output directory
    evaluation_strategy="epoch",     # Evaluate after each epoch
    learning_rate=2e-5,              # Learning rate
    per_device_train_batch_size=8,   # Batch size for training
    per_device_eval_batch_size=8,    # Batch size for evaluation
    num_train_epochs=5,              # Number of training epochs
    weight_decay=0.01,               # Strength of weight decay
    logging_dir="./logs",            # Directory for storing logs
    logging_steps=100,               # Log every 100 steps
)

# Initialize the Trainer
trainer = Trainer(
    model=model,                         # Pretrained model
    args=training_args,                  # Training arguments
    train_dataset=train_dataset,         # Training data
    eval_dataset=val_dataset,            # Validation data
    tokenizer=tokenizer                  # Tokenizer
)

# Fine-tune the model
trainer.train()


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.0032,0.002473
2,0.0022,0.00193
3,0.0021,0.001892
4,0.002,0.001868
5,0.0021,0.001864


TrainOutput(global_step=30000, training_loss=0.014576254208385944, metrics={'train_runtime': 5852.0579, 'train_samples_per_second': 41.011, 'train_steps_per_second': 5.126, 'total_flos': 3.248203235328e+16, 'train_loss': 0.014576254208385944, 'epoch': 5.0})

In [5]:
# Evaluate the model on the validation dataset
trainer.evaluate()


{'eval_loss': 0.0018639052286744118,
 'eval_runtime': 84.0015,
 'eval_samples_per_second': 142.855,
 'eval_steps_per_second': 17.857,
 'epoch': 5.0}

In [6]:
# Save the model and tokenizer
model.save_pretrained("./fine_tuned_t5_model")
tokenizer.save_pretrained("./fine_tuned_t5_tokenizer")


('./fine_tuned_t5_tokenizer/tokenizer_config.json',
 './fine_tuned_t5_tokenizer/special_tokens_map.json',
 './fine_tuned_t5_tokenizer/spiece.model',
 './fine_tuned_t5_tokenizer/added_tokens.json')

In [None]:
from transformers import T5ForConditionalGeneration, pipeline, T5Tokenizer

# Load the fine-tuned model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("./fine_tuned_t5_model")
tokenizer = T5Tokenizer.from_pretrained("./fine_tuned_t5_tokenizer")

# Initialize a text generation pipeline
qa_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

# Example input (nutritional fact)
input_text = "Is this safe for someone with diabetes? The product contains 320 calories, 12g of fat, 55g of carbohydrates, 3g of fiber, 12g of sugar, and 10g of protein."

# Get the model's prediction
output = qa_pipeline(input_text)
print(output[0]["generated_text"])


Device set to use cpu


No, this appears to be manageable for someone with diabetes.
