In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [3]:
# Import necessary libraries
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset

In [4]:
df = pd.read_csv('/content/drive/MyDrive/P2/LLM/LLL-Advice/STE/br_health.csv')

In [5]:
# Define the task format for T5
def format_for_t5(row):
    input_text = f"Compare: {row['cause']} | Response: {row['bad_response']}"
    target_text = row['missing_point']
    return input_text, target_text

# Preprocess the dataset
data = df.apply(format_for_t5, axis=1)
inputs, targets = zip(*data)

# Split into training and testing sets
train_inputs, test_inputs, train_targets, test_targets = train_test_split(inputs, targets, test_size=0.2, random_state=42)


In [6]:
# Define a custom dataset for PyTorch
class T5Dataset(Dataset):
    def __init__(self, tokenizer, inputs, targets, max_length=512):
        self.inputs = inputs
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        input_text = self.inputs[index]
        target_text = self.targets[index]

        # Tokenize inputs and targets
        input_enc = self.tokenizer(input_text, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt")
        target_enc = self.tokenizer(target_text, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt")

        return {
            "input_ids": input_enc["input_ids"].squeeze(),
            "attention_mask": input_enc["attention_mask"].squeeze(),
            "labels": target_enc["input_ids"].squeeze()
        }


In [7]:
# Initialize T5 tokenizer and model
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Create datasets
train_dataset = T5Dataset(tokenizer, train_inputs, train_targets)
test_dataset = T5Dataset(tokenizer, test_inputs, test_targets)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [8]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=6,
    weight_decay=0.01,
    save_total_limit=2,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    report_to="none"
)

# Define a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)

# Fine-tune the model
trainer.train()

  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.1178,0.096317
2,0.0857,0.065035
3,0.069,0.057396
4,0.0643,0.053627
5,0.0679,0.051852
6,0.0553,0.051274


TrainOutput(global_step=1242, training_loss=0.2770177799122537, metrics={'train_runtime': 262.2243, 'train_samples_per_second': 37.868, 'train_steps_per_second': 4.736, 'total_flos': 1343944088616960.0, 'train_loss': 0.2770177799122537, 'epoch': 6.0})

In [9]:
# Test the model on a new example
def evaluate(input_cause, input_response):
    input_text = f"Compare: {input_cause} | Response: {input_response}"
    input_enc = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to("cuda")
    output = model.generate(input_enc["input_ids"], max_length=50, num_beams=5, early_stopping=True)
    result = tokenizer.decode(output[0], skip_special_tokens=True)
    return result

# Example usage
cause = "Malaria can occur if a mosquito infected with malaria parasites bites you."
response = "Malaria occurs when a mosquito bites and transmits disease."
print("Missing Points:", evaluate(cause, response))

Missing Points: Omits other causes like bites or transmission of disease.


In [10]:
# Step 6: Save the Fine-Tuned Model
model.save_pretrained("/content/drive/MyDrive/P2/LLM/LLL-Advice/fine_tuned/fine_tuned_ste")
tokenizer.save_pretrained("/content/drive/MyDrive/P2/LLM/LLL-Advice/fine_tuned/fine_tuned_ste")

('/content/drive/MyDrive/P2/LLM/LLL-Advice/fine_tuned/fine_tuned_ste/tokenizer_config.json',
 '/content/drive/MyDrive/P2/LLM/LLL-Advice/fine_tuned/fine_tuned_ste/special_tokens_map.json',
 '/content/drive/MyDrive/P2/LLM/LLL-Advice/fine_tuned/fine_tuned_ste/spiece.model',
 '/content/drive/MyDrive/P2/LLM/LLL-Advice/fine_tuned/fine_tuned_ste/added_tokens.json')

# Evaluation on LLM's response

In [11]:
# Load the CSV file with LLM responses
llm_responses_path = '/content/drive/MyDrive/P2/LLM/LLL-Advice/Llama_advice.csv'
llm_df = pd.read_csv(llm_responses_path)

# Function to evaluate each response
def evaluate_llm_responses(df, model, tokenizer):
    evaluated_responses = []

    for _, row in df.iterrows():
        cause = "Malaria can occur if a mosquito infected with malaria parasites bites you."  # Example cause, replace with specific disease causes
        response = row['response_cause']
        input_text = f"Compare: {cause} | Response: {response}"
        input_enc = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).to("cuda")
        output = model.generate(input_enc["input_ids"], max_length=50, num_beams=5, early_stopping=True)
        result = tokenizer.decode(output[0], skip_special_tokens=True)
        evaluated_responses.append(result)

    df['missing_points'] = evaluated_responses
    return df

# Evaluate the responses
evaluated_llm_df = evaluate_llm_responses(llm_df, model, tokenizer)

# Save the results to a new CSV
output_path = '/content/drive/MyDrive/P2/LLM/LLL-Advice/STE_marks.csv'
evaluated_llm_df.to_csv(output_path, index=False)

print(f"Evaluation complete. Results saved to {output_path}")


Evaluation complete. Results saved to /content/drive/MyDrive/P2/LLM/LLL-Advice/STE_marks.csv
