CoS-E dataset

Start

In [None]:
from datasets import load_dataset

# Load the CoS-E dataset
dataset = load_dataset('cos_e', 'v1.11')

In [None]:
dataset['train']

In [None]:
# Display an example
# Index the row first: dataset['train'][0] reads a single record, whereas
# dataset['train']['question'][0] loads the entire column just to take its head.
example = dataset['train'][0]
print(f"Question: {example['question']}")
print(example['choices'])
print(f"Answer: {example['answer']}")
print(f"Explanation: {example['abstractive_explanation']}")

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm

In [None]:
# Load the CoS-E dataset
dataset = load_dataset('cos_e', 'v1.11')

In [None]:
# Load the pretrained checkpoint together with its tokenizer
model_name = "t5-base"  # Use "t5-base" or "t5-large" if resources allow
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Run on the GPU when one is visible, otherwise stay on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

In [None]:
examples=dataset['validation']

In [None]:
for q, choices in zip(examples['question'], examples['choices']):
    print(q)
    print(choices)
    break

In [None]:
x=["question: " + q + " answer: " + " ".join(choices) for q, choices in zip(examples['question'], examples['choices'])]

In [None]:
examples['answer'][0]

In [None]:
examples['abstractive_explanation'][0]

In [None]:
x[0]

In [None]:
"answer: " + examples['answer'][0] + " explanation: " + examples['abstractive_explanation'][0]

In [None]:
y = ["answer: " + answer + " explanation: " + explanation for answer, explanation in zip(examples['answer'], examples['abstractive_explanation'])]

In [None]:
y[0]

In [None]:
# Preprocess the dataset
def preprocess_function(examples):
    """Tokenize a batch of CoS-E rows into model inputs and labels.

    Each source string is "question: <question> answer: <choices joined by
    spaces>" and each target string is "answer: <answer> explanation:
    <abstractive explanation>". Both sides are truncated and padded to 256
    tokens.

    Args:
        examples: A batch (dict of columns) providing 'question', 'choices',
            'answer' and 'abstractive_explanation'.

    Returns:
        The tokenizer output for the source strings, with the target token
        ids stored under 'labels'.
    """
    sources = []
    for question, options in zip(examples['question'], examples['choices']):
        sources.append("question: " + question + " answer: " + " ".join(options))

    references = []
    for answer, explanation in zip(examples['answer'], examples['abstractive_explanation']):
        references.append("answer: " + answer + " explanation: " + explanation)

    # Same tokenizer settings for both sides
    tokenizer_kwargs = dict(max_length=256, truncation=True, padding='max_length')
    encoded = tokenizer(sources, **tokenizer_kwargs)
    # NOTE: padded label positions keep the tokenizer's pad id here.
    encoded['labels'] = tokenizer(references, **tokenizer_kwargs)['input_ids']
    return encoded

In [None]:
encoded_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
encoded_dataset['validation']

In [None]:
encoded_dataset['validation']['labels'][0]

In [None]:
# Decode the whole label sequence into one string. batch_decode expects a batch
# (a list of sequences); handed a single sequence it decodes every token id on
# its own and returns a list of fragments, mostly empty strings for padding.
tokenizer.decode(encoded_dataset['validation'][0]['labels'], skip_special_tokens=True)

In [None]:
from torch.utils.data import DataLoader

# Expose only the model-facing columns, returned as PyTorch tensors
tensor_columns = ['input_ids', 'attention_mask', 'labels']
encoded_dataset.set_format(type='torch', columns=tensor_columns)

# Batch both splits; only the training split is shuffled
batch_size = 16
train_loader = DataLoader(dataset=encoded_dataset['train'], batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=encoded_dataset['validation'], batch_size=batch_size)

In [None]:
# # Create DataLoaders
# train_dataset = encoded_dataset['train']
# val_dataset = encoded_dataset['validation']
# train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=4)

In [None]:
# Initialize optimizer
# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are spelled out to keep the values transformers.AdamW
# used by default (1e-6 and 0.0); torch.optim.AdamW defaults to 1e-8 and 0.01.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

In [None]:
# Training loop
def train_loop(model, loader, optimizer, accumulation_steps=2):
    """Run one training epoch with gradient accumulation.

    Args:
        model: Seq2seq model whose forward pass returns an object with a
            `.loss` attribute when `labels` are supplied.
        loader: Sized iterable of batches, each a dict holding 'input_ids',
            'attention_mask' and 'labels' tensors.
        optimizer: Optimizer bound to the model's parameters.
        accumulation_steps: Number of mini-batches whose gradients are summed
            before each optimizer step.

    Returns:
        {'train_loss': mean of the per-batch losses}; the mean is NaN when the
        loader yields no batches.
    """
    model.train()
    batch_losses = []
    optimizer.zero_grad()

    # Labels are padded to a fixed length with the pad id. Left as-is, every
    # padded position counts towards the loss, so it is dominated by padding.
    # -100 is the index the loss ignores.
    pad_token_id = getattr(getattr(model, 'config', None), 'pad_token_id', None)

    for step, batch in enumerate(tqdm(loader, desc='Training:'), start=1):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        if pad_token_id is not None:
            labels = labels.masked_fill(labels == pad_token_id, -100)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        # Log the unscaled loss; back-propagate the scaled one so the summed
        # gradients average over the accumulation window.
        batch_losses.append(outputs.loss.item())
        (outputs.loss / accumulation_steps).backward()

        if step % accumulation_steps == 0:  # update weights every accumulation_steps mini-batches
            optimizer.step()
            optimizer.zero_grad()  # reset gradients

    # Update remaining gradients if the number of batches is not a multiple of accumulation_steps
    if len(loader) % accumulation_steps != 0:
        optimizer.step()
        optimizer.zero_grad()

    loss_value = sum(batch_losses) / len(batch_losses) if batch_losses else float('nan')
    return {'train_loss': loss_value}

# # Validation loop
# def validate_loop(model, loader):
#     model.eval()
#     batch_losses = []

#     with torch.no_grad():
#         for batch in tqdm(loader, desc='Validation:'):
#             inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
#             labels = batch['labels'].to(device)
#             outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], labels=labels)
#             loss = outputs.loss

#             batch_losses.append(loss.item())

#     loss_value = sum(batch_losses) / len(batch_losses)
#     return {'val_loss': loss_value}

***************

In [None]:
# def validate_loop(model, loader):
#     model.eval()
#     batch_losses = []

#     with torch.no_grad():
#         for batch in tqdm(loader, desc='Validation:'):
#             inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
#             labels = batch['labels'].to(device)
#             outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], labels=labels)
#             loss = outputs.loss

#             # Generate predictions
#             predictions = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=512)

#             # Decode predictions
#             decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
#             decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

#             # Print decoded predictions and labels for debugging
#             print(f"Decoded predictions: {decoded_preds}")
#             print(f"Decoded labels: {decoded_labels}")

#             batch_losses.append(loss.item())

#     loss_value = sum(batch_losses) / len(batch_losses)
#     return {'val_loss': loss_value}

Add accuracy validation

In [None]:
# import numpy as np
# from tqdm import tqdm
# from datasets import load_metric

# # Load metrics
# accuracy_metric = load_metric("accuracy")

# def validate_loop(model, loader):
#     model.eval()
#     batch_losses = []
#     accuracy_preds = []
#     accuracy_labels = []

#     with torch.no_grad():
#         for batch in tqdm(loader, desc='Validation:'):
#             inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
#             labels = batch['labels'].to(device)
#             outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], labels=labels)
#             loss = outputs.loss

#             # Generate predictions
#             predictions = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=512)

#             # Decode predictions and labels
#             decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
#             decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

#             # Print decoded predictions and labels for debugging
#             print(f"Decoded predictions: {decoded_preds}")
#             print(f"Decoded labels: {decoded_labels}")

#             # Extract the answers from decoded predictions and labels
#             extracted_preds = [pred.split('answer: ')[1].split(' ')[0] for pred in decoded_preds if 'answer: ' in pred]
#             extracted_labels = [label.split('answer: ')[1].split(' ')[0] for label in decoded_labels if 'answer: ' in label]
#             print(f"extracted_preds: {extracted_preds}")
#             print(f"extracted_labels:  {extracted_labels}")
#             # Ensure lengths match for accuracy calculation
#             if len(extracted_preds) == len(extracted_labels):
#                 accuracy_preds.extend(extracted_preds)
#                 accuracy_labels.extend(extracted_labels)

#             batch_losses.append(loss.item())

#     # Calculate accuracy
#     accuracy = accuracy_metric.compute(predictions=accuracy_preds, references=accuracy_labels)
#     loss_value = sum(batch_losses) / len(batch_losses)

#     return {'val_loss': loss_value, 'accuracy': accuracy['accuracy']}

Add accuracy, version 2 (custom accuracy metric)

In [None]:
def _extract_answer(text):
    """Return the answer span of an "answer: A explanation: E" string, or None if there is no answer marker."""
    _, marker, tail = text.partition('answer: ')
    if not marker:
        return None
    # Keep the full (possibly multi-word) answer, up to the explanation marker.
    return tail.partition('explanation:')[0].strip()


def validate_loop(model, loader):
    """Evaluate the model: mean loss and answer accuracy over `loader`.

    Predictions and labels are compared example by example. A prediction with
    no parsable answer counts as wrong; it neither shifts the pairing of the
    remaining examples nor causes the batch to be skipped. The complete answer
    string is compared, not only its first word.

    Args:
        model: Seq2seq model supporting a forward pass with `labels` and
            `.generate()`.
        loader: Iterable of batches, each a dict holding 'input_ids',
            'attention_mask' and 'labels' tensors.

    Returns:
        {'val_loss': mean per-batch loss (NaN if there were no batches),
         'accuracy': fraction of labelled examples answered exactly right
                     (0.0 if none were scored)}
    """
    model.eval()
    batch_losses = []
    correct = 0
    total = 0
    pad_token_id = tokenizer.pad_token_id

    with torch.no_grad():
        for batch in tqdm(loader, desc='Validation:'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Mask padding for the loss only; the unmasked labels are still
            # needed below because -100 cannot be decoded.
            loss_labels = labels
            if pad_token_id is not None:
                loss_labels = labels.masked_fill(labels == pad_token_id, -100)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=loss_labels)
            batch_losses.append(outputs.loss.item())

            # Generate predictions
            predictions = model.generate(input_ids, attention_mask=attention_mask, max_length=512)

            # Decode predictions and labels
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            for pred_text, label_text in zip(decoded_preds, decoded_labels):
                gold_answer = _extract_answer(label_text)
                if gold_answer is None:
                    continue  # no reference answer to score against
                total += 1
                if _extract_answer(pred_text) == gold_answer:
                    correct += 1

    accuracy = correct / total if total else 0.0
    loss_value = sum(batch_losses) / len(batch_losses) if batch_losses else float('nan')

    return {'val_loss': loss_value, 'accuracy': accuracy}

# Fine-tune for a fixed number of epochs, validating after each one
num_epochs = 3
for epoch in range(num_epochs):
    train_metrics = train_loop(model, train_loader, optimizer)
    val_metrics = validate_loop(model, val_loader)

    # One line per metric, emitted with a single print call
    print(
        f"Epoch {epoch + 1}/{num_epochs}",
        f"Train Loss: {train_metrics['train_loss']:.4f}",
        f"Validation Loss: {val_metrics['val_loss']:.4f}",
        f"Validation Accuracy: {val_metrics['accuracy']:.4f}",
        sep="\n",
    )

Add bert score for explanation

In [None]:
# import bert_score
# from datasets import load_metric

# # Load metrics
# accuracy_metric = load_metric("accuracy")

# # Training loop
# def train_loop(model, loader, optimizer, accumulation_steps=16):
#     model.train()
#     batch_losses = []
#     optimizer.zero_grad()

#     for i, batch in enumerate(tqdm(loader, desc='Training:')):
#         inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
#         labels = batch['labels'].to(device)

#         outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], labels=labels)
#         loss = outputs.loss / accumulation_steps  # normalize loss

#         batch_loss_value = loss.item() * accumulation_steps  # convert to original loss value for logging
#         loss.backward()

#         if (i + 1) % accumulation_steps == 0:  # update weights every accumulation_steps mini-batches
#             optimizer.step()
#             optimizer.zero_grad()  # reset gradients

#         batch_losses.append(batch_loss_value)

#     # Update remaining gradients if the number of batches is not a multiple of accumulation_steps
#     if len(loader) % accumulation_steps != 0:
#         optimizer.step()
#         optimizer.zero_grad()

#     loss_value = sum(batch_losses) / len(batch_losses)
#     return {'train_loss': loss_value}

# # Validation loop
# def validate_loop(model, loader):
#     model.eval()
#     batch_losses = []
#     accuracy_preds = []
#     accuracy_labels = []
#     explanations_preds = []
#     explanations_labels = []

#     with torch.no_grad():
#         for batch in tqdm(loader, desc='Validation:'):
#             inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
#             labels = batch['labels'].to(device)
#             outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], labels=labels)
#             loss = outputs.loss

#             # Generate predictions
#             predictions = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=512)

#             # Decode predictions and labels
#             decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
#             decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

#             # Print decoded predictions and labels for debugging
#             print("Decoded predictions:")
#             for pred in decoded_preds:
#                 print(f"'{pred}'")
#             print("Decoded labels:")
#             for label in decoded_labels:
#                 print(f"'{label}'")

#             # Extract the answers from decoded predictions and labels
#             # extracted_preds = [pred.strip().split('.')[0] for pred in decoded_preds if pred.strip()]
#             # extracted_labels = [label.strip().split('.')[0] for label in decoded_labels if label.strip()]

#             extracted_preds = [pred.split('answer: ')[1].split(' ')[0] for pred in decoded_preds if 'answer: ' in pred]
#             extracted_labels = [label.split('answer: ')[1].split(' ')[0] for label in decoded_labels if 'answer: ' in label]
            
#             # Ensure lengths match for accuracy calculation
#             if len(extracted_preds) == len(extracted_labels):
#                 accuracy_preds.extend(extracted_preds)
#                 accuracy_labels.extend(extracted_labels)

#             # Collect explanations for BERTScore
            
#             # explanations_preds.extend(decoded_preds)
#             # explanations_labels.extend(decoded_labels)
#             explanations_preds = [pred.split('explanation: ')[1].split(' ')[0] for pred in decoded_preds if 'explanation: ' in pred]
#             explanations_labels = [label.split('explanation: ')[1].split(' ')[0] for label in decoded_labels if 'explanation: ' in label]

#             batch_losses.append(loss.item())

#     # Calculate accuracy
#     correct = sum(p == l for p, l in zip(accuracy_preds, accuracy_labels))
#     accuracy = correct / len(accuracy_preds) if accuracy_preds else 0.0
#     loss_value = sum(batch_losses) / len(batch_losses)

#     # Calculate BERTScore
#     P, R, F1 = bert_score.score(explanations_preds, explanations_labels, lang="en", rescale_with_baseline=True)
#     bertscore_avg = F1.mean().item()

#     return {'val_loss': loss_value, 'accuracy': accuracy, 'bertscore': bertscore_avg}

# # Training and validation
# num_epochs = 3
# for epoch in range(num_epochs):
#     train_metrics = train_loop(model, train_loader, optimizer)
#     val_metrics = validate_loop(model, val_loader)

#     print(f"Epoch {epoch + 1}/{num_epochs}")
#     print(f"Train Loss: {train_metrics['train_loss']:.4f}")
#     print(f"Validation Loss: {val_metrics['val_loss']:.4f}")
#     print(f"Validation Accuracy: {val_metrics['accuracy']:.4f}")
#     print(f"Validation BERTScore: {val_metrics['bertscore']:.4f}")

The code above has a problem: some decoded predictions are empty.

In [None]:

# # Validation loop with BERTScore for explanations and predictions
# def validate_loop(model, loader):
#     model.eval()
#     batch_losses = []
#     accuracy_preds = []
#     accuracy_labels = []
#     explanations_preds = []
#     explanations_labels = []

#     with torch.no_grad():
#         for batch in tqdm(loader, desc='Validation:'):
#             inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
#             labels = batch['labels'].to(device)
#             outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], labels=labels)
#             loss = outputs.loss

#             # Generate predictions
#             predictions = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=512)

#             # Decode predictions and labels
#             decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
#             decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

#             # Print decoded predictions and labels for debugging
#             print("Decoded predictions:")
#             for pred in decoded_preds:
#                 print(f"'{pred}'")
#             print("Decoded labels:")
#             for label in decoded_labels:
#                 print(f"'{label}'")

#             # Extract the answers from decoded predictions and labels
#             extracted_preds = [pred.split('answer: ')[1].split(' ')[0] for pred in decoded_preds if 'answer: ' in pred]
#             extracted_labels = [label.split('answer: ')[1].split(' ')[0] for label in decoded_labels if 'answer: ' in label]

#             # Ensure lengths match for accuracy calculation
#             if len(extracted_preds) == len(extracted_labels):
#                 accuracy_preds.extend(extracted_preds)
#                 accuracy_labels.extend(extracted_labels)

#             # Collect explanations for BERTScore
#             # explanations_preds.extend(decoded_preds)
#             # explanations_labels.extend(decoded_labels)
#             explanations_preds = [pred.split('explanation: ')[1].split(' ')[0] for pred in decoded_preds if 'explanation: ' in pred]
#             explanations_labels = [label.split('explanation: ')[1].split(' ')[0] for label in decoded_labels if 'explanation: ' in label]

#             batch_losses.append(loss.item())

#     # Calculate BERTScore for answers (accuracy_preds and accuracy_labels)
#     P_ans, R_ans, F1_ans = bert_score.score(accuracy_preds, accuracy_labels, lang="en", rescale_with_baseline=True)
#     bertscore_ans_avg = F1_ans.mean().item()

#     # Calculate BERTScore for explanations (explanations_preds and explanations_labels)
#     P_exp, R_exp, F1_exp = bert_score.score(explanations_preds, explanations_labels, lang="en", rescale_with_baseline=True)
#     bertscore_exp_avg = F1_exp.mean().item()

#     loss_value = sum(batch_losses) / len(batch_losses)

#     return {'val_loss': loss_value, 'bertscore_ans': bertscore_ans_avg, 'bertscore_exp': bertscore_exp_avg}

# # Training and validation
# num_epochs = 3
# for epoch in range(num_epochs):
#     train_metrics = train_loop(model, train_loader, optimizer)
#     val_metrics = validate_loop(model, val_loader)

#     print(f"Epoch {epoch + 1}/{num_epochs}")
#     print(f"Train Loss: {train_metrics['train_loss']:.4f}")
#     print(f"Validation Loss: {val_metrics['val_loss']:.4f}")
#     print(f"Validation BERTScore (Answers): {val_metrics['bertscore_ans']:.4f}")
#     print(f"Validation BERTScore (Explanations): {val_metrics['bertscore_exp']:.4f}")


There are empty sentences in the predictions.

In [None]:
def convert_to_sentences(list_of_lists):
    """Join each inner list of words into one space-separated sentence.

    Args:
        list_of_lists: Iterable of iterables of strings.

    Returns:
        A list with one sentence string per inner list.
    """
    return list(map(' '.join, list_of_lists))

# Example usage: three tokenised sentences, re-joined and printed
list_of_lists = [
    "The capital of France is Paris.".split(),
    "The quick brown fox jumps over the lazy dog.".split(),
    "Artificial intelligence is transforming the world.".split(),
]

sentences = convert_to_sentences(list_of_lists)
print(sentences)

In [None]:
import bert_score
from datasets import load_metric
def _split_answer_explanation(text):
    """Split a decoded "answer: A explanation: E" string into (A, E); a part that is missing is None."""
    _, has_answer, after_answer = text.partition('answer: ')
    answer = after_answer.partition('explanation:')[0].strip() if has_answer else None
    _, has_explanation, after_explanation = text.partition('explanation: ')
    explanation = after_explanation.strip() if has_explanation else None
    return answer, explanation


def validate_loop(model, loader):
    """Evaluate the model: mean loss, answer accuracy and explanation BERTScore.

    Predictions and labels are paired example by example and collected across
    all batches, so candidate and reference lists always have equal length. A
    prediction without an answer counts as wrong; a prediction without an
    explanation is scored as an empty candidate.

    Args:
        model: Seq2seq model supporting a forward pass with `labels` and
            `.generate()`.
        loader: Iterable of batches, each a dict holding 'input_ids',
            'attention_mask' and 'labels' tensors.

    Returns:
        {'val_loss': mean per-batch loss (NaN if there were no batches),
         'accuracy': fraction of labelled examples answered exactly right,
         'bertscore_exp': mean BERTScore F1 over explanations (0.0 if none)}
    """
    model.eval()
    batch_losses = []
    correct = 0
    total = 0
    explanations_preds = []
    explanations_labels = []
    pad_token_id = tokenizer.pad_token_id

    with torch.no_grad():
        for batch in tqdm(loader, desc='Validation:'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Mask padding for the loss only; the unmasked labels are still
            # needed below because -100 cannot be decoded.
            loss_labels = labels
            if pad_token_id is not None:
                loss_labels = labels.masked_fill(labels == pad_token_id, -100)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=loss_labels)
            batch_losses.append(outputs.loss.item())

            # Generate predictions
            predictions = model.generate(input_ids, attention_mask=attention_mask, max_length=512)

            # Decode predictions and labels
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            batch_pred_explanations = []
            batch_label_explanations = []
            for pred_text, label_text in zip(decoded_preds, decoded_labels):
                gold_answer, gold_explanation = _split_answer_explanation(label_text)
                pred_answer, pred_explanation = _split_answer_explanation(pred_text)

                if gold_answer is not None:
                    total += 1
                    if pred_answer == gold_answer:
                        correct += 1

                if gold_explanation is not None:
                    # Keep candidate/reference aligned even when the model
                    # produced no explanation for this example.
                    batch_label_explanations.append(gold_explanation)
                    batch_pred_explanations.append(pred_explanation or "")

            print(batch_pred_explanations)
            print(batch_label_explanations)
            explanations_preds.extend(batch_pred_explanations)
            explanations_labels.extend(batch_label_explanations)

    # Calculate accuracy
    accuracy = correct / total if total else 0.0

    # Calculate BERTScore for explanations over the whole validation set.
    # bert_score.score returns exactly (P, R, F1) unless return_hash=True.
    if explanations_labels:
        _, _, F1_exp = bert_score.score(explanations_preds, explanations_labels, lang="en", rescale_with_baseline=True)
        bertscore_exp_avg = F1_exp.mean().item()
    else:
        bertscore_exp_avg = 0.0

    loss_value = sum(batch_losses) / len(batch_losses) if batch_losses else float('nan')
    return {'val_loss': loss_value, 'accuracy': accuracy, 'bertscore_exp': bertscore_exp_avg}


# # Training and validation
# num_epochs = 1
# for epoch in range(num_epochs):
#     train_metrics = train_loop(model, train_loader, optimizer)
#     val_metrics = validate_loop(model, val_loader)

#     print(f"Epoch {epoch + 1}/{num_epochs}")
#     print(f"Train Loss: {train_metrics['train_loss']:.4f}")
#     print(f"Validation Loss: {val_metrics['val_loss']:.4f}")
#     print(f"Validation Accuracy: {val_metrics['accuracy']:.4f}")
    
# Fine-tune for a fixed number of epochs, validating after each one
num_epochs = 3
for epoch in range(num_epochs):
    train_metrics = train_loop(model, train_loader, optimizer)
    val_metrics = validate_loop(model, val_loader)

    # One line per metric, emitted with a single print call
    print(
        f"Epoch {epoch + 1}/{num_epochs}",
        f"Train Loss: {train_metrics['train_loss']:.4f}",
        f"Validation Loss: {val_metrics['val_loss']:.4f}",
        f"Validation Accuracy: {val_metrics['accuracy']:.4f}",
        f"Validation BERTScore (Explanations): {val_metrics['bertscore_exp']:.4f}",
        sep="\n",
    )

End

In [None]:
# def convert_to_sentences(list_of_lists):
#     sentences = [' '.join(inner_list) for inner_list in list_of_lists]
#     return sentences

# # Example usage
# list_of_lists = [
#     ["The", "capital", "of", "France", "is", "Paris."],
#     ["The", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog."],
#     ["Artificial", "intelligence", "is", "transforming", "the", "world."]
# ]

# sentences = convert_to_sentences(list_of_lists)
# print(sentences)


Phase 2 <br>
Adding DREAM to the model

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Tokenizer comes from the "t5-11b" checkpoint; weights come from "allenai/DREAM".
tokenizer = AutoTokenizer.from_pretrained("t5-11b")
# Loading the weights can take a couple minutes, be patient!
model = AutoModelForSeq2SeqLM.from_pretrained("allenai/DREAM")

Question: "There are 10 apples on an apple tree.  Three fall off.  Now there are X apples."  What is this an example of?
['park', 'coloring book', 'garden center', 'math problem', 'gravity']
Answer: math problem
Explanation: webmath is designed to help you solve

In [None]:
# Query DREAM with the "consequence" query type for the apple-tree example
input_string = "$answer$ ; $question$ = [SITUATION] There are 10 apples on an apple tree.  Three fall off.  Now there are X apples. What is this an example of?  math or gravity?. [QUERY] consequence"
# Put the inputs wherever the model currently lives: the DREAM model has just
# been loaded and has not been moved to `device` yet at this point, so sending
# the ids to `device` would mismatch on a GPU machine.
input_ids = tokenizer.encode(input_string, return_tensors="pt").to(model.device)
output = model.generate(input_ids, max_length=50)
tokenizer.batch_decode(output, skip_special_tokens=True)

In [None]:
# Query DREAM with the "rot" query type for the apple-tree example
input_string = "$answer$ ; $question$ = [SITUATION] There are 10 apples on an apple tree.  Three fall off.  Now there are X apples. What is this an example of? math or gravity?.  [QUERY] rot"
# Put the inputs wherever the model currently lives: the DREAM model has just
# been loaded and has not been moved to `device` yet at this point, so sending
# the ids to `device` would mismatch on a GPU machine.
input_ids = tokenizer.encode(input_string, return_tensors="pt").to(model.device)
output = model.generate(input_ids, max_length=50)
tokenizer.batch_decode(output, skip_special_tokens=True)
# ["$answer$ = It's wrong to damage other people's property."]

Create elaborations from the DREAM model in two ways

In [None]:
dataset

In [None]:
print(dataset['validation']['question'][0])
print(dataset['validation']['choices'][0])

Join the choices with "or"

In [None]:
def format_question(example):
    """Attach a DREAM "rot" prompt to a CoS-E row.

    The prompt embeds the row's question followed by its choices joined with
    " or ".

    Args:
        example: Mapping with a 'question' string and a 'choices' list of
            strings. It is modified in place.

    Returns:
        The same mapping, with the prompt stored under 'formatted_question'.
    """
    question_text = example['question']
    options_text = " or ".join(example['choices'])
    example['formatted_question'] = f"$answer$ ; $question$ = [SITUATION] {question_text}  {options_text} [QUERY] rot"
    return example

# Add the 'formatted_question' column to every split
formatted_dataset = dataset.map(format_question)

# Show the first prompt of each split to verify the formatting
for split_name in ('train', 'validation'):
    print(formatted_dataset[split_name][0]['formatted_question'])


In [None]:
# input_string = "$answer$ ; $question$ = [SITUATION] There are 10 apples on an apple tree.  Three fall off.  Now there are X apples. What is this an example of? math or gravity?.  [QUERY] rot"

rot dream

In [None]:
formatted_dataset

apply dream to formatted_question

In [None]:
import torch
# Use the GPU when one is visible, otherwise the CPU, and move the model there
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

In [None]:
def generate_output(example):
    """Generate text for a row's formatted prompt and store it on the row.

    Args:
        example: Mapping with a 'formatted_question' string. It is modified
            in place.

    Returns:
        The same mapping, with the decoded generation under
        'generated_output'.
    """
    prompt_ids = tokenizer.encode(example['formatted_question'], return_tensors="pt").to(device)
    sequences = model.generate(prompt_ids, max_length=100)
    # Only the first returned sequence is kept
    example['generated_output'] = tokenizer.decode(sequences[0], skip_special_tokens=True)
    return example

In [None]:
formatted_dataset['validation'][0]

In [None]:
# Manual dry run of the generation steps on a single validation row (index 2)
example=formatted_dataset['validation'][2]
input_string = example['formatted_question']
input_ids = tokenizer.encode(input_string, return_tensors="pt").to(device)
output = model.generate(input_ids, max_length=100)
print(output)  # raw generated token ids
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
# Stored on this `example` object only; formatted_dataset is not reassigned here
example['generated_output'] = generated_text
print(example['generated_output'])

In [None]:
example['formatted_question']

In [None]:
# Run generation over every row, adding the 'generated_output' column
result_dataset = formatted_dataset.map(generate_output)

# Show the first generation of each split to verify the outputs
for split_name in ('train', 'validation'):
    print(result_dataset[split_name][0]['generated_output'])

In [None]:
# Save result_dataset (the mapped dataset from generate_output) to a directory
# named "formatted_dataset" in the current working directory
result_dataset.save_to_disk('./formatted_dataset')

**********************

Add the DREAM model's text to the T5 input and fine-tune

### debugging not important

**Debugging phase. The BLEURT score does not work because the Google API does not work.**

Define a validation loop that uses BERTScore to score the explanations

In [None]:
# %pip install datasets transformers torch tqdm bert-score bleurt

In [None]:
from datasets import load_dataset, load_metric

In [None]:
# !pip install --upgrade pip  # ensures that pip is current
# !git clone https://github.com/google-research/bleurt.git
# !pip install ./bleurt

In [None]:
!unzip /home/manavi/bleurt-base-128.zip

In [None]:
# bleurt_metric = load("bleurt", "/home/manavi/bleurt-base-128")

In [None]:
# Load metrics
# accuracy_metric = load_metric("accuracy")
# bert_score_metric = load_metric("bertscore")
# bleurt_metric = load_metric("bleurt")

# accuracy_metric = load_metric("accuracy")
# bert_score_metric = load_metric("bertscore", trust_remote_code=True)
# bleurt_metric = load_metric("bleurt",trust_remote_code=True, checkpoint="bleurt-base-128")

# Load metrics
# NOTE(review): `load_metric` is deprecated in `datasets` in favour of the
# separate `evaluate` package — confirm the installed version still provides it.
accuracy_metric = load_metric("accuracy")
bert_score_metric = load_metric("bertscore", trust_remote_code=True)

In [None]:
# Training loop
def train_loop(model, loader, optimizer, accumulation_steps=16):
    """Run one training epoch with gradient accumulation.

    Args:
        model: Seq2seq model whose forward pass returns an object with a
            `.loss` attribute when `labels` are supplied.
        loader: Sized iterable of batches, each a dict holding 'input_ids',
            'attention_mask' and 'labels' tensors.
        optimizer: Optimizer bound to the model's parameters.
        accumulation_steps: Number of mini-batches whose gradients are summed
            before each optimizer step.

    Returns:
        {'train_loss': mean of the per-batch losses}; the mean is NaN when the
        loader yields no batches.
    """
    model.train()
    batch_losses = []
    optimizer.zero_grad()

    # Labels are padded to a fixed length with the pad id. Left as-is, every
    # padded position counts towards the loss, so it is dominated by padding.
    # -100 is the index the loss ignores.
    pad_token_id = getattr(getattr(model, 'config', None), 'pad_token_id', None)

    for step, batch in enumerate(tqdm(loader, desc='Training:'), start=1):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        if pad_token_id is not None:
            labels = labels.masked_fill(labels == pad_token_id, -100)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        # Log the unscaled loss; back-propagate the scaled one so the summed
        # gradients average over the accumulation window.
        batch_losses.append(outputs.loss.item())
        (outputs.loss / accumulation_steps).backward()

        if step % accumulation_steps == 0:  # update weights every accumulation_steps mini-batches
            optimizer.step()
            optimizer.zero_grad()  # reset gradients

    # Update remaining gradients if the number of batches is not a multiple of accumulation_steps
    if len(loader) % accumulation_steps != 0:
        optimizer.step()
        optimizer.zero_grad()

    loss_value = sum(batch_losses) / len(batch_losses) if batch_losses else float('nan')
    return {'train_loss': loss_value}

# Validation loop
def validate_loop(model, loader):
    """Evaluate the model on a validation loader.

    For every batch the loss is taken from a forward pass with the labels,
    then predictions are generated and decoded. Accuracy is the fraction of
    examples whose generated answer span (the text between ``answer:`` and
    ``explanation:``) equals the reference answer span. BERTScore F1 is
    averaged over the full decoded strings.

    Relies on the notebook-level ``device``, ``tokenizer``, ``tqdm``,
    ``accuracy_metric`` and ``bert_score_metric`` names.

    Args:
        model: Model supporting ``forward(..., labels=...)`` and ``generate``.
        loader: Iterable of batches; each batch is a dict holding
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.

    Returns:
        Dict with ``val_loss``, ``accuracy`` and ``bert_score`` entries.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    def answer_of(text):
        """Return the text between 'answer:' and 'explanation:', trimmed."""
        head = text.split("explanation:", 1)[0].strip()
        if head.startswith("answer:"):
            head = head[len("answer:"):]
        return head.strip()

    model.eval()
    batch_losses = []
    answer_hits = []  # 1 when the predicted answer equals the reference, else 0
    bertscore_preds = []
    bertscore_labels = []

    with torch.no_grad():
        for batch in tqdm(loader, desc='Validation:'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            batch_losses.append(outputs.loss.item())

            # Generate predictions
            predictions = model.generate(input_ids, attention_mask=attention_mask, max_length=512)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

            # Labels may carry the -100 ignore index if padding was masked
            # during preprocessing (not visible here); it is not a valid token
            # id, so map it to the pad token before decoding. No-op otherwise.
            label_ids = labels.masked_fill(labels == -100, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

            for pred, ref in zip(decoded_preds, decoded_labels):
                answer_hits.append(int(answer_of(pred) == answer_of(ref)))
                bertscore_preds.append(pred)
                bertscore_labels.append(ref)

    if not batch_losses:
        raise ValueError("validate_loop received an empty loader.")

    # Compute metrics
    loss_value = sum(batch_losses) / len(batch_losses)
    # The accuracy metric compares integer ids, not strings, so score the
    # 1/0 "answer matched" indicator against an all-ones reference; the
    # resulting accuracy is the answer match rate.
    accuracy = accuracy_metric.compute(predictions=answer_hits, references=[1] * len(answer_hits))
    bert_score = bert_score_metric.compute(predictions=bertscore_preds, references=bertscore_labels, lang="en")
    bert_score_avg = sum(bert_score['f1']) / len(bert_score['f1'])

    return {
        'val_loss': loss_value,
        'accuracy': accuracy['accuracy'],
        'bert_score': bert_score_avg
    }

In [None]:
# Training loop
def train_loop(model, loader, optimizer, accumulation_steps=16):
    """Run one training epoch with gradient accumulation.

    Relies on the notebook-level ``device`` and ``tqdm`` names.

    Args:
        model: Model whose forward pass, called with ``input_ids``,
            ``attention_mask`` and ``labels``, returns an object with a
            ``loss`` attribute.
        loader: Iterable of batches; each batch is a dict holding
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        optimizer: Optimizer stepped once per ``accumulation_steps`` batches.
        accumulation_steps: Number of mini-batches whose gradients are
            accumulated before each optimizer step (must be >= 1).

    Returns:
        ``{'train_loss': mean of the unscaled per-batch losses}``.

    Raises:
        ValueError: If ``accumulation_steps`` < 1 or ``loader`` is empty.
    """
    if accumulation_steps < 1:
        raise ValueError("accumulation_steps must be >= 1")

    model.train()
    batch_losses = []
    pending = 0  # mini-batches accumulated since the last optimizer step
    optimizer.zero_grad()

    for batch in tqdm(loader, desc='Training:'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        # Log the unscaled loss directly instead of scaling and un-scaling it.
        batch_losses.append(outputs.loss.item())

        # Scale so the accumulated gradient is an average over the group.
        (outputs.loss / accumulation_steps).backward()
        pending += 1

        if pending == accumulation_steps:
            optimizer.step()
            optimizer.zero_grad()
            pending = 0

    # Flush a trailing partial group. Counting batches (rather than using
    # len(loader)) also works for loaders that do not define __len__.
    # NOTE: this partial group keeps the 1/accumulation_steps scaling, so it
    # produces a proportionally smaller update.
    if pending:
        optimizer.step()
        optimizer.zero_grad()

    if not batch_losses:
        raise ValueError("train_loop received an empty loader.")

    return {'train_loss': sum(batch_losses) / len(batch_losses)}

# Validation loop
def validate_loop(model, loader):
    """Evaluate the model on a validation loader.

    For every batch the loss is taken from a forward pass with the labels,
    then predictions are generated and decoded. Prediction/reference pairs
    in which either side is blank are skipped as a unit. Accuracy is the
    fraction of remaining pairs whose answer spans (the text between
    ``answer:`` and ``explanation:``) are equal. BERTScore F1 is averaged
    over the full decoded strings of those pairs.

    Relies on the notebook-level ``device``, ``tokenizer``, ``tqdm``,
    ``accuracy_metric`` and ``bert_score_metric`` names.

    Args:
        model: Model supporting ``forward(..., labels=...)`` and ``generate``.
        loader: Iterable of batches; each batch is a dict holding
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.

    Returns:
        Dict with ``val_loss``, ``accuracy`` and ``bert_score`` entries.

    Raises:
        ValueError: If no non-blank prediction/reference pair was produced
            (this includes an empty ``loader``).
    """
    def answer_of(text):
        """Return the text between 'answer:' and 'explanation:', trimmed."""
        head = text.split("explanation:", 1)[0].strip()
        if head.startswith("answer:"):
            head = head[len("answer:"):]
        return head.strip()

    model.eval()
    batch_losses = []
    answer_hits = []  # 1 when the predicted answer equals the reference, else 0
    bertscore_preds = []
    bertscore_labels = []

    with torch.no_grad():
        for batch in tqdm(loader, desc='Validation:'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            batch_losses.append(outputs.loss.item())

            # Generate predictions
            predictions = model.generate(input_ids, attention_mask=attention_mask, max_length=512)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

            # Labels may carry the -100 ignore index if padding was masked
            # during preprocessing (not visible here); it is not a valid token
            # id, so map it to the pad token before decoding. No-op otherwise.
            label_ids = labels.masked_fill(labels == -100, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

            for pred, ref in zip(decoded_preds, decoded_labels):
                # Filter blanks pairwise: filtering the two lists separately
                # would shift predictions against their references.
                if not pred.strip() or not ref.strip():
                    continue
                # Compare answer spans as text; answers need not be digits
                # and may span several words.
                answer_hits.append(int(answer_of(pred) == answer_of(ref)))
                bertscore_preds.append(pred)
                bertscore_labels.append(ref)

    if not answer_hits:
        raise ValueError("No non-empty prediction/label pairs to score.")

    # Compute metrics
    loss_value = sum(batch_losses) / len(batch_losses)
    # The accuracy metric compares integer ids, so score the 1/0 "answer
    # matched" indicator against an all-ones reference (= match rate).
    accuracy = accuracy_metric.compute(predictions=answer_hits, references=[1] * len(answer_hits))
    bert_score = bert_score_metric.compute(predictions=bertscore_preds, references=bertscore_labels, lang="en")
    bert_score_avg = sum(bert_score['f1']) / len(bert_score['f1'])

    return {
        'val_loss': loss_value,
        'accuracy': accuracy['accuracy'],
        'bert_score': bert_score_avg
    }

In [None]:
# %pip install git+https://github.com/google-research/bleurt.git

In [None]:
# %pip install git+https://github.com/google-research/bleurt.git

In [None]:
# %pip install bert_score

In [None]:
# Training and validation
num_epochs = 3
for epoch in range(num_epochs):
    # One pass over the training data, then a full validation pass.
    train_metrics = train_loop(model, train_loader, optimizer)
    val_metrics = validate_loop(model, val_loader)

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Train Loss: {train_metrics['train_loss']:.4f}")
    print(f"Validation Loss: {val_metrics['val_loss']:.4f}")
    print(f"Validation Accuracy: {val_metrics['accuracy']:.4f}")
    # validate_loop reports the explanation quality under the 'bert_score'
    # key; there is no 'explanation_score' entry (it raised KeyError).
    print(f"Explanation Score: {val_metrics['bert_score']}")

In [None]:
# Training loop
def train_loop(model, loader, optimizer, accumulation_steps=16):
    """Run one training epoch with gradient accumulation.

    Relies on the notebook-level ``device`` and ``tqdm`` names.

    Args:
        model: Model whose forward pass, called with ``input_ids``,
            ``attention_mask`` and ``labels``, returns an object with a
            ``loss`` attribute.
        loader: Iterable of batches; each batch is a dict holding
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        optimizer: Optimizer stepped once per ``accumulation_steps`` batches.
        accumulation_steps: Number of mini-batches whose gradients are
            accumulated before each optimizer step (must be >= 1).

    Returns:
        ``{'train_loss': mean of the unscaled per-batch losses}``.

    Raises:
        ValueError: If ``accumulation_steps`` < 1 or ``loader`` is empty.
    """
    if accumulation_steps < 1:
        raise ValueError("accumulation_steps must be >= 1")

    model.train()
    batch_losses = []
    pending = 0  # mini-batches accumulated since the last optimizer step
    optimizer.zero_grad()

    for batch in tqdm(loader, desc='Training:'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        # Log the unscaled loss directly instead of scaling and un-scaling it.
        batch_losses.append(outputs.loss.item())

        # Scale so the accumulated gradient is an average over the group.
        (outputs.loss / accumulation_steps).backward()
        pending += 1

        if pending == accumulation_steps:
            optimizer.step()
            optimizer.zero_grad()
            pending = 0

    # Flush a trailing partial group. Counting batches (rather than using
    # len(loader)) also works for loaders that do not define __len__.
    # NOTE: this partial group keeps the 1/accumulation_steps scaling, so it
    # produces a proportionally smaller update.
    if pending:
        optimizer.step()
        optimizer.zero_grad()

    if not batch_losses:
        raise ValueError("train_loop received an empty loader.")

    return {'train_loss': sum(batch_losses) / len(batch_losses)}

# Validation loop
def validate_loop(model, loader):
    """Evaluate the model on a validation loader, printing decoded text per batch.

    For every batch the loss is taken from a forward pass with the labels,
    then predictions are generated and decoded. Prediction/reference pairs
    in which either side is blank are skipped as a unit, and the surviving
    strings are printed for debugging. Accuracy is the fraction of remaining
    pairs whose answer spans (the text between ``answer:`` and
    ``explanation:``) are equal. BERTScore F1 is averaged over the full
    decoded strings of those pairs.

    Relies on the notebook-level ``device``, ``tokenizer``, ``tqdm``,
    ``accuracy_metric`` and ``bert_score_metric`` names.

    Args:
        model: Model supporting ``forward(..., labels=...)`` and ``generate``.
        loader: Iterable of batches; each batch is a dict holding
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.

    Returns:
        Dict with ``val_loss``, ``accuracy`` and ``bert_score`` entries.

    Raises:
        ValueError: If no non-blank prediction/reference pair was produced
            (this includes an empty ``loader``).
    """
    def answer_of(text):
        """Return the text between 'answer:' and 'explanation:', trimmed."""
        head = text.split("explanation:", 1)[0].strip()
        if head.startswith("answer:"):
            head = head[len("answer:"):]
        return head.strip()

    model.eval()
    batch_losses = []
    answer_hits = []  # 1 when the predicted answer equals the reference, else 0
    bertscore_preds = []
    bertscore_labels = []

    with torch.no_grad():
        for batch in tqdm(loader, desc='Validation:'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            batch_losses.append(outputs.loss.item())

            # Generate predictions
            predictions = model.generate(input_ids, attention_mask=attention_mask, max_length=512)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

            # Labels may carry the -100 ignore index if padding was masked
            # during preprocessing (not visible here); it is not a valid token
            # id, so map it to the pad token before decoding. No-op otherwise.
            label_ids = labels.masked_fill(labels == -100, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

            # Filter blanks pairwise: filtering the two lists separately
            # would shift predictions against their references.
            pairs = [
                (pred, ref)
                for pred, ref in zip(decoded_preds, decoded_labels)
                if pred.strip() and ref.strip()
            ]
            decoded_preds = [pred for pred, _ in pairs]
            decoded_labels = [ref for _, ref in pairs]

            # Debugging output
            print(f"Decoded predictions: {decoded_preds}")
            print(f"Decoded labels: {decoded_labels}")

            # Compare answer spans as text; answers need not be digits and
            # may span several words.
            answer_hits.extend(int(answer_of(pred) == answer_of(ref)) for pred, ref in pairs)

            # Prepare for BERTScore metric
            bertscore_preds.extend(decoded_preds)
            bertscore_labels.extend(decoded_labels)

    # The accuracy and BERTScore lists are filled together, so one check
    # covers both.
    if not answer_hits:
        raise ValueError("Empty list found for accuracy predictions or labels.")

    # Compute metrics
    loss_value = sum(batch_losses) / len(batch_losses)
    # The accuracy metric compares integer ids, so score the 1/0 "answer
    # matched" indicator against an all-ones reference (= match rate).
    accuracy = accuracy_metric.compute(predictions=answer_hits, references=[1] * len(answer_hits))
    bert_score = bert_score_metric.compute(predictions=bertscore_preds, references=bertscore_labels, lang="en")
    bert_score_avg = sum(bert_score['f1']) / len(bert_score['f1'])

    return {
        'val_loss': loss_value,
        'accuracy': accuracy['accuracy'],
        'bert_score': bert_score_avg
    }

In [None]:
# Training and validation
num_epochs = 3
for epoch in range(num_epochs):
    # One pass over the training data, then a full validation pass.
    train_metrics = train_loop(model, train_loader, optimizer)
    val_metrics = validate_loop(model, val_loader)

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Train Loss: {train_metrics['train_loss']:.4f}")
    print(f"Validation Loss: {val_metrics['val_loss']:.4f}")
    print(f"Validation Accuracy: {val_metrics['accuracy']:.4f}")
    # validate_loop reports the explanation quality under the 'bert_score'
    # key; there is no 'explanation_score' entry (it raised KeyError).
    print(f"Explanation Score: {val_metrics['bert_score']}")

In [None]:
# Training loop
def train_loop(model, loader, optimizer, accumulation_steps=16):
    """Run one training epoch with gradient accumulation.

    Relies on the notebook-level ``device`` and ``tqdm`` names.

    Args:
        model: Model whose forward pass, called with ``input_ids``,
            ``attention_mask`` and ``labels``, returns an object with a
            ``loss`` attribute.
        loader: Iterable of batches; each batch is a dict holding
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        optimizer: Optimizer stepped once per ``accumulation_steps`` batches.
        accumulation_steps: Number of mini-batches whose gradients are
            accumulated before each optimizer step (must be >= 1).

    Returns:
        ``{'train_loss': mean of the unscaled per-batch losses}``.

    Raises:
        ValueError: If ``accumulation_steps`` < 1 or ``loader`` is empty.
    """
    if accumulation_steps < 1:
        raise ValueError("accumulation_steps must be >= 1")

    model.train()
    batch_losses = []
    pending = 0  # mini-batches accumulated since the last optimizer step
    optimizer.zero_grad()

    for batch in tqdm(loader, desc='Training:'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        # Log the unscaled loss directly instead of scaling and un-scaling it.
        batch_losses.append(outputs.loss.item())

        # Scale so the accumulated gradient is an average over the group.
        (outputs.loss / accumulation_steps).backward()
        pending += 1

        if pending == accumulation_steps:
            optimizer.step()
            optimizer.zero_grad()
            pending = 0

    # Flush a trailing partial group. Counting batches (rather than using
    # len(loader)) also works for loaders that do not define __len__.
    # NOTE: this partial group keeps the 1/accumulation_steps scaling, so it
    # produces a proportionally smaller update.
    if pending:
        optimizer.step()
        optimizer.zero_grad()

    if not batch_losses:
        raise ValueError("train_loop received an empty loader.")

    return {'train_loss': sum(batch_losses) / len(batch_losses)}

# Validation loop
def validate_loop(model, loader):
    """Evaluate the model on a validation loader, printing decoded text per batch.

    For every batch the loss is taken from a forward pass with the labels,
    then predictions are generated, decoded and printed for debugging.
    Prediction/reference pairs in which either side is blank are skipped as
    a unit. Accuracy is the fraction of remaining pairs whose answer spans
    (the text between ``answer:`` and ``explanation:``) are equal. BERTScore
    F1 is averaged over the full decoded strings of those pairs.

    Relies on the notebook-level ``device``, ``tokenizer``, ``tqdm``,
    ``accuracy_metric`` and ``bert_score_metric`` names.

    Args:
        model: Model supporting ``forward(..., labels=...)`` and ``generate``.
        loader: Iterable of batches; each batch is a dict holding
            ``input_ids``, ``attention_mask`` and ``labels`` tensors.

    Returns:
        Dict with ``val_loss``, ``accuracy`` and ``bert_score`` entries.

    Raises:
        ValueError: If no non-blank prediction/reference pair was produced
            (this includes an empty ``loader``).
    """
    def answer_of(text):
        """Return the text between 'answer:' and 'explanation:', trimmed."""
        head = text.split("explanation:", 1)[0].strip()
        if head.startswith("answer:"):
            head = head[len("answer:"):]
        return head.strip()

    model.eval()
    batch_losses = []
    answer_hits = []  # 1 when the predicted answer equals the reference, else 0
    bertscore_preds = []
    bertscore_labels = []

    with torch.no_grad():
        for batch in tqdm(loader, desc='Validation:'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            # Record the loss for every batch, even one whose decoded text is
            # later discarded, so the mean covers the whole loader.
            batch_losses.append(outputs.loss.item())

            # Generate predictions
            predictions = model.generate(input_ids, attention_mask=attention_mask, max_length=512)
            decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

            # Labels may carry the -100 ignore index if padding was masked
            # during preprocessing (not visible here); it is not a valid token
            # id, so map it to the pad token before decoding. No-op otherwise.
            label_ids = labels.masked_fill(labels == -100, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

            # Debugging output
            print(f"Decoded predictions: {decoded_preds}")
            print(f"Decoded labels: {decoded_labels}")

            for pred, ref in zip(decoded_preds, decoded_labels):
                # Filter blanks pairwise so predictions and references cannot
                # drift apart; this replaces the per-batch length check.
                if not pred.strip() or not ref.strip():
                    continue
                # Compare answer spans as text; answers need not be digits
                # and may span several words.
                answer_hits.append(int(answer_of(pred) == answer_of(ref)))
                bertscore_preds.append(pred)
                bertscore_labels.append(ref)

    # The accuracy and BERTScore lists are filled together, so one check
    # covers both.
    if not answer_hits:
        raise ValueError("Empty list found for accuracy predictions or labels.")

    # Compute metrics
    loss_value = sum(batch_losses) / len(batch_losses)
    # The accuracy metric compares integer ids, so score the 1/0 "answer
    # matched" indicator against an all-ones reference (= match rate).
    accuracy = accuracy_metric.compute(predictions=answer_hits, references=[1] * len(answer_hits))
    bert_score = bert_score_metric.compute(predictions=bertscore_preds, references=bertscore_labels, lang="en")
    bert_score_avg = sum(bert_score['f1']) / len(bert_score['f1'])

    return {
        'val_loss': loss_value,
        'accuracy': accuracy['accuracy'],
        'bert_score': bert_score_avg
    }

In [None]:
# Training and validation
num_epochs = 3
for epoch in range(num_epochs):
    # Train for one epoch, then score the model on the validation split.
    train_metrics = train_loop(model, train_loader, optimizer)
    val_metrics = validate_loop(model, val_loader)

    # Emit the per-epoch report in a single call, one metric per line.
    print(
        f"Epoch {epoch + 1}/{num_epochs}",
        f"Train Loss: {train_metrics['train_loss']:.4f}",
        f"Validation Loss: {val_metrics['val_loss']:.4f}",
        f"Validation Accuracy: {val_metrics['accuracy']:.4f}",
        f"Validation BERTScore: {val_metrics['bert_score']}",
        sep="\n",
    )