In [6]:
import torch
from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

In [7]:
# Paths (Colab default paths where files should be uploaded)
model_dir = '/content/'
val_csv_path = '/content/mentalchat16k_validation.csv'   # Ensure this matches your uploaded filename!

In [8]:
# Load tokenizer and fine-tuned model
tokenizer = BlenderbotTokenizer.from_pretrained(model_dir)
model = BlenderbotForConditionalGeneration.from_pretrained(model_dir)

# Move model to GPU â€” Colab will use T4 GPU if enabled in runtime settings
model = model.to("cuda")
model.eval()
print("Model loaded on GPU (T4)")

The module name  (originally ) is not a valid Python identifier. Please rename the original module to avoid import issues.


Model loaded on GPU (T4)


In [9]:
# Load validation data
val_df = pd.read_csv(val_csv_path)
print(f"Validation dataset size: {len(val_df)} samples")

texts = val_df['text']
targets = val_df['target']

Validation dataset size: 2397 samples


In [10]:
# Batch size for inference
batch_size = 8
predictions = []

for i in range(0, len(texts), batch_size):
    batch_texts = list(texts[i:i+batch_size])
    inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=128).to("cuda")
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=64)
    decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    predictions.extend(decoded_preds)

val_df['prediction'] = predictions


In [11]:
# Evaluation metrics
y_true = [t.lower().strip() for t in targets]
y_pred = [p.lower().strip() for p in predictions]

exact_match = sum(t == p for t, p in zip(y_true, y_pred)) / len(y_true)
precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro', zero_division=0)

print(f"Exact Match Accuracy: {exact_match:.4f}")
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

# Save results
val_df.to_csv('/content/mentalchat16k_val_predictions.csv', index=False)
print("Predictions saved to /content/mentalchat16k_val_predictions.csv")

# Display a few sample results
print(val_df.sample(5)[['text', 'target', 'prediction']])

Exact Match Accuracy: 0.0000
Precision: 0.0000, Recall: 0.0000, F1 Score: 0.0000
Predictions saved to /content/mentalchat16k_val_predictions.csv
                                                   text  \
636   you are a helpful mental health counselling as...   
862   you are a helpful mental health counselling as...   
1595  you are a helpful mental health counselling as...   
312   you are a helpful mental health counselling as...   
2117  you are a helpful mental health counselling as...   

                                                 target  \
636   im glad that you feel a sense of relief and th...   
862   its great that youve reached out for counselin...   
1595  experiencing excessive worrying and fear that ...   
312   in situations where you experience intense anx...   
2117  your feelings are valid and its important to a...   

                                             prediction  
636    im glad to hear that you feel supported and t...  
862    it sounds like youre g

In [12]:
!pip install rouge_score

from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
rouge_scores = [scorer.score(str(tgt), str(pred))['rougeL'].fmeasure for tgt, pred in zip(targets, predictions)]
avg_rougeL = sum(rouge_scores) / len(rouge_scores)
print(f"Average ROUGE-L F1: {avg_rougeL:.4f}")


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=c91f4caa23fbeece3af1994471abe6c9688bb081e05a60b652c92ecc90a3b982
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Average ROUGE-L F1: 0.1328


In [13]:
# Load test dataset
test_csv_path = '/content/mentalchat16k_test.csv'
test_df = pd.read_csv(test_csv_path)
print(f"Test dataset size: {len(test_df)} samples")

Test dataset size: 2397 samples


In [14]:
test_texts = test_df['text']
test_targets = test_df['target']

# Inference for test dataset using previously loaded tokenizer and model
batch_size = 8
test_predictions = []

In [15]:
for i in range(0, len(test_texts), batch_size):
    batch_texts = list(test_texts[i:i + batch_size])
    inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=128).to("cuda")
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=64)
    decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    test_predictions.extend(decoded_preds)

test_df['prediction'] = test_predictions


In [16]:
# Evaluation metrics for test set
y_true_test = [t.lower().strip() for t in test_targets]
y_pred_test = [p.lower().strip() for p in test_predictions]

exact_match_test = sum(t == p for t, p in zip(y_true_test, y_pred_test)) / len(y_true_test)

from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
rouge_scores_test = [scorer.score(t, p)['rougeL'].fmeasure for t, p in zip(test_targets, test_predictions)]
avg_rougeL_test = sum(rouge_scores_test) / len(rouge_scores_test)

print(f"Test Exact Match Accuracy: {exact_match_test:.4f}")
print(f"Test Average ROUGE-L F1: {avg_rougeL_test:.4f}")

# Save test predictions
test_df.to_csv('/content/mentalchat16k_test_predictions.csv', index=False)
print("Test predictions saved to /content/mentalchat16k_test_predictions.csv")

print(test_df.sample(5)[['text', 'target', 'prediction']])

Test Exact Match Accuracy: 0.0000
Test Average ROUGE-L F1: 0.1323
Test predictions saved to /content/mentalchat16k_test_predictions.csv
                                                   text  \
1821  you are a helpful mental health counselling as...   
1553  you are a helpful mental health counselling as...   
722   you are a helpful mental health counselling as...   
1819  you are a helpful mental health counselling as...   
50    you are a helpful mental health counselling as...   

                                                 target  \
1821  managing social anxiety can be challenging but...   
1553  its great that you recognize the impact of str...   
722   finding healthier ways to deal with stress is ...   
1819  it sounds like youre dealing with a lot right ...   
50    i can only imagine how difficult it must be fo...   

                                             prediction  
1821   it sounds like youve been experiencing a lot ...  
1553   it sounds like stress has been 

In [None]:
# import torch
# from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration

# model_dir = 'blenderbot_mentalhealth_finetuned'
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# tokenizer = BlenderbotTokenizer.from_pretrained(model_dir)
# model = BlenderbotForConditionalGeneration.from_pretrained(model_dir).to(device)
# model.eval()

# print(f"Model loaded on {device}")

# print("Start chatting with the bot! Type 'exit' to quit.")

# while True:
#     user_input = input("You: ")
#     if user_input.lower() == 'exit':
#         print("Exiting chat.")
#         break
#     inputs = tokenizer(user_input, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
#     with torch.no_grad():
#         generated_ids = model.generate(
#             **inputs,
#             max_new_tokens=64,
#             do_sample=True,
#             top_k=50,
#             top_p=0.95,
#             temperature=0.8
#         )
#     response = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
#     print(f"Bot: {response}")



model/evaluate_val.ipynb
import pandas as pd
import torch
from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration
from sklearn.metrics import precision_recall_fscore_support
from rouge_score import rouge_scorer

model_dir = './model/blenderbot_mentalhealth_finetuned'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BlenderbotTokenizer.from_pretrained(model_dir)
model = BlenderbotForConditionalGeneration.from_pretrained(model_dir).to(device)
model.eval()

val_df = pd.read_csv('data/mentalchat16k_validation.csv')

texts = val_df['text']
targets = val_df['target']
batch_size = 8
predictions = []

for i in range(0, len(texts), batch_size):
    batch_texts = list(texts[i:i+batch_size])
    inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=64, do_sample=True, top_k=50, top_p=0.95, temperature=0.8)
    decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    predictions.extend(decoded_preds)

val_df['prediction'] = predictions

y_true = [t.lower().strip() for t in targets]
y_pred = [p.lower().strip() for p in predictions]

exact_match = sum(t == p for t, p in zip(y_true, y_pred)) / len(y_true)
precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro', zero_division=0)

print(f"Exact Match Accuracy: {exact_match:.4f}")
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
rouge_scores = [scorer.score(t, p)['rougeL'].fmeasure for t, p in zip(targets, predictions)]
avg_rougeL = sum(rouge_scores) / len(rouge_scores)
print(f"Average ROUGE-L F1: {avg_rougeL:.4f}")

val_df.to_csv('data/mentalchat16k_val_predictions.csv', index=False)
print("Validation predictions saved at data/mentalchat16k_val_predictions.csv")


model/evaluate_test.ipynb
import pandas as pd
import torch
from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration
from rouge_score import rouge_scorer

model_dir = './model/blenderbot_mentalhealth_finetuned'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BlenderbotTokenizer.from_pretrained(model_dir)
model = BlenderbotForConditionalGeneration.from_pretrained(model_dir).to(device)
model.eval()

test_df = pd.read_csv('data/mentalchat16k_test.csv')

texts = test_df['text']
targets = test_df['target']
batch_size = 8
predictions = []

for i in range(0, len(texts), batch_size):
    batch_texts = list(texts[i:i+batch_size])
    inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=64, do_sample=True, top_k=50, top_p=0.95, temperature=0.8)
    decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    predictions.extend(decoded_preds)

test_df['prediction'] = predictions

y_true = [t.lower().strip() for t in targets]
y_pred = [p.lower().strip() for p in predictions]

exact_match = sum(t == p for t, p in zip(y_true, y_pred)) / len(y_true)

scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
rouge_scores = [scorer.score(t, p)['rougeL'].fmeasure for t, p in zip(targets, predictions)]
avg_rougeL = sum(rouge_scores) / len(rouge_scores)

print(f"Test Exact Match Accuracy: {exact_match:.4f}")
print(f"Test Average ROUGE-L F1: {avg_rougeL:.4f}")

test_df.to_csv('data/mentalchat16k_test_predictions.csv', index=False)
print("Test set predictions saved at data/mentalchat16k_test_predictions.csv")
