<a href="https://colab.research.google.com/github/CIS6930-NLP/final_project/blob/main/Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup



In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sat Apr  8 23:25:29 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0    45W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Install and import libraries

In [2]:
!pip install --quiet datasets
!pip install --quiet transformers

In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, TFT5ForConditionalGeneration, AutoModelForCausalLM, AutoModelForSeq2SeqLM

## Import dataset and models

### Dataset

Hosted at: https://huggingface.co/datasets/aegrif/CIS6930_DAAGR_Empathetic_Dialogues

In [4]:
dataset = load_dataset('aegrif/CIS6930_DAAGR_Empathetic_Dialogues')
test_dataset = dataset['test']
train_dataset = dataset['train'] ##train - new context



  0%|          | 0/3 [00:00<?, ?it/s]

### GPT2

Hosted at: 

https://huggingface.co/aegrif/CIS6930_DAAGR_GPT2_Emo

https://huggingface.co/aegrif/CIS6930_DAAGR_GPT2_NoEmo

In [5]:
tokenizer_gpt2_emo = AutoTokenizer.from_pretrained("aegrif/CIS6930_DAAGR_GPT2_Emo")
model_gpt2_emo = AutoModelForCausalLM.from_pretrained("aegrif/CIS6930_DAAGR_GPT2_Emo")
tokenizer_gpt2_noemo = AutoTokenizer.from_pretrained("aegrif/CIS6930_DAAGR_GPT2_NoEmo")
model_gpt2_noemo = AutoModelForCausalLM.from_pretrained("aegrif/CIS6930_DAAGR_GPT2_NoEmo")

### T5

Hosted at:

https://huggingface.co/t5-small

https://huggingface.co/aegrif/CIS6930_DAAGR_T5_Emo

https://huggingface.co/aegrif/CIS6930_DAAGR_T5_NoEmo

In [6]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model_t5_emo = AutoModelForSeq2SeqLM.from_pretrained("aegrif/CIS6930_DAAGR_T5_Emo",from_tf=True)
model_t5_noemo = AutoModelForSeq2SeqLM.from_pretrained("aegrif/CIS6930_DAAGR_T5_NoEmo",from_tf=True)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
All TF 2.0 model weights were used when initializing T5ForConditionalGeneration.

All the weights of T5ForConditionalGeneration were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use T5ForConditionalGeneration for predictions without further training.
All TF 2.0 model weights were used when initializing T5ForConditionalGeneration.

All the weights of T5ForConditionalGeneration were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can alre

## Evalution - Gloria


In [7]:
import string
import re
import pandas as pd
import nltk
import spacy
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

nlp = spacy.load('en_core_web_sm')

def standardize_answer(text):
    # Lower text, remove punctuation, articles and extra whitespace.
    text = re.sub(r'\b(a|an|the)\b', ' ', text)
    text = ' '.join(text.split())
    exclude = set(string.punctuation)
    text = ''.join(ch for ch in text if ch not in exclude)
    text = text.lower()
    return text

#predictions are model predictions
#groud_truth are original response utterances

def calculate_maude_score(utterance):
    # Define empathy lexicon
    empathy_lexicon = {
        "affection": 1,
        "appreciation": 1,
        "approval": 1,
        "care": 1,
        "compassion": 1,
        "gratitude": 1,
        "love": 1,
        "pride": 1,
        "relief": 1,
        "calmness": 0.5,
        "confusion": 0.5,
        "curiosity": 0.5,
        "desire": 0.5,
        "excitement": 0.5,
        "fear": 0.5,
        "happiness": 0.5,
        "hope": 0.5,
        "interest": 0.5,
        "joy": 0.5,
        "nostalgia": 0.5,
        "pain": 0,
        "sadness": 0,
        "disappointment": -1,
        "disapproval": -1,
        "disgust": -1,
        "embarrassment": -1,
        "envy": -1,
        "fear": -1,
        "frustration": -1,
        "guilt": -1,
        "anger": -1,
        "hatred": -1,
        "hostility": -1,
        "irritation": -1,
        "jealousy": -1,
        "rage": -1,
        "shame": -1,
        "skepticism": -1
    }
    
    # Convert utterance to lowercase and remove punctuation
    utterance = re.sub(r'[^\w\s]', '', utterance.lower())
    
    # Split utterance into words
    words = utterance.split()
    
    # Calculate MAUDE score
    maude_score = sum(empathy_lexicon.get(word, 0) for word in words) / len(words)
    
    return maude_score

def calculate_fluency_score(utterance):
    # Tokenize utterance
    doc = nlp(utterance)
    # Calculate average token probability using spaCy's token.prob attribute
    # A lower average token probability indicates a less fluent utterance
    fluency_score = sum([token.prob for token in doc]) / len(doc)
    return fluency_score

# calculate appropriateness score
def calculate_appropriateness_score(utterance):
    # Create a list of inappropriate words or phrases to search for in the utterance
    inappropriate_words = ['hate', 'kill', 'stupid', 'ugly']
    # Tokenize utterance
    doc = nlp(utterance)
    # Check if any inappropriate words or phrases are present in the utterance
    for word in inappropriate_words:
        if word in [token.text.lower() for token in doc]:
            return 0
    # If no inappropriate words or phrases are found, return 1 as the appropriateness score
    return 1

def distinct_words(sentence,n):
    if len(sentence) == 0:
        return 0.0  # Prevent a zero division
    distinct_ngrams = set(nltk.ngrams(sentence.split(), n))
    return len(distinct_ngrams) / len(sentence)

def evaluate(model_name, predictions, ground_truth):
  bleu_1 = []
  bleu_2 = []
  maude = []
  dist_1 = []
  dist_2 = []
  # flu= []
  # appr = []

  # results['Model'].add(model_name)
  eval_res = []
  eval_res.append(model_name)
  smoothie = SmoothingFunction().method4
  
  for pred,gt in zip(predictions, ground_truth):
    pred1 = [pred.split()]
    bleu_1.append(sentence_bleu(pred1, gt.split(), weights=(1, 0, 0, 0),smoothing_function=smoothie))
    bleu_2.append(sentence_bleu(pred1, gt.split(), weights=(0, 1, 0, 0),smoothing_function=smoothie))
    maude.append(calculate_maude_score(pred)) 
    dist_1.append(distinct_words(pred,1))
    dist_2.append(distinct_words(pred, 2))
    # flu.append(calculate_fluency_score(pred))
    # appr.append(calculate_appropriateness_score(pred))
  
  eval_res.append(sum(bleu_1) / len(bleu_1)) 
  eval_res.append(sum(bleu_2) / len(bleu_2)) 
  eval_res.append(sum(maude) / len(maude)) 
  eval_res.append(sum(dist_1) / len(dist_1)) 
  eval_res.append(sum(dist_2) / len(dist_2)) 
  # eval_res.append(sum(flu) / len(flu)) 
  # eval_res.append(sum(appr) / len(appr)) 

  return eval_res

In [None]:
pred = ["this is a dog"]
gt = ["this is the dog"]
res = evaluate('T5', pred, gt)
res

['T5', 0.75, 0.3333333333333333, 0.0, 0.3076923076923077, 0.23076923076923078]

In [None]:
results_test = pd.DataFrame(columns =["Model", "Bleu-1", "Bleu-2","MAUDE", "Dist-1", "Dist-2"])
results_test.loc[len(results_test)] = res
results_test

Unnamed: 0,Model,Bleu-1,Bleu-2,MAUDE,Dist-1,Dist-2
0,T5,0.75,0.333333,0.0,0.307692,0.230769


In [None]:
calculate_maude_score('I am happiness')

0.16666666666666666

In [None]:
calculate_fluency_score("this is a dog.")
doc = nlp("this is a dog")
doc_flu = [token.prob for token in doc]
doc_flu

[-20.0, -20.0, -20.0, -20.0]

## Generating responses 

In [8]:
## input data
input_text = test_dataset['previous_utterance']

## Encoding input data using tokenizers 
gpt2_emo_encoded_query = tokenizer_gpt2_emo(input_text,return_tensors='pt', padding=True, truncation=True, max_length=64)
gpt2_noemo_encoded_query = tokenizer_gpt2_noemo(input_text,return_tensors='pt', padding=True, truncation=True, max_length=64)
t5_encoded_query = tokenizer(input_text,return_tensors='pt', padding=True, truncation=True, max_length=64)



In [None]:
##generate outputs for models
##
### Model 1: GPT2 without emotions
gpt2_noemo_res = []
gpt2_noemo_outs = model_gpt2_noemo.generate(input_ids=gpt2_noemo_encoded_query["input_ids"], 
                        attention_mask=gpt2_noemo_encoded_query["attention_mask"],
                        max_length=64,
                        early_stopping=True)
gpt2_noemo_outs = [tokenizer_gpt2_noemo.decode(ids,skip_special_tokens=True) for ids in gpt2_noemo_outs]
gpt2_noemo_res.extend(gpt2_noemo_outs)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Input length of input_ids is 128, but `max_length` is set to 128. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


In [None]:
### Model 2: GPT2 with emotions
gpt2_emo_res = []
gpt2_emo_outs = model_gpt2_emo.generate(input_ids=gpt2_emo_encoded_query["input_ids"], 
                        attention_mask=gpt2_emo_encoded_query["attention_mask"],
                        max_length=64,
                        early_stopping=True)
gpt2_emo_outs = [tokenizer_gpt2_emo.decode(ids,skip_special_tokens=True) for ids in gpt2_emo_outs]
gpt2_emo_res.extend(gpt2_emo_outs)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Input length of input_ids is 128, but `max_length` is set to 64. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


In [9]:
### Model 3: T5 without emotions
t5_noemo_res = []
t5_noemo_outs = model_t5_noemo.generate(input_ids=t5_encoded_query["input_ids"], 
                        attention_mask=t5_encoded_query["attention_mask"],
                        max_length=64,
                        early_stopping=True)
t5_noemo_outs = [tokenizer.decode(ids,skip_special_tokens=True) for ids in t5_noemo_outs]
t5_noemo_res.extend(t5_noemo_outs)

In [10]:
### Model 4: T5 with emotions
t5_emo_res = []
t5_emo_outs = model_t5_emo.generate(input_ids=t5_encoded_query["input_ids"], 
                        attention_mask=t5_encoded_query["attention_mask"],
                        max_length=64,
                        early_stopping=True)
t5_emo_outs = [tokenizer.decode(ids,skip_special_tokens=True) for ids in t5_emo_outs]
t5_emo_res.extend(t5_emo_outs)

### Evaluation results 

In [11]:
ground_truth = test_dataset['utterance']
model_results = pd.DataFrame(columns =["Model", "Bleu-1", "Bleu-2","MAUDE", "Dist-1", "Dist-2"])

# #Model 1 
# model1_name = 'GPT2_noemo'
# result1 = evaluate(model1_name, gpt2_noemo_res, ground_truth)
# model_results.loc[len(model_results)] = result1

# #Model 2 
# model2_name = 'GPT2_emo'
# result2 = evaluate(model2_name, gpt2_emo_res, ground_truth)
# model_results.loc[len(model_results)] = result2

#Model 3 
model3_name = 'T5_noemo'
result3 = evaluate(model3_name, t5_noemo_res, ground_truth)
model_results.loc[len(model_results)] = result3

#Model 3 
model4_name = 'T5_emo'
result4 = evaluate(model4_name, t5_emo_res, ground_truth)
model_results.loc[len(model_results)] = result4

In [12]:
model_results

Unnamed: 0,Model,Bleu-1,Bleu-2,MAUDE,Dist-1,Dist-2
0,T5_noemo,0.121661,0.03541,0.008817,0.213221,0.208173
1,T5_emo,0.122337,0.036172,0.00727,0.213194,0.211361


In [18]:
from google.colab import files
model_results.to_csv("model_results.csv", encoding = 'utf-8-sig')
files.download('model_results.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Evaluation

In [None]:
def evaluate_model(prompt, target, model, tokenizer):
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    output_ids = model.generate(input_ids, max_length=50, do_sample=True)
    generated_response = tokenizer.decode(output_ids.squeeze(), skip_special_tokens=True)
    
    # Compute BLEU score
    smoothie = SmoothingFunction().method4
    bleu_score = sentence_bleu([target.split()], generated_response.split(), smoothing_function=smoothie)
    
    # Compute perplexity
    input_ids = tokenizer.encode(prompt + generated_response, return_tensors='pt')
    with torch.no_grad():
        loss = model(input_ids, labels=input_ids)[0]
    perplexity = torch.exp(loss)
    
    # Compute self-BLEU score
    self_bleu_scores = []
    for i in range(4):
        generated_output = model.generate(input_ids, max_length=50, do_sample=True)
        self_bleu_scores.append(sentence_bleu([generated_response.split()], tokenizer.decode(generated_output.squeeze(), skip_special_tokens=True).split(), smoothing_function=smoothie))
    self_bleu_score = sum(self_bleu_scores) / len(self_bleu_scores)
    
    return bleu_score, perplexity.item(), self_bleu_score

# Evaluate the model on a subset of the EMPATHETICDIALOGUES dataset
subset = test_dataset[:10]
subset

#for example in subset:
 #   prompt = example['utterance']
  #  target = example['response']
   # bleu_score, perplexity, self_bleu_score = evaluate_model(prompt, target, model, tokenizer)
    #print(f'Prompt: {prompt}')
    #print(f'Target: {target}')
    #print(f'Generated response: {generated_response}')
    #print(f'BLEU score: {bleu_score:.2f}')
    #print(f'Perplexity: {perplexity:.2f}')
    #print(f'Self-BLEU score: {self_bleu_score:.2f}\n')


{'conv_id': ['hit:0_conv:0',
  'hit:0_conv:0',
  'hit:0_conv:0',
  'hit:0_conv:0',
  'hit:0_conv:0',
  'hit:34_conv:69',
  'hit:34_conv:69',
  'hit:34_conv:69',
  'hit:34_conv:69',
  'hit:37_conv:74'],
 'utterance_idx': [1, 2, 3, 4, 5, 1, 2, 3, 4, 1],
 'context': ['guilty',
  'guilty',
  'guilty',
  'guilty',
  'guilty',
  'caring',
  'caring',
  'caring',
  'caring',
  'lonely'],
 'prompt': ['i felt guilty when i was driving home one night and a person tried to fly into my lane, and did not see me. i honked and they swerved back into their lane, slammed on their brakes, and hit the water cones.',
  'i felt guilty when i was driving home one night and a person tried to fly into my lane, and did not see me. i honked and they swerved back into their lane, slammed on their brakes, and hit the water cones.',
  'i felt guilty when i was driving home one night and a person tried to fly into my lane, and did not see me. i honked and they swerved back into their lane, slammed on their brakes, 

##### function to generate responses from the model given a prompt

In [None]:
import torch

def generate_response(prompt):
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    output_ids = model_t5_emo.generate(input_ids, max_length=50, do_sample=True)
    response = tokenizer.decode(output_ids.squeeze(), skip_special_tokens=True)
    return response

In [None]:
df_test_dataset = test_dataset.to_pandas()
df_test_dataset.head()

Unnamed: 0,conv_id,utterance_idx,context,prompt,utterance,new_context,previous_utterance
0,hit:0_conv:0,1,guilty,i felt guilty when i was driving home one nigh...,yeah about years ago i had a horrifying experi...,disgusted,<|start|>
1,hit:0_conv:0,2,guilty,i felt guilty when i was driving home one nigh...,did you suffer any injuries?,disgusted,yeah about years ago i had a horrifying experi...
2,hit:0_conv:0,3,guilty,i felt guilty when i was driving home one nigh...,no i was not hit. it turned out they were drun...,disgusted,did you suffer any injuries?
3,hit:0_conv:0,4,guilty,i felt guilty when i was driving home one nigh...,why did you feel guilty? people really should ...,disgusted,no i was not hit. it turned out they were drun...
4,hit:0_conv:0,5,guilty,i felt guilty when i was driving home one nigh...,i do not know i was new to driving and had not...,disgusted,why did you feel guilty? people really should ...


In [None]:
# Print the number of rows and columns in the dataset
print(f"Number of rows: {df_test_dataset.shape[0]}, number of columns: {df_test_dataset.shape[1]}")

# Print the number of unique speakers in the dataset
print(f"Number of unique tags: {df_test_dataset['tags'].nunique()}")

# Print the number of unique utterances in the dataset
print(f"Number of unique utterances: {df_test_dataset['utterance'].nunique()}")

Number of rows: 79189, number of columns: 8
Number of unique tags: 10
Number of unique utterances: 77528


In [None]:
# Filter dataset to only include utterances from speakers with high self-reported empathy scores (>=4)
#high_empathy_df = df_test_dataset[df_test_dataset['selfeval'] >= 4]

# Group dataset by speaker and count the number of utterances
utterance_count_df = df_test_dataset.groupby('context').count()['utterance']

# Sort the speaker counts in descending order
utterance_count_df = utterance_count_df.sort_values(ascending=False)



In [None]:
# View the top 10 speakers by utterance count
print(utterance_count_df.head(10))

context
surprised      557
grateful       424
proud          421
sentimental    394
excited        388
annoyed        384
sad            374
disgusted      366
joyful         356
jealous        354
Name: utterance, dtype: int64


### BLEU score

In [None]:
from nltk.translate.bleu_score import sentence_bleu
def compute_bleu_score(prompt, target):
    generated_response = generate_response(prompt)
    return sentence_bleu([target], generated_response)

subset = df_test_dataset[:10]

for example in df_test_dataset:
    print(example)
    prompt = example['prompt']
    target = example['utterance']
    bleu_score = compute_bleu_score(prompt, target)
    print(f'Prompt: {prompt}')
    print(f'Target: {target}')
    print(f'Generated response: {generate_response(prompt)}')
    print(f'BLEU score: {bleu_score}\n')

c
u
c
p
u
n
p


In [None]:
#BLEU score
# Remove unnecessary columns
df = df_test_dataset[['utterance', 'emotion']]

# Preprocess text data
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and numbers
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Remove extra whitespace
    text = re.sub('\s+', ' ', text).strip()
    return text

df['utterance'] = df['utterance'].apply(preprocess_text)

# Split dataset into input and output arrays
X = np.array(df['utterance'])
y_true = np.array(df['utterance'])

# Load trained model
model = load_model('model.h5')

# Predict responses for test set
y_pred = model.predict(X)

# Convert predictions to text
y_pred_text = []
for pred in y_pred:
    pred_text = ' '.join([index_to_word[i] for i in pred])
    y_pred_text.append(pred_text)

# Calculate BLEU score for test set
references = [[true] for true in y_true]
candidates = [[pred] for pred in y_pred_text]
bleu_score = corpus_bleu(references, candidates)

print("BLEU score: ", bleu_score)

KeyError: ignored

In [None]:
#simple implementation of Bleu
# Group dataset by conversation and collect the utterances as lists
utterances_by_conv = df.groupby('conv_id')['utterance'].apply(list).tolist()

# Create a reference list of lists that contains the true responses for each conversation
ref_responses = [[conv[1:]] for conv in utterances_by_conv]

# Create a list of lists that contains the predicted responses for each conversation
# Here, we just predict the first utterance as the response for each conversation
pred_responses = [[conv[0]] for conv in utterances_by_conv]

# Calculate the BLEU score for the predicted responses compared to the true responses
bleu_score = corpus_bleu(ref_responses, pred_responses)

print(f"BLEU score: {bleu_score}")

### MAUDE score

In [None]:
#MAUDE score
import csv
import re

def calculate_maude_score(utterance):
    # Define empathy lexicon
    empathy_lexicon = {
        "affection": 1,
        "appreciation": 1,
        "approval": 1,
        "care": 1,
        "compassion": 1,
        "gratitude": 1,
        "love": 1,
        "pride": 1,
        "relief": 1,
        "calmness": 0.5,
        "confusion": 0.5,
        "curiosity": 0.5,
        "desire": 0.5,
        "excitement": 0.5,
        "fear": 0.5,
        "happiness": 0.5,
        "hope": 0.5,
        "interest": 0.5,
        "joy": 0.5,
        "nostalgia": 0.5,
        "pain": 0,
        "sadness": 0,
        "disappointment": -1,
        "disapproval": -1,
        "disgust": -1,
        "embarrassment": -1,
        "envy": -1,
        "fear": -1,
        "frustration": -1,
        "guilt": -1,
        "anger": -1,
        "hatred": -1,
        "hostility": -1,
        "irritation": -1,
        "jealousy": -1,
        "rage": -1,
        "shame": -1,
        "skepticism": -1
    }
    
    # Convert utterance to lowercase and remove punctuation
    utterance = re.sub(r'[^\w\s]', '', utterance.lower())
    
    # Split utterance into words
    words = utterance.split()
    
    # Calculate MAUDE score
    maude_score = sum(empathy_lexicon.get(word, 0) for word in words) / len(words)
    
    return maude_score

# Load dataset
dataset = []
with open('empathetic_dialogue_dataset.csv') as f:
    reader = csv.DictReader(f)
    for row in reader:
        dataset.append(row)

# Calculate MAUDE score for each utterance
maude_scores = []
for data in dataset:
    ground_truth_maude = calculate_maude_score(data['ground_truth'])
    model_response_maude = calculate_maude_score(data['model_response'])
    maude_scores.append((ground_truth_maude, model_response_maude))

# Calculate average MAUDE score
avg_ground_truth_maude = sum(score[0] for score in maude_scores) / len(maude_scores)
avg_model_response_maude = sum(score[1] for score in maude_scores) / len(maude_scores)

print("Average MAUDE score for ground truth utterances: ", avg_ground_truth_maude)
print("Average MAUDE score for model-generated utterances: ", avg_model_response_maude)


### Fluency score

In [None]:
#calculating fluency score

# Load English model for spaCy
nlp = spacy.load('en_core_web_sm')

def calculate_fluency_score(utterance):
    # Tokenize utterance
    doc = nlp(utterance)
    # Calculate average token probability using spaCy's token.prob attribute
    # A lower average token probability indicates a less fluent utterance
    fluency_score = sum([token.prob for token in doc]) / len(doc)
    return fluency_score

### Appropriateness score

In [None]:
# calculate appropriateness score
def calculate_appropriateness_score(utterance):
    # Create a list of inappropriate words or phrases to search for in the utterance
    inappropriate_words = ['hate', 'kill', 'stupid', 'ugly']
    # Tokenize utterance
    doc = nlp(utterance)
    # Check if any inappropriate words or phrases are present in the utterance
    for word in inappropriate_words:
        if word in [token.text.lower() for token in doc]:
            return 0
    # If no inappropriate words or phrases are found, return 1 as the appropriateness score
    return 1

In [None]:
# Calculate fluency and appropriateness scores for each utterance in the dataset
df['fluency_score'] = df['utterance'].apply(calculate_fluency_score)
df['appropriateness_score'] = df['utterance'].apply(calculate_appropriateness_score)

# Calculate the average fluency and appropriateness scores for the entire dataset
avg_fluency_score = df['fluency_score'].mean()
avg_appropriateness_score = df['appropriateness_score'].mean()

print(f"Average fluency score: {avg_fluency_score}")
print(f"Average appropriateness score: {avg_appropriateness_score}")

### Similarity metrics

###### load a pre-trained Word2Vec model
###### predict responses for the entire dataset using  trained model
###### calculate  cosine similarity between the word embeddings of the ground truth and predicted responses using cosine_similarity function.

In [None]:
df['utterance'] = df['utterance'].apply(preprocess_text)
df['response'] = df['response'].apply(preprocess_text)

# Load pre-trained Word2Vec model
w2v_model = Word2Vec.load('w2v_model.bin')

# Evaluate model on test set
cos_sim_scores = []
for i in range(len(df)):
    # Get ground truth and predicted responses
    true_response = df.iloc[i]['response']
    pred_response = model.predict(df.iloc[i]['utterance'])[0]
    pred_response = ' '.join([index_to_word[i] for i in pred_response])
    
    # Calculate cosine similarity between word embeddings of ground truth and predicted responses
    true_embedding = np.mean([w2v_model[word] for word in true_response.split()], axis=0)
    pred_embedding = np.mean([w2v_model[word] for word in pred_response.split()], axis=0)
    cos_sim_score = cosine_similarity([true_embedding], [pred_embedding])[0][0]
    
    cos_sim_scores.append(cos_sim_score)

# Calculate mean cosine similarity score for test set
mean_cos_sim_score = np.mean(cos_sim_scores)

print("Mean cosine similarity score: ", mean_cos_sim_score)
