# Implementation

## Step 1: Dataset Prepration

In [None]:
import pandas as pd

### Load the dataset

In [None]:
df = pd.read_csv("MeQSum_ACL2019_BenAbacha_Demner-Fushman.xlsx")

### Data Preprocessing

In [10]:
import re

In [11]:
def clean_text(text):
    
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)  
    text = re.sub(r'\s+', ' ', text).strip()  # Replace multiple spaces with a single space
    text = text.lower()  # Convert to lowercase
    
    return text

In [146]:
# Apply the function to both columns
df['CHQ'] = df['CHQ'].apply(clean_text)
df['Summary'] = df['Summary'].apply(clean_text)

In [147]:
# Remove extremely short or extremely long sentences
df = df[df['CHQ'].apply(lambda x: len(x.split()) >= 3)]
df = df[df['Summary'].apply(lambda x: len(x.split()) >= 3)]

df = df[df['CHQ'].apply(lambda x: len(x.split()) <= 80)]
df = df[df['Summary'].apply(lambda x: len(x.split()) <= 80)]

In [148]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 787 entries, 0 to 999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   File     787 non-null    object
 1   CHQ      787 non-null    object
 2   Summary  787 non-null    object
dtypes: object(3)
memory usage: 24.6+ KB


26 rows are removed after cleaning.

In [149]:
from nltk.corpus import stopwords

In [150]:
# Remove stopwords which won't be informative for the model
stop_words = set(stopwords.words("english"))
df['CHQ'] = df['CHQ'].apply(lambda text: ' '.join([word for word in text.split() if word not in stop_words]))
df['Summary'] = df['Summary'].apply(lambda text: ' '.join([word for word in text.split() if word not in stop_words]))

In [151]:
df.head()

Unnamed: 0,File,CHQ,Summary
0,1-131188152.xml.txt,subject get cetirizine message needwant know m...,manufactures cetirizine
2,1-131985747.xml.txt,subject nulytely message hello tell order nuly...,makes nulytely buy
3,15410.txt,williams syndrome would like daughter tested w...,get genetic testing williams syndrome
4,35.txt,clinicaltrialsgov question general information...,get genetic testing multiple myeloma cost
5,21.txt,genetic test ihhs heart condition commercial g...,get genetic testing ihss texas


## Step 2: Round-Trip Translation

### Translation using a machine translation model

In [17]:
# Using pre-trained MarianMT model for translation
from transformers import MarianMTModel, MarianTokenizer

In [163]:
# Load pretrained model and tokenizer for the translation
def load_translation_model(source: str, dest: str)-> tuple:
    
    model_name = f"Helsinki-NLP/opus-mt-{source}-{dest}"
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    
    model = MarianMTModel.from_pretrained(model_name)
   
    
    return model, tokenizer

In [164]:
# Function to Translate to Pivot Languages and Back
def translate(text: list, model: MarianMTModel, tokenizer: MarianTokenizer)-> str:
    
    tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    translated = model.generate(**tokens)
    
    return tokenizer.decode(translated[0], skip_special_tokens=True)

In [11]:
# Setting up the computer to use the GPU
import torch

device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print(f"Running on: {device}")

Running on: mps


In [None]:
rtt_questions = []
languages = ['es', 'de', 'it', 'zh', 'fr']

counter = 1
for question in df['CHQ']:
   
    print(f"Processing question {counter}...")
    counter += 1
    paraphrases = []
    for lang in languages:
        
        # Load models for each pivot language dynamically
        forward_model, forward_tokenizer = load_translation_model("en", lang)
        backward_model, backward_tokenizer = load_translation_model(lang, "en")

        # Forward and backward translation
        translated_text = translate(question, forward_model, forward_tokenizer)
        round_trip_text = translate(translated_text, backward_model, backward_tokenizer)

        paraphrases.append(round_trip_text)
    rtt_questions.append(paraphrases)

df['CHQ_paraphrases'] = rtt_questions

In [166]:
print("Original Question: ",df.CHQ[0])
print("After translating to five given languages: ",df.CHQ_paraphrases[0])

Original Question:  subject get cetirizine message needwant know manufscturs cetirizine walmart looking new supply getting recent
After translating to five given languages:  ['subject get cetirizine message needwant know manufscturs cetirizine walmart looking new supply getting recent', 'Subject get cetirizine message need to know manufactures cetirizine walmart looking for new care get new', 'subject get cetirillina message needy know manufacturers cetirillina Walmart looking for new ever recent supply', 'Retrieving cellirizine information needs to know that new supplies are being sought.', 'topic get cetirizine message need to know manufscturs cetirizine walmart search new offer get recent']


### Translate using Google Translate API

In [16]:
from deep_translator import GoogleTranslator

In [None]:
def google(question, pivot_language):
    
    translated = GoogleTranslator(source='en', target=pivot_language).translate(question)
    back_translated = GoogleTranslator(source=pivot_language, target='en').translate(translated)
    return back_translated

google_languages = ['es', 'de', 'it', 'zh-CN', 'fr']

df['CHQ_google_paraphrases'] = df['CHQ'].apply(lambda x: [google(x, lang) for lang in google_languages])


In [169]:
df.head()

Unnamed: 0,File,CHQ,Summary,CHQ_paraphrases,CHQ_google_paraphrases
0,1-131188152.xml.txt,subject get cetirizine message needwant know m...,manufactures cetirizine,['subject get cetirizine message needwant know...,['Subject Get Cetirizine Message Need Want To ...
1,1-131985747.xml.txt,subject nulytely message hello tell order nuly...,makes nulytely buy,['Nulytely issue message hello say order nulyt...,['Subject nulllytely message hello tell me ord...
2,15410.txt,williams syndrome would like daughter tested w...,get genetic testing williams syndrome,['Williams syndrome as a daughter williams syn...,['Williams Syndrome I would like my daughter t...
3,35.txt,clinicaltrialsgov question general information...,get genetic testing multiple myeloma cost,['clinicaltrialsgov asks general information d...,"[""Question from Clinicaltrialsgov General info..."
4,21.txt,genetic test ihhs heart condition commercial g...,get genetic testing ihss texas,['genetic test ihhs heart disease commercial g...,['IHHS Heart Disease Genetic Testing Commercia...


In [170]:
print("Translation by pretrained model: ",df.CHQ_paraphrases[0])
print("Translation by Google Translate: ",df.CHQ_google_paraphrases[0])

Translation by pretrained model:  ['subject get cetirizine message needwant know manufscturs cetirizine walmart looking new supply getting recent', 'Subject get cetirizine message need to know manufactures cetirizine walmart looking for new care get new', 'subject get cetirillina message needy know manufacturers cetirillina Walmart looking for new ever recent supply', 'Retrieving cellirizine information needs to know that new supplies are being sought.', 'topic get cetirizine message need to know manufscturs cetirizine walmart search new offer get recent']
Translation by Google Translate:  ['Subject Get Cetirizine Message Need Want To Know Manufacturers Cetirizine Walmart Looking For New Supply Get Recent', 'Subject: Cetirizine received message needwant to know manufacturer Cetirizine Walmart looking for new offer always up to date', 'object get cetirizine message needwant to know manufacturers cetirizine walmart looking new supply get recent', 'Topics Get Cetirizine Information Need t

## Question Selection

### Using FQD to select a subset of the new dataset

In [26]:
from transformers import BertTokenizer, BertModel

In [40]:
# Using Bert pretrained model for embedding both original and paraphrased questions.
# Load the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")


In [None]:
import numpy as np
from torch.nn.functional import cosine_similarity

In [53]:
import ast

In [56]:
# Function to embed text using the [CLS] token
def embed(text: str) -> torch.Tensor:
    
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    
    return outputs.last_hidden_state[:, 0, :].squeeze()


In [57]:
# FQD calculation using cosine similarity
def fqd(original_embedding: torch.Tensor, rtt_embedding: torch.Tensor) -> float:
   
    # Normalize the embeddings
    original_embedding = original_embedding / original_embedding.norm(p=2)
    rtt_embedding = rtt_embedding / rtt_embedding.norm(p=2)
    
    similarity = cosine_similarity(original_embedding.unsqueeze(0), rtt_embedding.unsqueeze(0))
    
    return 1 - similarity.item()

In [None]:
# Normalization function
def normalize_fqd(fqd_scores: list) -> list:
    
    if len(fqd_scores) == 0:
        return []
    
    fqd_min, fqd_max = np.min(fqd_scores), np.max(fqd_scores)
    
    if fqd_max - fqd_min == 0:
        return [0.0 for _ in fqd_scores]
    
    return [(fqd - fqd_min) / (fqd_max - fqd_min) for fqd in fqd_scores]

### Question Selection on Paraphrased questions generated by pretrained model.

In [61]:
# Process the DataFrame rows
fqd_scores = []

for index, row in df.iterrows():
    
    original_embedding = embed(row['CHQ'])
    
    # Get the paraphrases (convert from string if necessary)
    paraphrases = row['CHQ_paraphrases']
    if isinstance(paraphrases, str):
            paraphrases = ast.literal_eval(paraphrases)

    
    if not isinstance(paraphrases, list) or not paraphrases:
        fqd_scores.append([])
        continue
    
    # Compute embeddings and FQD scores for each paraphrase
    paraphrased_embeddings = [embed(paraphrase) for paraphrase in paraphrases]
    row_scores = [fqd(original_embedding, rtt_embedding) for rtt_embedding in paraphrased_embeddings]
    fqd_scores.append(row_scores)

# Normalize FQD scores per row
normalized_fqd_scores = [normalize_fqd(scores) if scores else [] for scores in fqd_scores]

# Add the results to the DataFrame
df['FQD_scores_MarianMT'] = pd.Series(normalized_fqd_scores)

In [None]:
df.head()

Unnamed: 0,File,CHQ,Summary,CHQ_paraphrases,CHQ_google_paraphrases,FQD_scores_MarianMT
0,1-131188152.xml.txt,subject get cetirizine message needwant know m...,manufactures cetirizine,['subject get cetirizine message needwant know...,['Subject Get Cetirizine Message Need Want To ...,"[0.0, 0.48486472304708406, 0.5633985668165916,..."
1,1-131985747.xml.txt,subject nulytely message hello tell order nuly...,makes nulytely buy,['Nulytely issue message hello say order nulyt...,['Subject nulllytely message hello tell me ord...,"[0.3234609707794634, 0.0, 0.36769626709513487,..."
2,15410.txt,williams syndrome would like daughter tested w...,get genetic testing williams syndrome,['Williams syndrome as a daughter williams syn...,['Williams Syndrome I would like my daughter t...,"[0.28597393014607475, 0.0, 0.2429463484297442,..."
3,35.txt,clinicaltrialsgov question general information...,get genetic testing multiple myeloma cost,['clinicaltrialsgov asks general information d...,"[""Question from Clinicaltrialsgov General info...","[0.0145147587408779, 0.2677372835203137, 0.072..."
4,21.txt,genetic test ihhs heart condition commercial g...,get genetic testing ihss texas,['genetic test ihhs heart disease commercial g...,['IHHS Heart Disease Genetic Testing Commercia...,"[0.33930216271550445, 0.04054892368388036, 0.0..."


### Question Selection on Paraphrased questions generated by Google Translate model.

In [63]:
# Process the DataFrame rows
fqd_scores = []

for index, row in df.iterrows():
    
    original_embedding = embed(row['CHQ'])
    
    # Get the paraphrases (convert from string if necessary)
    paraphrases = row['CHQ_google_paraphrases']
    if isinstance(paraphrases, str):
            paraphrases = ast.literal_eval(paraphrases)

    
    if not isinstance(paraphrases, list) or not paraphrases:
        fqd_scores.append([])
        continue
    
    # Compute embeddings and FQD scores for each paraphrase
    paraphrased_embeddings = [embed(paraphrase) for paraphrase in paraphrases]
    row_scores = [fqd(original_embedding, rtt_embedding) for rtt_embedding in paraphrased_embeddings]
    fqd_scores.append(row_scores)

# Normalize FQD scores per row
normalized_fqd_scores = [normalize_fqd(scores) if scores else [] for scores in fqd_scores]

# Add the results to the DataFrame
df['FQD_scores_MarianMT_Google'] = pd.Series(normalized_fqd_scores)

In [86]:
df.head()

Unnamed: 0,File,CHQ,Summary,CHQ_paraphrases,CHQ_google_paraphrases,FQD_scores_MarianMT,FQD_scores_Google
0,1-131188152.xml.txt,subject get cetirizine message needwant know m...,manufactures cetirizine,[subject get cetirizine message needwant know ...,[Subject Get Cetirizine Message Need Want To K...,"[0.0, 0.42345947, 0.39539498, 1.0, 0.40205756]","[0.39219925, 0.48206365, 0.0, 1.0, 0.81443405]"
1,1-131985747.xml.txt,subject nulytely message hello tell order nuly...,makes nulytely buy,[Nulytely issue message hello say order nulyte...,[Subject nulllytely message hello tell me orde...,"[0.15415767, 0.0, 0.109911725, 1.0, 0.31448457]","[0.38535395, 0.14393179, 0.15699339, 0.0, 1.0]"
2,15410.txt,williams syndrome would like daughter tested w...,get genetic testing williams syndrome,[Williams syndrome as a daughter williams synd...,[Williams Syndrome I would like my daughter to...,"[0.38985512, 0.0, 0.18084231, 1.0, 0.4739606]","[0.9740065, 0.0, 0.5546767, 1.0, 0.65278566]"
3,35.txt,clinicaltrialsgov question general information...,get genetic testing multiple myeloma cost,[clinicaltrialsgov asks general information de...,[Question from Clinicaltrialsgov General infor...,"[0.0, 0.05507936, 0.040062625, 1.0, 0.00857666]","[1.0, 0.10718062, 0.0, 0.81891173, 0.3620564]"
4,21.txt,genetic test ihhs heart condition commercial g...,get genetic testing ihss texas,[genetic test ihhs heart disease commercial ge...,[IHHS Heart Disease Genetic Testing Commercial...,"[0.12911317, 0.0, 0.020611322, 1.0, 0.042890523]","[0.88640755, 0.091882594, 0.034803294, 1.0, 0.0]"


In [64]:
# Save the new dataframe containing paraphrases to a CSV file
df.to_csv("MeQSum_ACL2019_BenAbacha_Demner-Fushman.csv", index=False)

### Using PRQD to select a subset of the new dataset

In [71]:
from sentence_transformers import SentenceTransformer
from torch.nn.functional import softmax

In [66]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [68]:
# Embedding function for PRQD selection approach
def embed_2(text: str) -> torch.Tensor:
    inputs = model.encode([text], convert_to_tensor=True)
    return inputs[0]  # Return the embedding tensor

In [80]:

#  Convert an embedding into a probability distribution over its dimensions using softmax function.
def embedding_to_distribution(embedding: torch.Tensor) -> np.ndarray:

    # Ensure embedding is 1D
    if embedding.dim() != 1:
        embedding = embedding.squeeze()
   
    distribution = softmax(embedding, dim=0)
    # Detach, move to CPU, and convert to NumPy for element-wise operations
    return distribution.cpu().detach().numpy()


In [81]:
def prqd_distribution(ref_embedding: torch.Tensor,
            cand_embedding: torch.Tensor, alpha_values: list) -> float:
   
    best_f1 = 0.0

    # Convert embeddings to distributions over their dimensions
    hQ = embedding_to_distribution(ref_embedding)
    hQ_hat = embedding_to_distribution(cand_embedding)

    # Iterate over the provided alpha values
    for alpha in alpha_values:

        precision = np.sum(np.minimum(alpha * hQ, hQ_hat))
        recall = np.sum(np.minimum(hQ, hQ_hat / alpha))
        
        # Compute F1 (harmonic mean of precision and recall) if possible
        if (precision + recall) > 0:
            f1 = 2 * (precision * recall) / (precision + recall)
        else:
            f1 = 0.0

        # Keep track of the best F1 score over the alpha grid
        best_f1 = max(best_f1, f1)

    return best_f1

### Question Selection on Paraphrased questions generated by pretrained model.

In [83]:
# Define a grid of values for alpha
alpha_values = np.linspace(0.1, 10, 50)

# List to hold the PRQD scores for all rows
prqd_scores_all = []

for idx, row in df.iterrows():
    original_text = row['CHQ']
    paraphrases = row['CHQ_paraphrases']
    
    # If paraphrases are stored as a string, convert them to a list
    if isinstance(paraphrases, str):
        paraphrases = eval(paraphrases)
    
    # If no valid paraphrase list, append an empty list for this row
    if not isinstance(paraphrases, list) or len(paraphrases) == 0:
        prqd_scores_all.append([])
        continue
    
    # Compute the embedding for the original (gold) question
    original_embedding = embed_2(original_text)
    
    # List to store the PRQD score for each paraphrase for this row
    row_scores = []
    for paraphrase in paraphrases:
        # Compute the embedding for the candidate paraphrase
        candidate_embedding = embed_2(paraphrase)
        # Compute the PRQD score using our new distribution-based function
        score = prqd_distribution(original_embedding, candidate_embedding, alpha_values)
        row_scores.append(score)
    
    prqd_scores_all.append(row_scores)


df['PRQD_scores_MarianMT'] = pd.Series(prqd_scores_all)


In [87]:
df.head()

Unnamed: 0,File,CHQ,Summary,CHQ_paraphrases,CHQ_google_paraphrases,FQD_scores_MarianMT,FQD_scores_Google,PRQD_scores_MarianMT
0,1-131188152.xml.txt,subject get cetirizine message needwant know m...,manufactures cetirizine,['subject get cetirizine message needwant know...,['Subject Get Cetirizine Message Need Want To ...,"[0.0, 0.48486472304708406, 0.5633985668165916,...","[0.06004893404528524, 0.14389553856288115, 0.0...","[0.9518717811990748, 0.9518717811990748, 0.951..."
1,1-131985747.xml.txt,subject nulytely message hello tell order nuly...,makes nulytely buy,['Nulytely issue message hello say order nulyt...,['Subject nulllytely message hello tell me ord...,"[0.3234609707794634, 0.0, 0.36769626709513487,...","[0.11731507387433318, 0.2813033475907, 0.0, 0....","[0.9518717526094639, 0.9518717526094639, 0.951..."
2,15410.txt,williams syndrome would like daughter tested w...,get genetic testing williams syndrome,['Williams syndrome as a daughter williams syn...,['Williams Syndrome I would like my daughter t...,"[0.28597393014607475, 0.0, 0.2429463484297442,...","[0.9909218674200269, 0.0, 0.5261416925872023, ...","[0.9518717964911924, 0.9518717964911924, 0.951..."
3,35.txt,clinicaltrialsgov question general information...,get genetic testing multiple myeloma cost,['clinicaltrialsgov asks general information d...,"[""Question from Clinicaltrialsgov General info...","[0.0145147587408779, 0.2677372835203137, 0.072...","[1.0, 0.20900258778387543, 0.0, 0.816339847409...","[0.9518716380293955, 0.9518716380293955, 0.951..."
4,21.txt,genetic test ihhs heart condition commercial g...,get genetic testing ihss texas,['genetic test ihhs heart disease commercial g...,['IHHS Heart Disease Genetic Testing Commercia...,"[0.33930216271550445, 0.04054892368388036, 0.0...","[0.7831301658652918, 0.2697042776688785, 0.088...","[0.9518717468472168, 0.9518717468472168, 0.951..."


### Question Selection on Paraphrased questions generated by Google Translate.

In [88]:
# Define a grid of values for alpha
alpha_values = np.linspace(0.1, 10, 50)

# List to hold the PRQD scores for all rows
prqd_scores_all = []

for idx, row in df.iterrows():
    original_text = row['CHQ']
    paraphrases = row['CHQ_google_paraphrases']
    
    # If paraphrases are stored as a string, convert them to a list
    if isinstance(paraphrases, str):
        paraphrases = eval(paraphrases)
    
    # If no valid paraphrase list, append an empty list for this row
    if not isinstance(paraphrases, list) or len(paraphrases) == 0:
        prqd_scores_all.append([])
        continue
    
    # Compute the embedding for the original (gold) question
    original_embedding = embed_2(original_text)
    
    # List to store the PRQD score for each paraphrase for this row
    row_scores = []
    for paraphrase in paraphrases:
        # Compute the embedding for the candidate paraphrase
        candidate_embedding = embed_2(paraphrase)
        # Compute the PRQD score using our new distribution-based function
        score = prqd_distribution(original_embedding, candidate_embedding, alpha_values)
        row_scores.append(score)
    
    prqd_scores_all.append(row_scores)


df['PRQD_scores_Google'] = pd.Series(prqd_scores_all)


In [89]:
df.head()

Unnamed: 0,File,CHQ,Summary,CHQ_paraphrases,CHQ_google_paraphrases,FQD_scores_MarianMT,FQD_scores_Google,PRQD_scores_MarianMT,PRQD_scores_Google
0,1-131188152.xml.txt,subject get cetirizine message needwant know m...,manufactures cetirizine,['subject get cetirizine message needwant know...,['Subject Get Cetirizine Message Need Want To ...,"[0.0, 0.48486472304708406, 0.5633985668165916,...","[0.06004893404528524, 0.14389553856288115, 0.0...","[0.9518717811990748, 0.9518717811990748, 0.951...","[0.9518717811990748, 0.9518717811990748, 0.951..."
1,1-131985747.xml.txt,subject nulytely message hello tell order nuly...,makes nulytely buy,['Nulytely issue message hello say order nulyt...,['Subject nulllytely message hello tell me ord...,"[0.3234609707794634, 0.0, 0.36769626709513487,...","[0.11731507387433318, 0.2813033475907, 0.0, 0....","[0.9518717526094639, 0.9518717526094639, 0.951...","[0.9518044117643911, 0.9518717526094639, 0.951..."
2,15410.txt,williams syndrome would like daughter tested w...,get genetic testing williams syndrome,['Williams syndrome as a daughter williams syn...,['Williams Syndrome I would like my daughter t...,"[0.28597393014607475, 0.0, 0.2429463484297442,...","[0.9909218674200269, 0.0, 0.5261416925872023, ...","[0.9518717964911924, 0.9518717964911924, 0.951...","[0.9518717964911924, 0.9518717964911924, 0.951..."
3,35.txt,clinicaltrialsgov question general information...,get genetic testing multiple myeloma cost,['clinicaltrialsgov asks general information d...,"[""Question from Clinicaltrialsgov General info...","[0.0145147587408779, 0.2677372835203137, 0.072...","[1.0, 0.20900258778387543, 0.0, 0.816339847409...","[0.9518716380293955, 0.9518716380293955, 0.951...","[0.9518716380293955, 0.9518716380293955, 0.951..."
4,21.txt,genetic test ihhs heart condition commercial g...,get genetic testing ihss texas,['genetic test ihhs heart disease commercial g...,['IHHS Heart Disease Genetic Testing Commercia...,"[0.33930216271550445, 0.04054892368388036, 0.0...","[0.7831301658652918, 0.2697042776688785, 0.088...","[0.9518717468472168, 0.9518717468472168, 0.951...","[0.9518565857349672, 0.9518717468472168, 0.951..."


In [102]:
# Convert the CHQ_paraphrases column from string to list since it is stored as a string.
def parse_paraphrases(entry):
    if isinstance(entry, str):
       
        return ast.literal_eval(entry)
       
    return entry

### Selecting subsets of parashrases generated by MarianMT.

In [117]:
# Store the parsed paraphrases in a new column
df["CHQ_paraphrases_parsed"] = df["CHQ_paraphrases"].apply(parse_paraphrases)

# Define thresholds for FQD and PRQD scores
FQD_MIN_THRESHOLD = 0.05
FQD_MAX_THRESHOLD = 0.8
PRQD_MIN_THRESHOLD = 0.8
PRQD_MAX_THRESHOLD = 0.99

optimal_paraphrase_indices = []

for fqd_list, prqd_list in zip(df["FQD_scores_MarianMT"], df["PRQD_scores_MarianMT"]):
    subset_indices = []
    # Iterate and select indices within optimal range
    for idx, (fqd, prqd) in enumerate(zip(fqd_list, prqd_list)):
        if FQD_MAX_THRESHOLD >= fqd >= FQD_MIN_THRESHOLD and PRQD_MIN_THRESHOLD <= prqd <= PRQD_MAX_THRESHOLD:
            subset_indices.append(idx)
    optimal_paraphrase_indices.append(subset_indices)

# Now, use the parsed lists to select the optimal paraphrases
df["Optimal_Paraphrases_MarianMT"] = [
    [df.loc[i, "CHQ_paraphrases_parsed"][idx] for idx in indices]
    for i, indices in enumerate(optimal_paraphrase_indices)
]


In [118]:
df.head()

Unnamed: 0,File,CHQ,Summary,CHQ_paraphrases,CHQ_google_paraphrases,FQD_scores_MarianMT,FQD_scores_Google,PRQD_scores_MarianMT,PRQD_scores_Google,CHQ_paraphrases_parsed,Optimal_Paraphrases_MarianMT
0,1-131188152.xml.txt,subject get cetirizine message needwant know m...,manufactures cetirizine,['subject get cetirizine message needwant know...,['Subject Get Cetirizine Message Need Want To ...,"[0.0, 0.48486472304708406, 0.5633985668165916,...","[0.06004893404528524, 0.14389553856288115, 0.0...","[0.9518717811990748, 0.9518717811990748, 0.951...","[0.9518717811990748, 0.9518717811990748, 0.951...",[subject get cetirizine message needwant know ...,[Subject get cetirizine message need to know m...
1,1-131985747.xml.txt,subject nulytely message hello tell order nuly...,makes nulytely buy,['Nulytely issue message hello say order nulyt...,['Subject nulllytely message hello tell me ord...,"[0.3234609707794634, 0.0, 0.36769626709513487,...","[0.11731507387433318, 0.2813033475907, 0.0, 0....","[0.9518717526094639, 0.9518717526094639, 0.951...","[0.9518044117643911, 0.9518717526094639, 0.951...",[Nulytely issue message hello say order nulyte...,[Nulytely issue message hello say order nulyte...
2,15410.txt,williams syndrome would like daughter tested w...,get genetic testing williams syndrome,['Williams syndrome as a daughter williams syn...,['Williams Syndrome I would like my daughter t...,"[0.28597393014607475, 0.0, 0.2429463484297442,...","[0.9909218674200269, 0.0, 0.5261416925872023, ...","[0.9518717964911924, 0.9518717964911924, 0.951...","[0.9518717964911924, 0.9518717964911924, 0.951...",[Williams syndrome as a daughter williams synd...,[Williams syndrome as a daughter williams synd...
3,35.txt,clinicaltrialsgov question general information...,get genetic testing multiple myeloma cost,['clinicaltrialsgov asks general information d...,"[""Question from Clinicaltrialsgov General info...","[0.0145147587408779, 0.2677372835203137, 0.072...","[1.0, 0.20900258778387543, 0.0, 0.816339847409...","[0.9518716380293955, 0.9518716380293955, 0.951...","[0.9518716380293955, 0.9518716380293955, 0.951...",[clinicaltrialsgov asks general information de...,[clinicaltrialsgov Question general informatio...
4,21.txt,genetic test ihhs heart condition commercial g...,get genetic testing ihss texas,['genetic test ihhs heart disease commercial g...,['IHHS Heart Disease Genetic Testing Commercia...,"[0.33930216271550445, 0.04054892368388036, 0.0...","[0.7831301658652918, 0.2697042776688785, 0.088...","[0.9518717468472168, 0.9518717468472168, 0.951...","[0.9518565857349672, 0.9518717468472168, 0.951...",[genetic test ihhs heart disease commercial ge...,[genetic test ihhs heart disease commercial ge...


### Selecting subsets of parashrases generated by Google Translate.

In [122]:
# Store the parsed paraphrases in a new column
df["CHQ_google_paraphrases_parsed"] = df["CHQ_google_paraphrases"].apply(parse_paraphrases)

# Define thresholds for FQD and PRQD scores
FQD_MIN_THRESHOLD = 0.05
FQD_MAX_THRESHOLD = 0.8
PRQD_MIN_THRESHOLD = 0.8
PRQD_MAX_THRESHOLD = 0.99

optimal_paraphrase_indices = []

for fqd_list, prqd_list in zip(df["FQD_scores_Google"], df["PRQD_scores_Google"]):
    subset_indices = []
    # Iterate and select indices within optimal range
    for idx, (fqd, prqd) in enumerate(zip(fqd_list, prqd_list)):
        if FQD_MAX_THRESHOLD >= fqd >= FQD_MIN_THRESHOLD and PRQD_MIN_THRESHOLD <= prqd <= PRQD_MAX_THRESHOLD:
            subset_indices.append(idx)
    optimal_paraphrase_indices.append(subset_indices)

# Now, use the parsed lists to select the optimal paraphrases
df["Optimal_Paraphrases_Google"] = [
    [df.loc[i, "CHQ_google_paraphrases_parsed"][idx] for idx in indices]
    for i, indices in enumerate(optimal_paraphrase_indices)
]

In [123]:
df.head()

Unnamed: 0,File,CHQ,Summary,CHQ_paraphrases,CHQ_google_paraphrases,FQD_scores_MarianMT,FQD_scores_Google,PRQD_scores_MarianMT,PRQD_scores_Google,CHQ_paraphrases_parsed,Optimal_Paraphrases_MarianMT,CHQ_google_paraphrases_parsed,Optimal_Paraphrases_Google
0,1-131188152.xml.txt,subject get cetirizine message needwant know m...,manufactures cetirizine,['subject get cetirizine message needwant know...,['Subject Get Cetirizine Message Need Want To ...,"[0.0, 0.48486472304708406, 0.5633985668165916,...","[0.06004893404528524, 0.14389553856288115, 0.0...","[0.9518717811990748, 0.9518717811990748, 0.951...","[0.9518717811990748, 0.9518717811990748, 0.951...",[subject get cetirizine message needwant know ...,[Subject get cetirizine message need to know m...,[Subject Get Cetirizine Message Need Want To K...,[Subject Get Cetirizine Message Need Want To K...
1,1-131985747.xml.txt,subject nulytely message hello tell order nuly...,makes nulytely buy,['Nulytely issue message hello say order nulyt...,['Subject nulllytely message hello tell me ord...,"[0.3234609707794634, 0.0, 0.36769626709513487,...","[0.11731507387433318, 0.2813033475907, 0.0, 0....","[0.9518717526094639, 0.9518717526094639, 0.951...","[0.9518044117643911, 0.9518717526094639, 0.951...",[Nulytely issue message hello say order nulyte...,[Nulytely issue message hello say order nulyte...,[Subject nulllytely message hello tell me orde...,[Subject nulllytely message hello tell me orde...
2,15410.txt,williams syndrome would like daughter tested w...,get genetic testing williams syndrome,['Williams syndrome as a daughter williams syn...,['Williams Syndrome I would like my daughter t...,"[0.28597393014607475, 0.0, 0.2429463484297442,...","[0.9909218674200269, 0.0, 0.5261416925872023, ...","[0.9518717964911924, 0.9518717964911924, 0.951...","[0.9518717964911924, 0.9518717964911924, 0.951...",[Williams syndrome as a daughter williams synd...,[Williams syndrome as a daughter williams synd...,[Williams Syndrome I would like my daughter to...,[williams syndrome i would like my daughter to...
3,35.txt,clinicaltrialsgov question general information...,get genetic testing multiple myeloma cost,['clinicaltrialsgov asks general information d...,"[""Question from Clinicaltrialsgov General info...","[0.0145147587408779, 0.2677372835203137, 0.072...","[1.0, 0.20900258778387543, 0.0, 0.816339847409...","[0.9518716380293955, 0.9518716380293955, 0.951...","[0.9518716380293955, 0.9518716380293955, 0.951...",[clinicaltrialsgov asks general information de...,[clinicaltrialsgov Question general informatio...,[Question from Clinicaltrialsgov General infor...,[clinicaltrialsgov question general informatio...
4,21.txt,genetic test ihhs heart condition commercial g...,get genetic testing ihss texas,['genetic test ihhs heart disease commercial g...,['IHHS Heart Disease Genetic Testing Commercia...,"[0.33930216271550445, 0.04054892368388036, 0.0...","[0.7831301658652918, 0.2697042776688785, 0.088...","[0.9518717468472168, 0.9518717468472168, 0.951...","[0.9518565857349672, 0.9518717468472168, 0.951...",[genetic test ihhs heart disease commercial ge...,[genetic test ihhs heart disease commercial ge...,[IHHS Heart Disease Genetic Testing Commercial...,[IHHS Heart Disease Genetic Testing Commercial...


## Summeriztion

We will use the T5 Model and Tokenizer to generate summaries.

In [124]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [125]:
# Load the model
model_ = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_)
model = T5ForConditionalGeneration.from_pretrained(model_)

In [126]:
# Summary generator function using T5 pretrained model
def generate_single_summary(texts, is_singular: bool = False):
    
    max_length = 10
    min_length = 1
    
    summaries = []
    counter = 1
    for text in texts:
        
        if (is_singular):
            print(f"Generating summary for question {counter}")
            counter += 1
            
        if not isinstance(text, str) or not text.strip():
            summaries.append("")
            continue
       
        input_ids = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
        output_ids = model.generate(input_ids, max_length=max_length, min_length=min_length, 
                                     length_penalty=2.0, num_beams=4, early_stopping=True)
        summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        summaries.append(summary)
        
    return summaries

#### Generating summaries from raw questions.

In [127]:
df["Raw_CHQ_Summeries"] = generate_single_summary(df["CHQ"], True)

Generating summary for question 1
Generating summary for question 2
Generating summary for question 3
Generating summary for question 4
Generating summary for question 5
Generating summary for question 6
Generating summary for question 7
Generating summary for question 8
Generating summary for question 9
Generating summary for question 10
Generating summary for question 11
Generating summary for question 12
Generating summary for question 13
Generating summary for question 14
Generating summary for question 15
Generating summary for question 16
Generating summary for question 17
Generating summary for question 18
Generating summary for question 19
Generating summary for question 20
Generating summary for question 21
Generating summary for question 22
Generating summary for question 23
Generating summary for question 24
Generating summary for question 25
Generating summary for question 26
Generating summary for question 27
Generating summary for question 28
Generating summary for questi

#### Generating summeries for paraphrased questions generated by MarianMT

In [128]:
df['CHQ_paraphrase_summaries_MarianMT'] = df['Optimal_Paraphrases_MarianMT'].apply(lambda paraphrases: generate_single_summary(paraphrases))

#### Generating summeries for paraphrased questions generated by Google Translate

In [129]:
df['CHQ_paraphrase_summaries_Google'] = df['Optimal_Paraphrases_Google'].apply(lambda paraphrases: generate_single_summary(paraphrases))

In [131]:
df.head()

Unnamed: 0,File,CHQ,Summary,CHQ_paraphrases,CHQ_google_paraphrases,FQD_scores_MarianMT,FQD_scores_Google,PRQD_scores_MarianMT,PRQD_scores_Google,CHQ_paraphrases_parsed,Optimal_Paraphrases_MarianMT,CHQ_google_paraphrases_parsed,Optimal_Paraphrases_Google,Raw_CHQ_Summeries,CHQ_paraphrase_summaries_MarianMT,CHQ_paraphrase_summaries_Google
0,1-131188152.xml.txt,subject get cetirizine message needwant know m...,manufactures cetirizine,['subject get cetirizine message needwant know...,['Subject Get Cetirizine Message Need Want To ...,"[0.0, 0.48486472304708406, 0.5633985668165916,...","[0.06004893404528524, 0.14389553856288115, 0.0...","[0.9518717811990748, 0.9518717811990748, 0.951...","[0.9518717811990748, 0.9518717811990748, 0.951...",[subject get cetirizine message needwant know ...,[Subject get cetirizine message need to know m...,[Subject Get Cetirizine Message Need Want To K...,[Subject Get Cetirizine Message Need Want To K...,subject get cetirizine message needwan,"[cetirizine message need to know manufacture, ...","[Cetirizine Message Need Want Want, Cetirizine..."
1,1-131985747.xml.txt,subject nulytely message hello tell order nuly...,makes nulytely buy,['Nulytely issue message hello say order nulyt...,['Subject nulllytely message hello tell me ord...,"[0.3234609707794634, 0.0, 0.36769626709513487,...","[0.11731507387433318, 0.2813033475907, 0.0, 0....","[0.9518717526094639, 0.9518717526094639, 0.951...","[0.9518044117643911, 0.9518717526094639, 0.951...",[Nulytely issue message hello say order nulyte...,[Nulytely issue message hello say order nulyte...,[Subject nulllytely message hello tell me orde...,[Subject nulllytely message hello tell me orde...,subject nulytely message hello tell order,[order nulytely manufacture phone number calle...,"[nulllytely message hello tell me, Subject nul..."
2,15410.txt,williams syndrome would like daughter tested w...,get genetic testing williams syndrome,['Williams syndrome as a daughter williams syn...,['Williams Syndrome I would like my daughter t...,"[0.28597393014607475, 0.0, 0.2429463484297442,...","[0.9909218674200269, 0.0, 0.5261416925872023, ...","[0.9518717964911924, 0.9518717964911924, 0.951...","[0.9518717964911924, 0.9518717964911924, 0.951...",[Williams syndrome as a daughter williams synd...,[Williams syndrome as a daughter williams synd...,[Williams Syndrome I would like my daughter to...,[williams syndrome i would like my daughter to...,williams syndrome would like daughter tested,"[Williams syndrome as a daughter william, Will...","[williams syndrome i would like, williams synd..."
3,35.txt,clinicaltrialsgov question general information...,get genetic testing multiple myeloma cost,['clinicaltrialsgov asks general information d...,"[""Question from Clinicaltrialsgov General info...","[0.0145147587408779, 0.2677372835203137, 0.072...","[1.0, 0.20900258778387543, 0.0, 0.816339847409...","[0.9518716380293955, 0.9518716380293955, 0.951...","[0.9518716380293955, 0.9518716380293955, 0.951...",[clinicaltrialsgov asks general information de...,[clinicaltrialsgov Question general informatio...,[Question from Clinicaltrialsgov General infor...,[clinicaltrialsgov question general informatio...,parents died location tx multiple mye,[clinicaltrialsgov Question general informatio...,[clinicaltrialsgov question general informatio...
4,21.txt,genetic test ihhs heart condition commercial g...,get genetic testing ihss texas,['genetic test ihhs heart disease commercial g...,['IHHS Heart Disease Genetic Testing Commercia...,"[0.33930216271550445, 0.04054892368388036, 0.0...","[0.7831301658652918, 0.2697042776688785, 0.088...","[0.9518717468472168, 0.9518717468472168, 0.951...","[0.9518565857349672, 0.9518717468472168, 0.951...",[genetic test ihhs heart disease commercial ge...,[genetic test ihhs heart disease commercial ge...,[IHHS Heart Disease Genetic Testing Commercial...,[IHHS Heart Disease Genetic Testing Commercial...,genetic test ihhs heart condition,[genetic test ihhs heart disease],[IHHS Heart Disease Genetic Testing Commercial...


## Evaluation

### We will use Rouge and BLEU metrics to apply evaluation

In [132]:
import evaluate

In [133]:
# Load evaluation metrics
rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("bleu")

### Before starting the evaluation, we need to do some row cleaning,

### since some of the rows have null generated summaries.

In [134]:
# Flatten lists in CHQ_paraphrase_summaries_MarianMT
df["CHQ_paraphrase_summaries_MarianMT_str"] = df["CHQ_paraphrase_summaries_MarianMT"].apply(
    lambda x: " ".join(x) if isinstance(x, list) else x)

In [135]:
# Flatten lists in CHQ_paraphrase_summaries_Google
df["CHQ_paraphrase_summaries_Google_str"] = df["CHQ_paraphrase_summaries_Google"].apply(
    lambda x: " ".join(x) if isinstance(x, list) else x
)

In [136]:
# Filter out empty or invalid rows
df = df[(df["Summary"].str.strip() != "") & (df["CHQ_paraphrase_summaries_MarianMT_str"].str.strip() != "") & (df["CHQ_paraphrase_summaries_Google_str"].str.strip() != "")]

In [137]:
df.head(10)

Unnamed: 0,File,CHQ,Summary,CHQ_paraphrases,CHQ_google_paraphrases,FQD_scores_MarianMT,FQD_scores_Google,PRQD_scores_MarianMT,PRQD_scores_Google,CHQ_paraphrases_parsed,Optimal_Paraphrases_MarianMT,CHQ_google_paraphrases_parsed,Optimal_Paraphrases_Google,Raw_CHQ_Summeries,CHQ_paraphrase_summaries_MarianMT,CHQ_paraphrase_summaries_Google,CHQ_paraphrase_summaries_MarianMT_str,CHQ_paraphrase_summaries_Google_str
0,1-131188152.xml.txt,subject get cetirizine message needwant know m...,manufactures cetirizine,['subject get cetirizine message needwant know...,['Subject Get Cetirizine Message Need Want To ...,"[0.0, 0.48486472304708406, 0.5633985668165916,...","[0.06004893404528524, 0.14389553856288115, 0.0...","[0.9518717811990748, 0.9518717811990748, 0.951...","[0.9518717811990748, 0.9518717811990748, 0.951...",[subject get cetirizine message needwant know ...,[Subject get cetirizine message need to know m...,[Subject Get Cetirizine Message Need Want To K...,[Subject Get Cetirizine Message Need Want To K...,subject get cetirizine message needwan,"[cetirizine message need to know manufacture, ...","[Cetirizine Message Need Want Want, Cetirizine...",cetirizine message need to know manufacture ce...,Cetirizine Message Need Want Want Cetirizine r...
1,1-131985747.xml.txt,subject nulytely message hello tell order nuly...,makes nulytely buy,['Nulytely issue message hello say order nulyt...,['Subject nulllytely message hello tell me ord...,"[0.3234609707794634, 0.0, 0.36769626709513487,...","[0.11731507387433318, 0.2813033475907, 0.0, 0....","[0.9518717526094639, 0.9518717526094639, 0.951...","[0.9518044117643911, 0.9518717526094639, 0.951...",[Nulytely issue message hello say order nulyte...,[Nulytely issue message hello say order nulyte...,[Subject nulllytely message hello tell me orde...,[Subject nulllytely message hello tell me orde...,subject nulytely message hello tell order,[order nulytely manufacture phone number calle...,"[nulllytely message hello tell me, Subject nul...",order nulytely manufacture phone number called...,nulllytely message hello tell me Subject nulyt...
2,15410.txt,williams syndrome would like daughter tested w...,get genetic testing williams syndrome,['Williams syndrome as a daughter williams syn...,['Williams Syndrome I would like my daughter t...,"[0.28597393014607475, 0.0, 0.2429463484297442,...","[0.9909218674200269, 0.0, 0.5261416925872023, ...","[0.9518717964911924, 0.9518717964911924, 0.951...","[0.9518717964911924, 0.9518717964911924, 0.951...",[Williams syndrome as a daughter williams synd...,[Williams syndrome as a daughter williams synd...,[Williams Syndrome I would like my daughter to...,[williams syndrome i would like my daughter to...,williams syndrome would like daughter tested,"[Williams syndrome as a daughter william, Will...","[williams syndrome i would like, williams synd...",Williams syndrome as a daughter william Willia...,williams syndrome i would like williams syndro...
3,35.txt,clinicaltrialsgov question general information...,get genetic testing multiple myeloma cost,['clinicaltrialsgov asks general information d...,"[""Question from Clinicaltrialsgov General info...","[0.0145147587408779, 0.2677372835203137, 0.072...","[1.0, 0.20900258778387543, 0.0, 0.816339847409...","[0.9518716380293955, 0.9518716380293955, 0.951...","[0.9518716380293955, 0.9518716380293955, 0.951...",[clinicaltrialsgov asks general information de...,[clinicaltrialsgov Question general informatio...,[Question from Clinicaltrialsgov General infor...,[clinicaltrialsgov question general informatio...,parents died location tx multiple mye,[clinicaltrialsgov Question general informatio...,[clinicaltrialsgov question general informatio...,clinicaltrialsgov Question general information...,clinicaltrialsgov question general information...
4,21.txt,genetic test ihhs heart condition commercial g...,get genetic testing ihss texas,['genetic test ihhs heart disease commercial g...,['IHHS Heart Disease Genetic Testing Commercia...,"[0.33930216271550445, 0.04054892368388036, 0.0...","[0.7831301658652918, 0.2697042776688785, 0.088...","[0.9518717468472168, 0.9518717468472168, 0.951...","[0.9518565857349672, 0.9518717468472168, 0.951...",[genetic test ihhs heart disease commercial ge...,[genetic test ihhs heart disease commercial ge...,[IHHS Heart Disease Genetic Testing Commercial...,[IHHS Heart Disease Genetic Testing Commercial...,genetic test ihhs heart condition,[genetic test ihhs heart disease],[IHHS Heart Disease Genetic Testing Commercial...,genetic test ihhs heart disease,IHHS Heart Disease Genetic Testing Commercial ...
5,1-136003557.xml.txt,subject friedreichs ataxia message told friedr...,get genetic testing friedreichs treatments,['theme friedreichs ataxia message said friedr...,"[""subject friedreich's ataxia message said fri...","[0.18806983021771487, 0.0, 0.09461280917422596...","[0.3049874454319084, 0.5605119763286484, 1.0, ...","[0.951871787182947, 0.951871787182947, 0.95187...","[0.951871787182947, 0.951871787182947, 0.95187...",[theme friedreichs ataxia message said friedre...,[theme friedreichs ataxia message said friedre...,[subject friedreich's ataxia message said frie...,[subject friedreich's ataxia message said frie...,subject friedreichs ataxia message,"[theme friedreichs ataxia message, subject fri...","[subject friedreich's ataxia, topic Friedreich...",theme friedreichs ataxia message subject fried...,subject friedreich's ataxia topic Friedreich's...
6,1-133677895.xml.txt,subject trying loss july message trying loss p...,find information weight loss pills,['subject treating loss July message trying lo...,['topic trying to lose july message trying to ...,"[0.0, 0.0708670272727246, 0.06758713076269968,...","[0.11645893880157196, 1.0, 0.0, 0.407870432541...","[0.9518717796477006, 0.9518717796477006, 0.951...","[0.9518717796477006, 0.9518331523690831, 0.951...",[subject treating loss July message trying los...,[Topic Try Loss July News Try Loss Pounds July...,[topic trying to lose july message trying to l...,[topic trying to lose july message trying to l...,subject trying loss july message trying loss,"[Topic Try Loss July News Try Loss, subject te...","[topic trying to lose july message trying, Top...",Topic Try Loss July News Try Loss subject test...,topic trying to lose july message trying Topic...
7,4.txt,clinicaltrialsgov question specific study son ...,find information weber christian disease inclu...,['clinicaltrialsgov question specific study ar...,['Clinicaltrialsgov question specific study so...,"[0.09733399156253462, 0.05870957047842032, 0.1...","[0.3416668789971208, 0.0, 1.0, 0.4386348001121...","[0.9518611748269535, 0.9518716883382456, 0.951...","[0.9518716883382456, 0.9518716883382456, 0.951...",[clinicaltrialsgov question specific study are...,[clinicaltrialsgov question specific study are...,[Clinicaltrialsgov question specific study son...,[Clinicaltrialsgov question specific study son...,clinicaltrialsgov question specific study son,[clinicaltrialsgov question specific study are...,[clinicaltrialsgov question specific study son...,clinicaltrialsgov question specific study are ...,clinicaltrialsgov question specific study son ...
8,10128.xml.txt,vagual nerve stimulation depression suffered m...,find information vagus nerve stimulation treat...,['vacual nerve stimulation depression suffered...,['vagus nerve stimulation depression I suffere...,"[0.0, 0.030593008825101672, 0.1690470819573200...","[1.0, 0.3964311225265206, 0.19357921213410742,...","[0.9518716865652466, 0.9518716865652466, 0.951...","[0.9518716865652466, 0.9518716865652466, 0.951...",[vacual nerve stimulation depression suffered ...,[depression stimulation of the vaguage nerve s...,[vagus nerve stimulation depression I suffered...,[Vagua Nervus Stimulation Depression suffered ...,vagual nerve stimulation depression suffered m...,[depression stimulation of the vaguage nerve s...,"[Vagua Nervus Stimulation Depression, vagus ne...",depression stimulation of the vaguage nerve su...,Vagua Nervus Stimulation Depression vagus nerv...
9,10391.xml.txt,stomach pain son facing upper abdominal pain s...,find information upper abdominal pain includin...,['stomach pain son in front of abdominal pain ...,['Stomach ache. My son has upper abdominal pai...,"[0.04437846475171558, 0.0, 0.05408367519350935...","[0.7885811773087619, 0.4686055108038832, 0.0, ...","[0.9518716761488767, 0.9518716761488767, 0.951...","[0.9518716761488767, 0.9518716761488767, 0.951...",[stomach pain son in front of abdominal pain s...,[stomach pain child facing upper abdominal pai...,[Stomach ache. My son has upper abdominal pain...,[Stomach ache. My son has upper abdominal pain...,stomach pain son facing upper abdominal pain s...,[stomach pain child facing upper abdominal pai...,"[my son has upper abdominal pain., i ate light...",stomach pain child facing upper abdominal pain...,my son has upper abdominal pain. i ate lightly...


In [138]:
def eval(reference_texts, generated_texts)-> tuple:
    
    # Remove empty references or generated summaries
    cleaned_data = [(ref, gen) for ref, gen in zip(reference_texts, generated_texts) 
                    if ref.strip() and gen.strip()]
 
    # Unpack cleaned data
    cleaned_references, cleaned_generated = zip(*cleaned_data)

    # Compute ROUGE scores using the raw strings
    rouge_scores = rouge_metric.compute(
        predictions=list(cleaned_generated), 
        references=list(cleaned_references)
    )
    
    # For BLEU:
    # predictions is a list of strings
    # references is a list of lists of strings (one reference per prediction)
    bleu_score = bleu_metric.compute(
        predictions=list(cleaned_generated), 
        references=[[ref] for ref in cleaned_references]
    )
    
    return rouge_scores, bleu_score

### Evaluating paraphrase summaries from questions generated by MarianMT pretrained model.

In [143]:

rouge_scores, bleu_score = eval(
    df["Summary"].tolist(), 
    df["CHQ_paraphrase_summaries_MarianMT_str"].tolist()
)

print("Results for paraphrases generated by MarianMT model:")
print("ROUGE Scores:", rouge_scores)
print("BLEU Score:", bleu_score)

Results for paraphrases generated by MarianMT model:
ROUGE Scores: {'rouge1': np.float64(0.15776680678908958), 'rouge2': np.float64(0.04515783185029319), 'rougeL': np.float64(0.1491152658459397), 'rougeLsum': np.float64(0.14908327632133256)}
BLEU Score: {'bleu': 0.009286322633011625, 'precisions': [0.09871328946076025, 0.02784542085759661, 0.00539609644087256, 0.000501378791677112], 'brevity_penalty': 1.0, 'length_ratio': 2.771848625102096, 'translation_length': 10181, 'reference_length': 3673}


### Evaluating paraphrase summaries from questions generated by Google translate.

In [None]:
rouge_scores, bleu_score = eval(
    df["Summary"].tolist(), 
    df["CHQ_paraphrase_summaries_Google_str"].tolist()
)

print("Results for paraphrases generated by Google Translate:")
print("ROUGE Scores:", rouge_scores)
print("BLEU Score:", bleu_score)

Results for paraphrases generated by  Google Translate:
ROUGE Scores: {'rouge1': np.float64(0.16934945447715366), 'rouge2': np.float64(0.056249468423383964), 'rougeL': np.float64(0.16083201174958245), 'rougeLsum': np.float64(0.16085861334727197)}
BLEU Score: {'bleu': 0.010993821836385236, 'precisions': [0.09743395511788656, 0.030127226463104326, 0.005941247662009022, 0.0008376211559171952], 'brevity_penalty': 1.0, 'length_ratio': 2.875306289136945, 'translation_length': 10561, 'reference_length': 3673}


### Evaluating summaries from raw questions.

In [141]:
rouge_scores, bleu_score = eval(
    df["Summary"].tolist(), 
    df["Raw_CHQ_Summeries"].tolist()
)

In [146]:
print("Result for summaries generated from original questions:")
print("ROUGE Scores:", rouge_scores)
print("BLEU Score:", bleu_score)

Result for summaries generated from original questions:
ROUGE Scores: {'rouge1': np.float64(0.16934945447715366), 'rouge2': np.float64(0.056249468423383964), 'rougeL': np.float64(0.16083201174958245), 'rougeLsum': np.float64(0.16085861334727197)}
BLEU Score: {'bleu': 0.010993821836385236, 'precisions': [0.09743395511788656, 0.030127226463104326, 0.005941247662009022, 0.0008376211559171952], 'brevity_penalty': 1.0, 'length_ratio': 2.875306289136945, 'translation_length': 10561, 'reference_length': 3673}
