# Implementation

## Step 1: Dataset Prepration

In [2]:
import pandas as pd

### Load the dataset

In [3]:
df = pd.read_csv("MeQSum_ACL2019_BenAbacha_Demner-Fushman.xlsx")

In [143]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   File     1000 non-null   object
 1   CHQ      1000 non-null   object
 2   Summary  1000 non-null   object
dtypes: object(3)
memory usage: 23.6+ KB


### Data Preprocessing

In [7]:
import re

In [145]:
def clean_text(text):
    
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)  
    text = re.sub(r'\s+', ' ', text).strip()  # Replace multiple spaces with a single space
    text = text.lower()  # Convert to lowercase
    
    return text

In [146]:
# Apply the function to both columns
df['CHQ'] = df['CHQ'].apply(clean_text)
df['Summary'] = df['Summary'].apply(clean_text)

In [147]:
# Remove extremely short or extremely long sentences
df = df[df['CHQ'].apply(lambda x: len(x.split()) >= 3)]
df = df[df['Summary'].apply(lambda x: len(x.split()) >= 3)]

df = df[df['CHQ'].apply(lambda x: len(x.split()) <= 80)]
df = df[df['Summary'].apply(lambda x: len(x.split()) <= 80)]

In [148]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 787 entries, 0 to 999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   File     787 non-null    object
 1   CHQ      787 non-null    object
 2   Summary  787 non-null    object
dtypes: object(3)
memory usage: 24.6+ KB


26 rows are removed after cleaning.

In [149]:
from nltk.corpus import stopwords

In [150]:
# Remove stopwords which won't be informative for the model
stop_words = set(stopwords.words("english"))
df['CHQ'] = df['CHQ'].apply(lambda text: ' '.join([word for word in text.split() if word not in stop_words]))
df['Summary'] = df['Summary'].apply(lambda text: ' '.join([word for word in text.split() if word not in stop_words]))

In [151]:
df.head()

Unnamed: 0,File,CHQ,Summary
0,1-131188152.xml.txt,subject get cetirizine message needwant know m...,manufactures cetirizine
2,1-131985747.xml.txt,subject nulytely message hello tell order nuly...,makes nulytely buy
3,15410.txt,williams syndrome would like daughter tested w...,get genetic testing williams syndrome
4,35.txt,clinicaltrialsgov question general information...,get genetic testing multiple myeloma cost
5,21.txt,genetic test ihhs heart condition commercial g...,get genetic testing ihss texas


## Step 2: Round-Trip Translation

### Translation using a machine translation model

In [17]:
# Using pre-trained MarianMT model for translation
from transformers import MarianMTModel, MarianTokenizer

In [163]:
# Load pretrained model and tokenizer for the translation
def load_translation_model(source: str, dest: str)-> tuple:
    
    model_name = f"Helsinki-NLP/opus-mt-{source}-{dest}"
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    
    model = MarianMTModel.from_pretrained(model_name)
   
    
    return model, tokenizer

In [164]:
# Function to Translate to Pivot Languages and Back
def translate(text: list, model: MarianMTModel, tokenizer: MarianTokenizer)-> str:
    
    tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    translated = model.generate(**tokens)
    
    return tokenizer.decode(translated[0], skip_special_tokens=True)

In [11]:
# Setting up the computer to use the GPU
import torch

device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print(f"Running on: {device}")

Running on: mps


In [None]:
rtt_questions = []
languages = ['es', 'de', 'it', 'zh', 'fr']

counter = 1
for question in df['CHQ']:
   
    print(f"Processing question {counter}...")
    counter += 1
    paraphrases = []
    for lang in languages:
        
        # Load models for each pivot language dynamically
        forward_model, forward_tokenizer = load_translation_model("en", lang)
        backward_model, backward_tokenizer = load_translation_model(lang, "en")

        # Forward and backward translation
        translated_text = translate(question, forward_model, forward_tokenizer)
        round_trip_text = translate(translated_text, backward_model, backward_tokenizer)

        paraphrases.append(round_trip_text)
    rtt_questions.append(paraphrases)

df['CHQ_paraphrases'] = rtt_questions

In [166]:
print("Original Question: ",df.CHQ[0])
print("After translating to five given languages: ",df.CHQ_paraphrases[0])

Original Question:  subject get cetirizine message needwant know manufscturs cetirizine walmart looking new supply getting recent
After translating to five given languages:  ['subject get cetirizine message needwant know manufscturs cetirizine walmart looking new supply getting recent', 'Subject get cetirizine message need to know manufactures cetirizine walmart looking for new care get new', 'subject get cetirillina message needy know manufacturers cetirillina Walmart looking for new ever recent supply', 'Retrieving cellirizine information needs to know that new supplies are being sought.', 'topic get cetirizine message need to know manufscturs cetirizine walmart search new offer get recent']


### Translate using Google Translate API

In [16]:
from deep_translator import GoogleTranslator

In [None]:
def google(question, pivot_language):
    
    translated = GoogleTranslator(source='en', target=pivot_language).translate(question)
    back_translated = GoogleTranslator(source=pivot_language, target='en').translate(translated)
    return back_translated

google_languages = ['es', 'de', 'it', 'zh-CN', 'fr']

df['CHQ_google_paraphrases'] = df['CHQ'].apply(lambda x: [google(x, lang) for lang in google_languages])


In [169]:
df.head()

Unnamed: 0,File,CHQ,Summary,CHQ_paraphrases,CHQ_google_paraphrases
0,1-131188152.xml.txt,subject get cetirizine message needwant know m...,manufactures cetirizine,['subject get cetirizine message needwant know...,['Subject Get Cetirizine Message Need Want To ...
1,1-131985747.xml.txt,subject nulytely message hello tell order nuly...,makes nulytely buy,['Nulytely issue message hello say order nulyt...,['Subject nulllytely message hello tell me ord...
2,15410.txt,williams syndrome would like daughter tested w...,get genetic testing williams syndrome,['Williams syndrome as a daughter williams syn...,['Williams Syndrome I would like my daughter t...
3,35.txt,clinicaltrialsgov question general information...,get genetic testing multiple myeloma cost,['clinicaltrialsgov asks general information d...,"[""Question from Clinicaltrialsgov General info..."
4,21.txt,genetic test ihhs heart condition commercial g...,get genetic testing ihss texas,['genetic test ihhs heart disease commercial g...,['IHHS Heart Disease Genetic Testing Commercia...


In [170]:
print("Translation by pretrained model: ",df.CHQ_paraphrases[0])
print("Translation by Google Translate: ",df.CHQ_google_paraphrases[0])

Translation by pretrained model:  ['subject get cetirizine message needwant know manufscturs cetirizine walmart looking new supply getting recent', 'Subject get cetirizine message need to know manufactures cetirizine walmart looking for new care get new', 'subject get cetirillina message needy know manufacturers cetirillina Walmart looking for new ever recent supply', 'Retrieving cellirizine information needs to know that new supplies are being sought.', 'topic get cetirizine message need to know manufscturs cetirizine walmart search new offer get recent']
Translation by Google Translate:  ['Subject Get Cetirizine Message Need Want To Know Manufacturers Cetirizine Walmart Looking For New Supply Get Recent', 'Subject: Cetirizine received message needwant to know manufacturer Cetirizine Walmart looking for new offer always up to date', 'object get cetirizine message needwant to know manufacturers cetirizine walmart looking new supply get recent', 'Topics Get Cetirizine Information Need t

## Question Selection

### Using FQD to select a subset of the new dataset

In [72]:
from transformers import BertTokenizer, BertModel

In [73]:
# Using Bert pretrained model for embedding both original and paraphrased questions.
# Load the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [74]:
# Fuction to embed the text
def embed(text: str)-> torch.Tensor:
    
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
   
    with torch.no_grad():
        outputs = model(**inputs)
    
    return torch.mean(outputs.last_hidden_state, dim=1).squeeze()

In [75]:
from scipy.spatial.distance import cosine
import numpy as np

In [76]:
# FQD calculation
def fqd(original_embedding: torch.Tensor,
        rtt_embedding: torch.Tensor)-> float:
    
    similarity = 1 - cosine(original_embedding, rtt_embedding)
    
    return 1 - similarity  

In [77]:
# Min Max Normalization
def normalize_fqd(fqd_scores: float)-> list:
    
    fqd_min, fqd_max = np.min(fqd_scores), np.max(fqd_scores)
    return [(fqd - fqd_min) / (fqd_max - fqd_min) for fqd in fqd_scores]

### Question Selection on Paraphrased questions generated by pretrained model.

In [78]:
# Now, we apply the above functions to calculate the FQD scores for each question
fqd_scores = []

for index, row in df.iterrows():
    original_embedding = embed(row['CHQ'])

    # Ensure row['CHQ_paraphrases'] contains valid data
    if not isinstance(row['CHQ_paraphrases'], list) or not row['CHQ_paraphrases']:
        fqd_scores.append([])
        continue
    
    # Embed the paraphrased questions
    paraphrased_embeddings = [embed(paraphrase) for paraphrase in row['CHQ_paraphrases']]
    
    # Compute FQD for each paraphrased question
    row_scores = [fqd(original_embedding, rtt_embedding) for rtt_embedding in paraphrased_embeddings]
    fqd_scores.append(row_scores)

In [79]:
# Normalize fqd scores
normalized_fqd_scores = [normalize_fqd(scores) if scores else [0] for scores in fqd_scores]

# Store the results in the DataFrame as a new column called "FQD_scores_MarianMT"
df['FQD_scores_MarianMT'] = pd.Series(normalized_fqd_scores)

In [83]:
df.head()

Unnamed: 0,File,CHQ,Summary,CHQ_paraphrases,CHQ_google_paraphrases,FQD_scores_MarianMT
0,1-131188152.xml.txt,subject get cetirizine message needwant know m...,manufactures cetirizine,[subject get cetirizine message needwant know ...,[Subject Get Cetirizine Message Need Want To K...,"[0.0, 0.42345947, 0.39539498, 1.0, 0.40205756]"
1,1-131985747.xml.txt,subject nulytely message hello tell order nuly...,makes nulytely buy,[Nulytely issue message hello say order nulyte...,[Subject nulllytely message hello tell me orde...,"[0.15415767, 0.0, 0.109911725, 1.0, 0.31448457]"
2,15410.txt,williams syndrome would like daughter tested w...,get genetic testing williams syndrome,[Williams syndrome as a daughter williams synd...,[Williams Syndrome I would like my daughter to...,"[0.38985512, 0.0, 0.18084231, 1.0, 0.4739606]"
3,35.txt,clinicaltrialsgov question general information...,get genetic testing multiple myeloma cost,[clinicaltrialsgov asks general information de...,[Question from Clinicaltrialsgov General infor...,"[0.0, 0.05507936, 0.040062625, 1.0, 0.00857666]"
4,21.txt,genetic test ihhs heart condition commercial g...,get genetic testing ihss texas,[genetic test ihhs heart disease commercial ge...,[IHHS Heart Disease Genetic Testing Commercial...,"[0.12911317, 0.0, 0.020611322, 1.0, 0.042890523]"


### Question Selection on Paraphrased questions generated by Google Translate model.

In [84]:
# Now, we apply the above functions to calculate the FQD scores for each question
fqd_scores = []

for index, row in df.iterrows():
    original_embedding = embed(row['CHQ'])

    # Ensure row['CHQ_paraphrases'] contains valid data
    if not isinstance(row['CHQ_google_paraphrases'], list) or not row['CHQ_google_paraphrases']:
        fqd_scores.append([])
        continue
    
    # Embed the paraphrased questions
    paraphrased_embeddings = [embed(paraphrase) for paraphrase in row['CHQ_google_paraphrases']]
    
    # Compute FQD for each paraphrased question
    row_scores = [fqd(original_embedding, rtt_embedding) for rtt_embedding in paraphrased_embeddings]
    fqd_scores.append(row_scores)

In [85]:
# Normalize fqd scores
normalized_fqd_scores = [normalize_fqd(scores) if scores else [0] for scores in fqd_scores]

# Store the results in the DataFrame as a new column called "FQD_scores_Google"
df['FQD_scores_Google'] = pd.Series(normalized_fqd_scores)

In [86]:
df.head()

Unnamed: 0,File,CHQ,Summary,CHQ_paraphrases,CHQ_google_paraphrases,FQD_scores_MarianMT,FQD_scores_Google
0,1-131188152.xml.txt,subject get cetirizine message needwant know m...,manufactures cetirizine,[subject get cetirizine message needwant know ...,[Subject Get Cetirizine Message Need Want To K...,"[0.0, 0.42345947, 0.39539498, 1.0, 0.40205756]","[0.39219925, 0.48206365, 0.0, 1.0, 0.81443405]"
1,1-131985747.xml.txt,subject nulytely message hello tell order nuly...,makes nulytely buy,[Nulytely issue message hello say order nulyte...,[Subject nulllytely message hello tell me orde...,"[0.15415767, 0.0, 0.109911725, 1.0, 0.31448457]","[0.38535395, 0.14393179, 0.15699339, 0.0, 1.0]"
2,15410.txt,williams syndrome would like daughter tested w...,get genetic testing williams syndrome,[Williams syndrome as a daughter williams synd...,[Williams Syndrome I would like my daughter to...,"[0.38985512, 0.0, 0.18084231, 1.0, 0.4739606]","[0.9740065, 0.0, 0.5546767, 1.0, 0.65278566]"
3,35.txt,clinicaltrialsgov question general information...,get genetic testing multiple myeloma cost,[clinicaltrialsgov asks general information de...,[Question from Clinicaltrialsgov General infor...,"[0.0, 0.05507936, 0.040062625, 1.0, 0.00857666]","[1.0, 0.10718062, 0.0, 0.81891173, 0.3620564]"
4,21.txt,genetic test ihhs heart condition commercial g...,get genetic testing ihss texas,[genetic test ihhs heart disease commercial ge...,[IHHS Heart Disease Genetic Testing Commercial...,"[0.12911317, 0.0, 0.020611322, 1.0, 0.042890523]","[0.88640755, 0.091882594, 0.034803294, 1.0, 0.0]"


In [101]:
# Save the new dataframe containing paraphrases to a CSV file
df.to_csv("MeQSum_ACL2019_BenAbacha_Demner-Fushman.csv", index=False)

### Using PRQD to select a subset of the new dataset

In [88]:
from sentence_transformers import SentenceTransformer, util

In [89]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [90]:
def embed_2(text: str) -> torch.Tensor:
    inputs = model.encode([text], convert_to_tensor=True)
    return inputs[0]  # Return the embedding tensor

In [91]:
def prqd(original_embedding: torch.Tensor, candidate_embedding: torch.Tensor, alpha_values: list) -> tuple:
    """
    Compute PRQD for the pair of embeddings, returning (max_f1_score, optimal_alpha).
    """
    best_f1 = 0.0
    #best_alpha = None

    # Convert embeddings to NumPy arrays and handle potential errors
    ref_vector = original_embedding.cpu().numpy()
    cand_vector = candidate_embedding.cpu().numpy()

    # Normalize the vectors if possible
    ref_norm = np.linalg.norm(ref_vector)
    cand_norm = np.linalg.norm(cand_vector)
    
    # Return a default if vectors are degenerate (norms zero)
    if ref_norm == 0 or cand_norm == 0:
        return 0.0, None

    ref_vector /= ref_norm
    cand_vector /= cand_norm

    similarity = util.cos_sim(torch.tensor(ref_vector), torch.tensor(cand_vector)).item()

    for alpha in alpha_values:
        # Compute precision and recall
        prec = min(alpha * similarity, 1.0)
        rec = min(similarity / alpha, 1.0)
        
        f1 = 2 * (prec * rec) / (prec + rec) if (prec + rec) > 0 else 0

        if f1 > best_f1:
            best_f1 = f1
            best_alpha = alpha

    return best_f1


### Question Selection on Paraphrased questions generated by pretrained model.

In [92]:
alpha_values = np.linspace(0.1, 10, 50)

# Compute PRQD scores for each paraphrase candidate for each row
prqd_scores_all = []
for index, row in df.iterrows():
    original_text = row['CHQ']
    # Ensure the paraphrases are in list form. If stored as a string, eval() them.
    paraphrases = row['CHQ_paraphrases']
    if isinstance(paraphrases, str):
        paraphrases = eval(paraphrases)
    
    if not isinstance(paraphrases, list) or len(paraphrases) == 0:
        prqd_scores_all.append([])
        continue

    original_embedding = embed_2(original_text)
    
    # Compute PRQD for each paraphrased question
    row_scores = []
    for paraphrase in paraphrases:
        candidate_embedding = embed_2(paraphrase)
        score = prqd(original_embedding, candidate_embedding, alpha_values)
        row_scores.append(score)
    
    prqd_scores_all.append(row_scores)

# Store the list of PRQD scores in a new DataFrame column
df['PRQD_scores_MarianMT'] = pd.Series(prqd_scores_all)


In [93]:
df.head()

Unnamed: 0,File,CHQ,Summary,CHQ_paraphrases,CHQ_google_paraphrases,FQD_scores_MarianMT,FQD_scores_Google,PRQD_scores_MarianMT
0,1-131188152.xml.txt,subject get cetirizine message needwant know m...,manufactures cetirizine,[subject get cetirizine message needwant know ...,[Subject Get Cetirizine Message Need Want To K...,"[0.0, 0.42345947, 0.39539498, 1.0, 0.40205756]","[0.39219925, 0.48206365, 0.0, 1.0, 0.81443405]","[0.9518717766871578, 0.8761717593618875, 0.783..."
1,1-131985747.xml.txt,subject nulytely message hello tell order nuly...,makes nulytely buy,[Nulytely issue message hello say order nulyte...,[Subject nulllytely message hello tell me orde...,"[0.15415767, 0.0, 0.109911725, 1.0, 0.31448457]","[0.38535395, 0.14393179, 0.15699339, 0.0, 1.0]","[0.9082987511477136, 0.9331645271157469, 0.894..."
2,15410.txt,williams syndrome would like daughter tested w...,get genetic testing williams syndrome,[Williams syndrome as a daughter williams synd...,[Williams Syndrome I would like my daughter to...,"[0.38985512, 0.0, 0.18084231, 1.0, 0.4739606]","[0.9740065, 0.0, 0.5546767, 1.0, 0.65278566]","[0.9307867669397486, 0.9331759245217522, 0.933..."
3,35.txt,clinicaltrialsgov question general information...,get genetic testing multiple myeloma cost,[clinicaltrialsgov asks general information de...,[Question from Clinicaltrialsgov General infor...,"[0.0, 0.05507936, 0.040062625, 1.0, 0.00857666]","[1.0, 0.10718062, 0.0, 0.81891173, 0.3620564]","[0.931612362358516, 0.9440155973025558, 0.9121..."
4,21.txt,genetic test ihhs heart condition commercial g...,get genetic testing ihss texas,[genetic test ihhs heart disease commercial ge...,[IHHS Heart Disease Genetic Testing Commercial...,"[0.12911317, 0.0, 0.020611322, 1.0, 0.042890523]","[0.88640755, 0.091882594, 0.034803294, 1.0, 0.0]","[0.8464547354008434, 0.946377974986297, 0.9308..."


### Question Selection on Paraphrased questions generated by Google Translate.

In [94]:
alpha_values = np.linspace(0.1, 10, 50)

# Compute PRQD scores for each paraphrase candidate for each row
prqd_scores_all = []
for index, row in df.iterrows():
    original_text = row['CHQ']
    # Ensure the paraphrases are in list form. If stored as a string, eval() them.
    paraphrases = row['CHQ_google_paraphrases']
    if isinstance(paraphrases, str):
        paraphrases = eval(paraphrases)
    
    if not isinstance(paraphrases, list) or len(paraphrases) == 0:
        prqd_scores_all.append([])
        continue

    original_embedding = embed_2(original_text)
    
    # Compute PRQD for each paraphrased question
    row_scores = []
    for paraphrase in paraphrases:
        candidate_embedding = embed_2(paraphrase)
        score = prqd(original_embedding, candidate_embedding, alpha_values)
        row_scores.append(score)
    
    prqd_scores_all.append(row_scores)

# Store the list of PRQD scores in a new DataFrame column
df['PRQD_scores_Google'] = pd.Series(prqd_scores_all)


In [95]:
df.head()

Unnamed: 0,File,CHQ,Summary,CHQ_paraphrases,CHQ_google_paraphrases,FQD_scores_MarianMT,FQD_scores_Google,PRQD_scores_MarianMT,PRQD_scores_Google
0,1-131188152.xml.txt,subject get cetirizine message needwant know m...,manufactures cetirizine,[subject get cetirizine message needwant know ...,[Subject Get Cetirizine Message Need Want To K...,"[0.0, 0.42345947, 0.39539498, 1.0, 0.40205756]","[0.39219925, 0.48206365, 0.0, 1.0, 0.81443405]","[0.9518717766871578, 0.8761717593618875, 0.783...","[0.9296710567502826, 0.8779658729781777, 0.914..."
1,1-131985747.xml.txt,subject nulytely message hello tell order nuly...,makes nulytely buy,[Nulytely issue message hello say order nulyte...,[Subject nulllytely message hello tell me orde...,"[0.15415767, 0.0, 0.109911725, 1.0, 0.31448457]","[0.38535395, 0.14393179, 0.15699339, 0.0, 1.0]","[0.9082987511477136, 0.9331645271157469, 0.894...","[0.7523785724585474, 0.9290950858177544, 0.843..."
2,15410.txt,williams syndrome would like daughter tested w...,get genetic testing williams syndrome,[Williams syndrome as a daughter williams synd...,[Williams Syndrome I would like my daughter to...,"[0.38985512, 0.0, 0.18084231, 1.0, 0.4739606]","[0.9740065, 0.0, 0.5546767, 1.0, 0.65278566]","[0.9307867669397486, 0.9331759245217522, 0.933...","[0.9072652325837058, 0.9412010101200736, 0.910..."
3,35.txt,clinicaltrialsgov question general information...,get genetic testing multiple myeloma cost,[clinicaltrialsgov asks general information de...,[Question from Clinicaltrialsgov General infor...,"[0.0, 0.05507936, 0.040062625, 1.0, 0.00857666]","[1.0, 0.10718062, 0.0, 0.81891173, 0.3620564]","[0.931612362358516, 0.9440155973025558, 0.9121...","[0.9253784646693647, 0.9430538900668469, 0.934..."
4,21.txt,genetic test ihhs heart condition commercial g...,get genetic testing ihss texas,[genetic test ihhs heart disease commercial ge...,[IHHS Heart Disease Genetic Testing Commercial...,"[0.12911317, 0.0, 0.020611322, 1.0, 0.042890523]","[0.88640755, 0.091882594, 0.034803294, 1.0, 0.0]","[0.8464547354008434, 0.946377974986297, 0.9308...","[0.8438598561933139, 0.8838341789272814, 0.883..."


### Selecting subsets of parashrases generated by MarianMT.

In [104]:
# Define thresholds for FQD and PRQD scores
FQD_MIN_THRESHOLD = 0.05
FQD_MAX_THRESHOLD = 0.8
PRQD_MIN_THRESHOLD = 0.8
PRQD_MAX_THRESHOLD = 0.95

optimal_paraphrase_indices = []

for fqd_list, prqd_list in zip(df["FQD_scores_MarianMT"], df["PRQD_scores_MarianMT"]):
    subset_indices = []

    # Iterate and select indices within optimal range
    for idx, (fqd, prqd) in enumerate(zip(fqd_list, prqd_list)):
        if FQD_MAX_THRESHOLD >=fqd >= FQD_MIN_THRESHOLD and PRQD_MIN_THRESHOLD <= prqd <= PRQD_MAX_THRESHOLD:
            subset_indices.append(idx)

    optimal_paraphrase_indices.append(subset_indices)

# Store the results in a new column
df["Optimal_Paraphrases_MarianMT"] = [
    [df.loc[i, "CHQ_paraphrases"][idx] for idx in indices]
    for i, indices in enumerate(optimal_paraphrase_indices)
]

#print(df[["CHQ", "Optimal_Paraphrases"]])


In [105]:
df.head()

Unnamed: 0,File,CHQ,Summary,CHQ_paraphrases,CHQ_google_paraphrases,FQD_scores_MarianMT,FQD_scores_Google,PRQD_scores_MarianMT,PRQD_scores_Google,Optimal_Paraphrases_MarianMT
0,1-131188152.xml.txt,subject get cetirizine message needwant know m...,manufactures cetirizine,[subject get cetirizine message needwant know ...,[Subject Get Cetirizine Message Need Want To K...,"[0.0, 0.42345947, 0.39539498, 1.0, 0.40205756]","[0.39219925, 0.48206365, 0.0, 1.0, 0.81443405]","[0.9518717766871578, 0.8761717593618875, 0.783...","[0.9296710567502826, 0.8779658729781777, 0.914...",[Subject get cetirizine message need to know m...
1,1-131985747.xml.txt,subject nulytely message hello tell order nuly...,makes nulytely buy,[Nulytely issue message hello say order nulyte...,[Subject nulllytely message hello tell me orde...,"[0.15415767, 0.0, 0.109911725, 1.0, 0.31448457]","[0.38535395, 0.14393179, 0.15699339, 0.0, 1.0]","[0.9082987511477136, 0.9331645271157469, 0.894...","[0.7523785724585474, 0.9290950858177544, 0.843...",[Nulytely issue message hello say order nulyte...
2,15410.txt,williams syndrome would like daughter tested w...,get genetic testing williams syndrome,[Williams syndrome as a daughter williams synd...,[Williams Syndrome I would like my daughter to...,"[0.38985512, 0.0, 0.18084231, 1.0, 0.4739606]","[0.9740065, 0.0, 0.5546767, 1.0, 0.65278566]","[0.9307867669397486, 0.9331759245217522, 0.933...","[0.9072652325837058, 0.9412010101200736, 0.910...",[Williams syndrome as a daughter williams synd...
3,35.txt,clinicaltrialsgov question general information...,get genetic testing multiple myeloma cost,[clinicaltrialsgov asks general information de...,[Question from Clinicaltrialsgov General infor...,"[0.0, 0.05507936, 0.040062625, 1.0, 0.00857666]","[1.0, 0.10718062, 0.0, 0.81891173, 0.3620564]","[0.931612362358516, 0.9440155973025558, 0.9121...","[0.9253784646693647, 0.9430538900668469, 0.934...",[clinicaltrialsgov Question general informatio...
4,21.txt,genetic test ihhs heart condition commercial g...,get genetic testing ihss texas,[genetic test ihhs heart disease commercial ge...,[IHHS Heart Disease Genetic Testing Commercial...,"[0.12911317, 0.0, 0.020611322, 1.0, 0.042890523]","[0.88640755, 0.091882594, 0.034803294, 1.0, 0.0]","[0.8464547354008434, 0.946377974986297, 0.9308...","[0.8438598561933139, 0.8838341789272814, 0.883...",[genetic test ihhs heart disease commercial ge...


In [102]:
df.drop(columns="Optimal_Paraphrases_MarianMT", inplace=True)   

### Selecting subsets of parashrases generated by Google Translate.

In [106]:
# Define thresholds for FQD and PRQD scores
FQD_MIN_THRESHOLD = 0.05
FQD_MAX_THRESHOLD = 0.8
PRQD_MIN_THRESHOLD = 0.8
PRQD_MAX_THRESHOLD = 0.95

optimal_paraphrase_indices = []

for fqd_list, prqd_list in zip(df["FQD_scores_Google"], df["PRQD_scores_Google"]):
    subset_indices = []

    # Iterate and select indices within optimal range
    for idx, (fqd, prqd) in enumerate(zip(fqd_list, prqd_list)):
        if FQD_MAX_THRESHOLD >=fqd >= FQD_MIN_THRESHOLD and PRQD_MIN_THRESHOLD <= prqd <= PRQD_MAX_THRESHOLD:
            subset_indices.append(idx)

    optimal_paraphrase_indices.append(subset_indices)

# Store the results in a new column
df["Optimal_Paraphrases_Google"] = [
    [df.loc[i, "CHQ_google_paraphrases"][idx] for idx in indices]
    for i, indices in enumerate(optimal_paraphrase_indices)
]


In [107]:
df.head()

Unnamed: 0,File,CHQ,Summary,CHQ_paraphrases,CHQ_google_paraphrases,FQD_scores_MarianMT,FQD_scores_Google,PRQD_scores_MarianMT,PRQD_scores_Google,Optimal_Paraphrases_MarianMT,Optimal_Paraphrases_Google
0,1-131188152.xml.txt,subject get cetirizine message needwant know m...,manufactures cetirizine,[subject get cetirizine message needwant know ...,[Subject Get Cetirizine Message Need Want To K...,"[0.0, 0.42345947, 0.39539498, 1.0, 0.40205756]","[0.39219925, 0.48206365, 0.0, 1.0, 0.81443405]","[0.9518717766871578, 0.8761717593618875, 0.783...","[0.9296710567502826, 0.8779658729781777, 0.914...",[Subject get cetirizine message need to know m...,[Subject Get Cetirizine Message Need Want To K...
1,1-131985747.xml.txt,subject nulytely message hello tell order nuly...,makes nulytely buy,[Nulytely issue message hello say order nulyte...,[Subject nulllytely message hello tell me orde...,"[0.15415767, 0.0, 0.109911725, 1.0, 0.31448457]","[0.38535395, 0.14393179, 0.15699339, 0.0, 1.0]","[0.9082987511477136, 0.9331645271157469, 0.894...","[0.7523785724585474, 0.9290950858177544, 0.843...",[Nulytely issue message hello say order nulyte...,[Subject nulytely Message Say hello Order nuly...
2,15410.txt,williams syndrome would like daughter tested w...,get genetic testing williams syndrome,[Williams syndrome as a daughter williams synd...,[Williams Syndrome I would like my daughter to...,"[0.38985512, 0.0, 0.18084231, 1.0, 0.4739606]","[0.9740065, 0.0, 0.5546767, 1.0, 0.65278566]","[0.9307867669397486, 0.9331759245217522, 0.933...","[0.9072652325837058, 0.9412010101200736, 0.910...",[Williams syndrome as a daughter williams synd...,[williams syndrome i would like my daughter to...
3,35.txt,clinicaltrialsgov question general information...,get genetic testing multiple myeloma cost,[clinicaltrialsgov asks general information de...,[Question from Clinicaltrialsgov General infor...,"[0.0, 0.05507936, 0.040062625, 1.0, 0.00857666]","[1.0, 0.10718062, 0.0, 0.81891173, 0.3620564]","[0.931612362358516, 0.9440155973025558, 0.9121...","[0.9253784646693647, 0.9430538900668469, 0.934...",[clinicaltrialsgov Question general informatio...,[clinicaltrialsgov question general informatio...
4,21.txt,genetic test ihhs heart condition commercial g...,get genetic testing ihss texas,[genetic test ihhs heart disease commercial ge...,[IHHS Heart Disease Genetic Testing Commercial...,"[0.12911317, 0.0, 0.020611322, 1.0, 0.042890523]","[0.88640755, 0.091882594, 0.034803294, 1.0, 0.0]","[0.8464547354008434, 0.946377974986297, 0.9308...","[0.8438598561933139, 0.8838341789272814, 0.883...",[genetic test ihhs heart disease commercial ge...,[genetic test IHHS heart disease commercial ge...
