# Implementation

## Step 1: Dataset Preparation

In [1]:
import pandas as pd

### Load the dataset

In [2]:
# Path is resolved relative to the notebook's working directory.
MEQSUM_XLSX = "MeQSum_ACL2019_BenAbacha_Demner-Fushman.xlsx"
df = pd.read_excel(MEQSUM_XLSX)

In [3]:
# Preview the first rows to check the loaded columns (File, CHQ, Summary).
df.head()

Unnamed: 0,File,CHQ,Summary
0,1-131188152.xml.txt,SUBJECT: who and where to get cetirizine - D\n...,Who manufactures cetirizine?
1,14348.txt,who makes bromocriptine\ni am wondering what c...,Who manufactures bromocriptine?
2,1-131985747.xml.txt,SUBJECT: nulytely\nMESSAGE: Hello can you tell...,"Who makes nulytely, and where can I buy it?"
3,15410.txt,Williams' syndrome\nI would like to have my da...,Where can I get genetic testing for william's ...
4,35.txt,ClinicalTrials.gov - Question - general inform...,Where can I get genetic testing for multiple m...


In [4]:
# Summarise column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   File     1000 non-null   object
 1   CHQ      1000 non-null   object
 2   Summary  1000 non-null   object
dtypes: object(3)
memory usage: 23.6+ KB


### Data Preprocessing

In [5]:
import re

In [6]:
def clean_text(text):
    """Normalise a raw string to lowercase ASCII letters separated by single spaces.

    Every character that is neither an ASCII letter nor whitespace (digits,
    punctuation, symbols) is dropped, runs of whitespace are collapsed to one
    space, surrounding whitespace is trimmed and the result is lowercased.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)
    single_spaced = re.sub(r'\s+', ' ', letters_only)
    return single_spaced.strip().lower()

In [7]:
# Normalise both text columns (questions and reference summaries) with clean_text
for text_column in ('CHQ', 'Summary'):
    df[text_column] = df[text_column].apply(clean_text)

In [8]:
# Remove extremely short or extremely long sentences
# Inclusive word-count bounds applied to both the question and its summary.
MIN_WORDS, MAX_WORDS = 3, 80

chq_word_counts = df['CHQ'].str.split().str.len()
summary_word_counts = df['Summary'].str.split().str.len()

# A row is kept only when BOTH texts are within bounds. The original index
# labels are preserved (no reset), so label-based lookups still work.
# .copy() gives later cells an independent frame to add or overwrite columns
# on, instead of a filtered slice that triggers SettingWithCopyWarning.
df = df[
    chq_word_counts.between(MIN_WORDS, MAX_WORDS)
    & summary_word_counts.between(MIN_WORDS, MAX_WORDS)
].copy()

In [9]:
# Re-check the frame's size after the length filter above.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 787 entries, 0 to 999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   File     787 non-null    object
 1   CHQ      787 non-null    object
 2   Summary  787 non-null    object
dtypes: object(3)
memory usage: 24.6+ KB


213 rows are removed by the length filter (1,000 → 787 rows).

In [10]:
from nltk.corpus import stopwords

In [11]:
# Remove stopwords which won't be informative for the model
stop_words = set(stopwords.words("english"))


def _without_stopwords(text):
    """Return ``text`` with every whitespace-separated token found in ``stop_words`` dropped."""
    kept_tokens = (token for token in text.split() if token not in stop_words)
    return ' '.join(kept_tokens)


# The same filter is applied to the questions and to the reference summaries.
for text_column in ('CHQ', 'Summary'):
    df[text_column] = df[text_column].apply(_without_stopwords)

In [12]:
# Preview the text columns after stopword removal.
df.head()

Unnamed: 0,File,CHQ,Summary
0,1-131188152.xml.txt,subject get cetirizine message needwant know m...,manufactures cetirizine
2,1-131985747.xml.txt,subject nulytely message hello tell order nuly...,makes nulytely buy
3,15410.txt,williams syndrome would like daughter tested w...,get genetic testing williams syndrome
4,35.txt,clinicaltrialsgov question general information...,get genetic testing multiple myeloma cost
5,21.txt,genetic test ihhs heart condition commercial g...,get genetic testing ihss texas


## Step 2: Round-Trip Translation

### Translation using a machine translation model

In [13]:
# Using pre-trained MarianMT model for translation
from transformers import MarianMTModel, MarianTokenizer

In [14]:
# Load pretrained model and tokenizer for the translation
# Maps checkpoint name -> (model, tokenizer) so each checkpoint is loaded once.
_TRANSLATION_MODEL_CACHE = {}


def load_translation_model(source: str, dest: str)-> tuple:
    """Return the MarianMT ``(model, tokenizer)`` pair translating ``source`` -> ``dest``.

    Args:
        source: Source language code used in the checkpoint name (e.g. ``"en"``).
        dest: Target language code used in the checkpoint name (e.g. ``"es"``).

    Returns:
        A ``(model, tokenizer)`` tuple for ``Helsinki-NLP/opus-mt-{source}-{dest}``.

    The pair is cached per checkpoint name: repeated calls with the same
    language pair return the same objects instead of loading the checkpoint
    again, which matters when this is called inside a per-question loop.
    Cached models stay in memory until this cell is re-run.
    """
    model_name = f"Helsinki-NLP/opus-mt-{source}-{dest}"

    if model_name not in _TRANSLATION_MODEL_CACHE:
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        _TRANSLATION_MODEL_CACHE[model_name] = (model, tokenizer)

    return _TRANSLATION_MODEL_CACHE[model_name]

In [15]:
# Function to Translate to Pivot Languages and Back
def translate(text: str, model: MarianMTModel, tokenizer: MarianTokenizer)-> str:
    """Translate ``text`` with ``model`` and return the decoded output string.

    Args:
        text: The sentence to translate. The tokenizer call also accepts a list
            of sentences, but only the first generated sequence is decoded and
            returned, so pass one sentence per call.
        model: Translation model used for generation.
        tokenizer: Tokenizer matching ``model``; used to encode the input and
            decode the output.

    Returns:
        The first generated sequence decoded to text, special tokens removed.
    """
    tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Keep the inputs on the same device as the model so generation also works
    # when the model has been moved off the CPU; a no-op when both are on CPU.
    tokens = tokens.to(model.device)
    translated = model.generate(**tokens)

    return tokenizer.decode(translated[0], skip_special_tokens=True)

In [16]:
# Setting up the computer to use the GPU
import torch

# Prefer an NVIDIA GPU, then Apple-silicon MPS, and fall back to the CPU.
# NOTE(review): this only selects and reports the device. In the visible cells
# nothing is moved onto it, so models must be sent there explicitly with
# `.to(device)` for inference to actually run on the accelerator.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Running on: {device}")

Running on: mps


In [17]:
rtt_questions = []
languages = ['es', 'de', 'it', 'zh', 'fr']

# Load the forward (en -> pivot) and backward (pivot -> en) models once per
# pivot language. Loading them inside the question loop would reload every
# checkpoint for every question.
pivot_models = {
    lang: (load_translation_model("en", lang), load_translation_model(lang, "en"))
    for lang in languages
}

for counter, question in enumerate(df['CHQ'], start=1):

    print(f"Processing question {counter}...")
    paraphrases = []
    for lang in languages:
        (forward_model, forward_tokenizer), (backward_model, backward_tokenizer) = pivot_models[lang]

        # Forward and backward translation
        translated_text = translate(question, forward_model, forward_tokenizer)
        round_trip_text = translate(translated_text, backward_model, backward_tokenizer)

        paraphrases.append(round_trip_text)
    rtt_questions.append(paraphrases)

# One list of round-trip paraphrases per question, in the order of `languages`.
df['CHQ_paraphrases'] = rtt_questions

Processing question 1...
Processing question 2...
Processing question 3...
Processing question 4...
Processing question 5...
Processing question 6...
Processing question 7...
Processing question 8...
Processing question 9...
Processing question 10...
Processing question 11...
Processing question 12...
Processing question 13...
Processing question 14...
Processing question 15...
Processing question 16...
Processing question 17...
Processing question 18...
Processing question 19...
Processing question 20...
Processing question 21...
Processing question 22...
Processing question 23...
Processing question 24...
Processing question 25...
Processing question 26...
Processing question 27...
Processing question 28...
Processing question 29...
Processing question 30...
Processing question 31...
Processing question 32...
Processing question 33...
Processing question 34...
Processing question 35...
Processing question 36...
Processing question 37...
Processing question 38...
Processing question 3

In [24]:
# Compare the question labelled 0 with its round-trip paraphrases.
first_question = df.loc[0, 'CHQ']
first_paraphrases = df.loc[0, 'CHQ_paraphrases']
print("Original Question: ", first_question)
print("After translating to five given languages: ", first_paraphrases)

Original Question:  subject get cetirizine message needwant know manufscturs cetirizine walmart looking new supply getting recent
After translating to five given languages:  ['subject get cetirizine message needwant know manufscturs cetirizine walmart looking new supply getting recent', 'Subject get cetirizine message need to know manufactures cetirizine walmart looking for new care get new', 'subject get cetirillina message needy know manufacturers cetirillina Walmart looking for new ever recent supply', 'Retrieving cellirizine information needs to know that new supplies are being sought.', 'topic get cetirizine message need to know manufscturs cetirizine walmart search new offer get recent']


### Translate using Google Translate API

In [18]:
from deep_translator import GoogleTranslator

In [28]:
def google(question, pivot_language):
    """Round-trip ``question`` from English through ``pivot_language`` and back.

    Translates English -> ``pivot_language`` and then ``pivot_language`` ->
    English with ``GoogleTranslator``, returning the back-translated text.
    """
    to_pivot = GoogleTranslator(source='en', target=pivot_language)
    pivot_text = to_pivot.translate(question)

    to_english = GoogleTranslator(source=pivot_language, target='en')
    return to_english.translate(pivot_text)

google_languages = ['es', 'de', 'it', 'zh-CN', 'fr']


def _google_round_trips(question):
    """Return one Google round-trip paraphrase of ``question`` per pivot language."""
    return [google(question, lang) for lang in google_languages]


df['CHQ_google_paraphrases'] = df['CHQ'].apply(_google_round_trips)


In [29]:
# Preview the frame with both paraphrase columns added.
df.head()

Unnamed: 0,File,CHQ,Summary,CHQ_paraphrases,CHQ_google_paraphrases
0,1-131188152.xml.txt,subject get cetirizine message needwant know m...,manufactures cetirizine,[subject get cetirizine message needwant know ...,[Subject Get Cetirizine Message Need Want To K...
2,1-131985747.xml.txt,subject nulytely message hello tell order nuly...,makes nulytely buy,[Nulytely issue message hello say order nulyte...,[Subject nulllytely message hello tell me orde...
3,15410.txt,williams syndrome would like daughter tested w...,get genetic testing williams syndrome,[Williams syndrome as a daughter williams synd...,[Williams Syndrome I would like my daughter to...
4,35.txt,clinicaltrialsgov question general information...,get genetic testing multiple myeloma cost,[clinicaltrialsgov asks general information de...,[Question from Clinicaltrialsgov General infor...
5,21.txt,genetic test ihhs heart condition commercial g...,get genetic testing ihss texas,[genetic test ihhs heart disease commercial ge...,[IHHS Heart Disease Genetic Testing Commercial...


In [31]:
# Side-by-side look at both paraphrase sets for the question labelled 0.
marian_paraphrases = df.loc[0, 'CHQ_paraphrases']
google_paraphrases = df.loc[0, 'CHQ_google_paraphrases']
print("Translation by pretrained model: ", marian_paraphrases)
print("Translation by Google Translate: ", google_paraphrases)

Translation by pretrained model:  ['subject get cetirizine message needwant know manufscturs cetirizine walmart looking new supply getting recent', 'Subject get cetirizine message need to know manufactures cetirizine walmart looking for new care get new', 'subject get cetirillina message needy know manufacturers cetirillina Walmart looking for new ever recent supply', 'Retrieving cellirizine information needs to know that new supplies are being sought.', 'topic get cetirizine message need to know manufscturs cetirizine walmart search new offer get recent']
Translation by Google Translate:  ['Subject Get Cetirizine Message Need Want To Know Manufacturers Cetirizine Walmart Looking For New Supply Get Recent', 'Subject: Cetirizine received message needwant to know manufacturer Cetirizine Walmart looking for new offer always up to date', 'object get cetirizine message needwant to know manufacturers cetirizine walmart looking new supply get recent', 'Topics Get Cetirizine Information Need t