# Implementation

## Step 1: Dataset Preparation

In [1]:
import pandas as pd

### Load the dataset

In [2]:
# Load the MeQSum spreadsheet into a DataFrame. The path is relative, so the
# .xlsx file must sit in the notebook's working directory.
df = pd.read_excel("MeQSum_ACL2019_BenAbacha_Demner-Fushman.xlsx")

In [3]:
# Preview the first rows to check that the File, CHQ and Summary columns loaded.
df.head()

Unnamed: 0,File,CHQ,Summary
0,1-131188152.xml.txt,SUBJECT: who and where to get cetirizine - D\n...,Who manufactures cetirizine?
1,14348.txt,who makes bromocriptine\ni am wondering what c...,Who manufactures bromocriptine?
2,1-131985747.xml.txt,SUBJECT: nulytely\nMESSAGE: Hello can you tell...,"Who makes nulytely, and where can I buy it?"
3,15410.txt,Williams' syndrome\nI would like to have my da...,Where can I get genetic testing for william's ...
4,35.txt,ClinicalTrials.gov - Question - general inform...,Where can I get genetic testing for multiple m...


In [4]:
# Check row count, column dtypes and non-null counts before any cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   File     1000 non-null   object
 1   CHQ      1000 non-null   object
 2   Summary  1000 non-null   object
dtypes: object(3)
memory usage: 23.6+ KB


### Data Preprocessing

In [5]:
import re

In [6]:
def clean_text(text):
    """Normalize free text to lowercase ASCII letters separated by single spaces.

    Every character that is not an ASCII letter or whitespace (digits,
    punctuation, accented letters, ...) is deleted, runs of whitespace are
    collapsed to a single space, and the result is stripped and lower-cased.

    Parameters
    ----------
    text : str
        Raw text. Non-string values (such as ``None`` or a float ``NaN`` for
        a missing cell) are treated as empty text instead of raising.

    Returns
    -------
    str
        The cleaned text; ``""`` when nothing remains or the input was not a
        string.
    """
    # re.sub raises TypeError on non-strings, so map missing values to "".
    if not isinstance(text, str):
        return ''

    # Characters are deleted, not replaced by a space, so "ClinicalTrials.gov"
    # becomes "clinicaltrialsgov" rather than two separate words.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Collapse newlines, tabs and repeated spaces into one space.
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()

In [7]:
# Clean the question and summary text in place, one column at a time.
for text_col in ('CHQ', 'Summary'):
    df[text_col] = df[text_col].apply(clean_text)

In [8]:

# Remove extremely short or extremely long texts: keep only rows where both
# the question and its summary have between MIN_WORDS and MAX_WORDS words
# (inclusive), counting whitespace-separated words.
MIN_WORDS, MAX_WORDS = 5, 300

chq_word_count = df['CHQ'].str.split().str.len()
summary_word_count = df['Summary'].str.split().str.len()

# One combined mask instead of four successive filters. The explicit .copy()
# makes the result an independent frame, so adding columns to it later does
# not raise SettingWithCopyWarning. The original index labels are kept.
df = df[
    chq_word_count.between(MIN_WORDS, MAX_WORDS)
    & summary_word_count.between(MIN_WORDS, MAX_WORDS)
].copy()


In [9]:
# Re-check the row count after the length filter.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 974 entries, 2 to 999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   File     974 non-null    object
 1   CHQ      974 non-null    object
 2   Summary  974 non-null    object
dtypes: object(3)
memory usage: 30.4+ KB


In [10]:
from textblob import TextBlob

The length filter removed 26 rows, leaving 974 of the original 1,000.

In [None]:
# Split each text column into words with TextBlob and store the result in a
# companion "<column>_tokens" column.
def _tokenize(text):
    """Return the words TextBlob extracts from `text`."""
    return TextBlob(text).words


for source_col, token_col in (('CHQ', 'CHQ_tokens'), ('Summary', 'Summary_tokens')):
    df[token_col] = df[source_col].apply(_tokenize)

In [14]:
from nltk.corpus import stopwords

In [15]:
# Drop English stopwords, which carry little information for the model.
# A set gives constant-time membership checks.
stop_words = set(stopwords.words("english"))


def _drop_stopwords(tokens):
    """Return `tokens` as a plain list without any entry found in `stop_words`."""
    return [token for token in tokens if token not in stop_words]


for token_col in ('CHQ_tokens', 'Summary_tokens'):
    df[token_col] = df[token_col].apply(_drop_stopwords)

In [16]:
# Inspect the frame with the new CHQ_tokens and Summary_tokens columns.
df.head()

Unnamed: 0,File,CHQ,Summary,CHQ_tokens,Summary_tokens
2,1-131985747.xml.txt,subject nulytely message hello can you tell me...,who makes nulytely and where can i buy it,"[subject, nulytely, message, hello, tell, orde...","[makes, nulytely, buy]"
3,15410.txt,williams syndrome i would like to have my daug...,where can i get genetic testing for williams s...,"[williams, syndrome, would, like, daughter, te...","[get, genetic, testing, williams, syndrome]"
4,35.txt,clinicaltrialsgov question general information...,where can i get genetic testing for multiple m...,"[clinicaltrialsgov, question, general, informa...","[get, genetic, testing, multiple, myeloma, cost]"
5,21.txt,genetic test for ihhs heart condition is there...,where can i get genetic testing for ihss in texas,"[genetic, test, ihhs, heart, condition, commer...","[get, genetic, testing, ihss, texas]"
6,1-136003557.xml.txt,subject friedreichs ataxia message i have been...,where can i get genetic testing for friedreich...,"[subject, friedreichs, ataxia, message, told, ...","[get, genetic, testing, friedreichs, treatments]"
