In [169]:
import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

SEED = 42
pd.set_option('display.max_colwidth', None)

## Load and view data

In [170]:
path = 'data/ai-medical-chatbot-data.csv'
df = pd.read_csv(path)

In [171]:
df.sample(5, random_state=SEED)

Unnamed: 0,short_question,short_answer,tags,label
30265,what is a nervous breakdown,nervous breakdown is not a clinical term so it does not mean anything specific to doctors people may use nervous breakdown to describe an episode of severe emotional distress the word “breakdown” may suggest the person showed unusual behavior or stopped performing his or her expected duties at home or work people described as having a nervous breakdown may or may not meet the criteria for a formal psychiatric diagnosis,['mental breakdown'],1.0
24640,i get very paranoid about allergies and death please help,do not be so obsessive about it in last 20 years i have hardly seen any case of anaphylaxis due to allergy its that rare so calm down you wont get anaphylaxis,['paranoia'],1.0
13125,my husband took a 25 mg imitrex for a really bad headache over three hours ago can he also take vicodin,hi start with the ringing well that could be tinnitus have a look on www earthclnic com its all natural stuff you may find some help there if you think you ears are clogged try some warm olive oil in them its worked for me a couple of times as fore smoking did smoke form the age of 11 till 1996 have just resently had a bit of tinnitus but i put this down to using a cutting disc next to my head as i was up a ladder your find tinnitus comes from lots of things load music in ear phones being the worst do hope this helps you good luck,['headache' 'vitamin a'],-1.0
41494,my wife is 72 and was diagnosed with afib 3 years ago is it advisable for her to be on a statin medication now her health has been good in general except for age related issues like arthritis and some hearing loss she has never had any heart problems before the afib never any high blood pressure high cholesterol or anything else her last cholesterol level was 186 triglyceride 128 hdl 68 non hdl cholesterol 118 and ldl 92 i would like to know what the recommendation is for a person with this profile should they be taking a statin medication as a prophylactic measure,hi her numbers are good and keep as far away from statins as she can get she sounds really healthy statins could reverse that if your not up on statins just go to www spacdoc com its a site all by doctors even the site owner is a doctor makes for some very interesting reading ok good luck,['arthritis' 'hearing loss' 'heart' 'high blood pressure' 'cholesterol'],1.0
29503,im having lower back pains and im having pains in my pelvic area and i am leaking urine for no reason i have to use the restroom about every hour i feel sick every time i wake up i have no idea whats going on i have never had a kid before i dont know what kind of doctor i need to see,mupiricin is an antibiotic used for skin infections it will not help poison ivy sorry,['pelvic area' 'nausea' 'vision' 'pain' 'back pain'],-1.0


# Preprocessing

#### Convert the tags column from string to list

In [172]:
def convert_col_to_list(df : pd.DataFrame, column_name : str) -> pd.DataFrame:
    """Parse a column of stringified tag arrays into lists of lowercase tags.

    Each cell is expected to look like ``"['headache' 'vitamin a']"``: quoted
    items separated by whitespace and wrapped in square brackets. The cell is
    turned into ``['headache', 'vitamin a']``.

    Rows are processed positionally, so the frame may carry any index (for
    example after filtering or sampling), not only a default ``RangeIndex``.

    Args:
        df: Frame containing ``column_name``. The column is overwritten on
            this same object.
        column_name: Name of the column to convert.

    Returns:
        The same ``df`` object, with ``column_name`` holding lists of strings.
    """

    def _parse(value) -> list:
        # Cells that are already lists (e.g. the notebook cell was re-run)
        # are only normalised, which keeps the conversion idempotent.
        if isinstance(value, (list, tuple)):
            return [str(item).lower() for item in value if item != '']

        # Missing values (None / NaN / pd.NA) carry no tags.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return []

        # Remove square brackets from the string (if present)
        without_brackets = value.replace('[', '').replace(']', '')

        # Collapse every whitespace run (including line breaks) to one space.
        # NOTE(review): the input looks like a NumPy array repr, which wraps
        # long arrays over several lines; without this step a wrapped cell
        # would not split on "' '" below. Confirm against the raw CSV.
        normalised = ' '.join(without_brackets.split())

        # Split on the quote-space-quote boundary between items, so that
        # multi-word tags such as 'vitamin a' stay intact.
        items = normalised.split("' '")

        # Remove the remaining single quotes from the start and end
        items[0] = items[0].lstrip("'")
        items[-1] = items[-1].rstrip("'")

        # Remove commas, drop empty strings and convert tags to lowercase
        without_commas = [item.replace(',', '') for item in items]
        return [item.lower() for item in without_commas if item != '']

    parsed = [_parse(value) for value in df[column_name]]

    # An explicit object Series keeps each list as a single cell value.
    df[column_name] = pd.Series(parsed, index=df.index, dtype=object)

    return df


df = convert_col_to_list(df, 'tags')

In [173]:
df.sample(5, random_state=SEED)

Unnamed: 0,short_question,short_answer,tags,label
30265,what is a nervous breakdown,nervous breakdown is not a clinical term so it does not mean anything specific to doctors people may use nervous breakdown to describe an episode of severe emotional distress the word “breakdown” may suggest the person showed unusual behavior or stopped performing his or her expected duties at home or work people described as having a nervous breakdown may or may not meet the criteria for a formal psychiatric diagnosis,[mental breakdown],1.0
24640,i get very paranoid about allergies and death please help,do not be so obsessive about it in last 20 years i have hardly seen any case of anaphylaxis due to allergy its that rare so calm down you wont get anaphylaxis,[paranoia],1.0
13125,my husband took a 25 mg imitrex for a really bad headache over three hours ago can he also take vicodin,hi start with the ringing well that could be tinnitus have a look on www earthclnic com its all natural stuff you may find some help there if you think you ears are clogged try some warm olive oil in them its worked for me a couple of times as fore smoking did smoke form the age of 11 till 1996 have just resently had a bit of tinnitus but i put this down to using a cutting disc next to my head as i was up a ladder your find tinnitus comes from lots of things load music in ear phones being the worst do hope this helps you good luck,"[headache, vitamin a]",-1.0
41494,my wife is 72 and was diagnosed with afib 3 years ago is it advisable for her to be on a statin medication now her health has been good in general except for age related issues like arthritis and some hearing loss she has never had any heart problems before the afib never any high blood pressure high cholesterol or anything else her last cholesterol level was 186 triglyceride 128 hdl 68 non hdl cholesterol 118 and ldl 92 i would like to know what the recommendation is for a person with this profile should they be taking a statin medication as a prophylactic measure,hi her numbers are good and keep as far away from statins as she can get she sounds really healthy statins could reverse that if your not up on statins just go to www spacdoc com its a site all by doctors even the site owner is a doctor makes for some very interesting reading ok good luck,"[arthritis, hearing loss, heart, high blood pressure, cholesterol]",1.0
29503,im having lower back pains and im having pains in my pelvic area and i am leaking urine for no reason i have to use the restroom about every hour i feel sick every time i wake up i have no idea whats going on i have never had a kid before i dont know what kind of doctor i need to see,mupiricin is an antibiotic used for skin infections it will not help poison ivy sorry,"[pelvic area, nausea, vision, pain, back pain]",-1.0


#### Converting the questions and answers to lowercase

In [174]:
def to_lowercase(df: pd.DataFrame, cols) -> pd.DataFrame:
    """Lowercase the text of the given columns.

    Args:
        df: Frame holding the text columns; the columns are overwritten on
            this same object.
        cols: Iterable of column names to lowercase.

    Returns:
        The same ``df`` object with every column in ``cols`` lowercased.
    """
    for column_name in cols:
        lowered = df[column_name].str.lower()
        df[column_name] = lowered
    return df
    
df = to_lowercase(df, ['short_question', 'short_answer'])

In [175]:
df.sample(5, random_state=SEED)

Unnamed: 0,short_question,short_answer,tags,label
30265,what is a nervous breakdown,nervous breakdown is not a clinical term so it does not mean anything specific to doctors people may use nervous breakdown to describe an episode of severe emotional distress the word “breakdown” may suggest the person showed unusual behavior or stopped performing his or her expected duties at home or work people described as having a nervous breakdown may or may not meet the criteria for a formal psychiatric diagnosis,[mental breakdown],1.0
24640,i get very paranoid about allergies and death please help,do not be so obsessive about it in last 20 years i have hardly seen any case of anaphylaxis due to allergy its that rare so calm down you wont get anaphylaxis,[paranoia],1.0
13125,my husband took a 25 mg imitrex for a really bad headache over three hours ago can he also take vicodin,hi start with the ringing well that could be tinnitus have a look on www earthclnic com its all natural stuff you may find some help there if you think you ears are clogged try some warm olive oil in them its worked for me a couple of times as fore smoking did smoke form the age of 11 till 1996 have just resently had a bit of tinnitus but i put this down to using a cutting disc next to my head as i was up a ladder your find tinnitus comes from lots of things load music in ear phones being the worst do hope this helps you good luck,"[headache, vitamin a]",-1.0
41494,my wife is 72 and was diagnosed with afib 3 years ago is it advisable for her to be on a statin medication now her health has been good in general except for age related issues like arthritis and some hearing loss she has never had any heart problems before the afib never any high blood pressure high cholesterol or anything else her last cholesterol level was 186 triglyceride 128 hdl 68 non hdl cholesterol 118 and ldl 92 i would like to know what the recommendation is for a person with this profile should they be taking a statin medication as a prophylactic measure,hi her numbers are good and keep as far away from statins as she can get she sounds really healthy statins could reverse that if your not up on statins just go to www spacdoc com its a site all by doctors even the site owner is a doctor makes for some very interesting reading ok good luck,"[arthritis, hearing loss, heart, high blood pressure, cholesterol]",1.0
29503,im having lower back pains and im having pains in my pelvic area and i am leaking urine for no reason i have to use the restroom about every hour i feel sick every time i wake up i have no idea whats going on i have never had a kid before i dont know what kind of doctor i need to see,mupiricin is an antibiotic used for skin infections it will not help poison ivy sorry,"[pelvic area, nausea, vision, pain, back pain]",-1.0


#### Replace symbols and special characters in text with spaces

In [176]:
import re
def remove_non_alphanumeric(df: pd.DataFrame, cols) -> pd.DataFrame:
    """Replace every non-word character in the given columns with a space.

    "Word character" is the regex class ``\\w``: letters, digits and the
    underscore (so underscores are kept despite the function name). Each
    offending character becomes exactly one space, so runs of punctuation
    produce runs of spaces.

    The substitution is applied column-wise rather than with per-cell
    ``df.at`` lookups, so it also works when the index holds duplicate
    labels. Missing values are left untouched.

    Args:
        df: Frame holding the text columns; the columns are overwritten on
            this same object.
        cols: Iterable of column names to clean.

    Returns:
        The same ``df`` object with the cleaned columns.
    """
    # Compiled once and reused for every cell of every column.
    non_word = re.compile(r'[^\w]')

    def _clean(text):
        return non_word.sub(' ', text)

    for col in cols:
        # na_action='ignore' passes NaN/None through without calling _clean.
        df[col] = df[col].map(_clean, na_action='ignore')

    return df
    
df = remove_non_alphanumeric(df, ['short_question', 'short_answer'])

In [177]:
df.sample(5, random_state=SEED)

Unnamed: 0,short_question,short_answer,tags,label
30265,what is a nervous breakdown,nervous breakdown is not a clinical term so it does not mean anything specific to doctors people may use nervous breakdown to describe an episode of severe emotional distress the word breakdown may suggest the person showed unusual behavior or stopped performing his or her expected duties at home or work people described as having a nervous breakdown may or may not meet the criteria for a formal psychiatric diagnosis,[mental breakdown],1.0
24640,i get very paranoid about allergies and death please help,do not be so obsessive about it in last 20 years i have hardly seen any case of anaphylaxis due to allergy its that rare so calm down you wont get anaphylaxis,[paranoia],1.0
13125,my husband took a 25 mg imitrex for a really bad headache over three hours ago can he also take vicodin,hi start with the ringing well that could be tinnitus have a look on www earthclnic com its all natural stuff you may find some help there if you think you ears are clogged try some warm olive oil in them its worked for me a couple of times as fore smoking did smoke form the age of 11 till 1996 have just resently had a bit of tinnitus but i put this down to using a cutting disc next to my head as i was up a ladder your find tinnitus comes from lots of things load music in ear phones being the worst do hope this helps you good luck,"[headache, vitamin a]",-1.0
41494,my wife is 72 and was diagnosed with afib 3 years ago is it advisable for her to be on a statin medication now her health has been good in general except for age related issues like arthritis and some hearing loss she has never had any heart problems before the afib never any high blood pressure high cholesterol or anything else her last cholesterol level was 186 triglyceride 128 hdl 68 non hdl cholesterol 118 and ldl 92 i would like to know what the recommendation is for a person with this profile should they be taking a statin medication as a prophylactic measure,hi her numbers are good and keep as far away from statins as she can get she sounds really healthy statins could reverse that if your not up on statins just go to www spacdoc com its a site all by doctors even the site owner is a doctor makes for some very interesting reading ok good luck,"[arthritis, hearing loss, heart, high blood pressure, cholesterol]",1.0
29503,im having lower back pains and im having pains in my pelvic area and i am leaking urine for no reason i have to use the restroom about every hour i feel sick every time i wake up i have no idea whats going on i have never had a kid before i dont know what kind of doctor i need to see,mupiricin is an antibiotic used for skin infections it will not help poison ivy sorry,"[pelvic area, nausea, vision, pain, back pain]",-1.0


#### Converting label to binary

In [178]:
def convert_label_to_binary(df: pd.DataFrame) -> pd.DataFrame:
    """Recode the ``label`` column from {-1, 1} to {0, 1}.

    ``-1`` becomes ``0`` and ``1`` stays ``1``. ``0`` is mapped to itself so
    that running the conversion again on already-converted labels is a no-op;
    without that entry a second run turns every ``0`` into NaN.

    Any other value is not in the mapping and becomes NaN.
    NOTE(review): this assumes the raw data never uses 0 with a different
    meaning - confirm against the source dataset.

    Args:
        df: Frame with a ``label`` column; the column is overwritten on this
            same object.

    Returns:
        The same ``df`` object with the recoded ``label`` column.
    """
    label_mapping = {-1: 0, 0: 0, 1: 1}
    df['label'] = df['label'].map(label_mapping)
    return df

df = convert_label_to_binary(df)

In [179]:
df.sample(5, random_state=SEED)

Unnamed: 0,short_question,short_answer,tags,label
30265,what is a nervous breakdown,nervous breakdown is not a clinical term so it does not mean anything specific to doctors people may use nervous breakdown to describe an episode of severe emotional distress the word breakdown may suggest the person showed unusual behavior or stopped performing his or her expected duties at home or work people described as having a nervous breakdown may or may not meet the criteria for a formal psychiatric diagnosis,[mental breakdown],1
24640,i get very paranoid about allergies and death please help,do not be so obsessive about it in last 20 years i have hardly seen any case of anaphylaxis due to allergy its that rare so calm down you wont get anaphylaxis,[paranoia],1
13125,my husband took a 25 mg imitrex for a really bad headache over three hours ago can he also take vicodin,hi start with the ringing well that could be tinnitus have a look on www earthclnic com its all natural stuff you may find some help there if you think you ears are clogged try some warm olive oil in them its worked for me a couple of times as fore smoking did smoke form the age of 11 till 1996 have just resently had a bit of tinnitus but i put this down to using a cutting disc next to my head as i was up a ladder your find tinnitus comes from lots of things load music in ear phones being the worst do hope this helps you good luck,"[headache, vitamin a]",0
41494,my wife is 72 and was diagnosed with afib 3 years ago is it advisable for her to be on a statin medication now her health has been good in general except for age related issues like arthritis and some hearing loss she has never had any heart problems before the afib never any high blood pressure high cholesterol or anything else her last cholesterol level was 186 triglyceride 128 hdl 68 non hdl cholesterol 118 and ldl 92 i would like to know what the recommendation is for a person with this profile should they be taking a statin medication as a prophylactic measure,hi her numbers are good and keep as far away from statins as she can get she sounds really healthy statins could reverse that if your not up on statins just go to www spacdoc com its a site all by doctors even the site owner is a doctor makes for some very interesting reading ok good luck,"[arthritis, hearing loss, heart, high blood pressure, cholesterol]",1
29503,im having lower back pains and im having pains in my pelvic area and i am leaking urine for no reason i have to use the restroom about every hour i feel sick every time i wake up i have no idea whats going on i have never had a kid before i dont know what kind of doctor i need to see,mupiricin is an antibiotic used for skin infections it will not help poison ivy sorry,"[pelvic area, nausea, vision, pain, back pain]",0


#### Removing stopwords

In [180]:
# Load NLTK's English stopword list into a set so it can be inspected.
# Among the cells shown here it is only used for the preview below;
# remove_stopwords builds its own set internally.
stop_words = set(stopwords.words('english'))

# View some samples of stopwords to see what they look like
# (every 10th element; set iteration order is arbitrary, so the exact
# sample printed can differ between kernel sessions).
print(list(stop_words)[::10])

['just', 'few', 'he', 'mightn', 'it', 'we', 'any', 'each', 'll', "it's", 'all', 'an', 'doesn', "won't", 'over', 'my', 'at', 'whom']


In [181]:
def remove_stopwords(df: pd.DataFrame, cols) -> pd.DataFrame:
    """Remove NLTK English stopwords from the given text columns.

    Each cell is tokenised with ``word_tokenize``, tokens that appear in the
    stopword set are dropped, and the remaining tokens are joined back with
    single spaces. Matching is an exact, case-sensitive membership test, so
    the text should already be lowercased for stopwords to be recognised.

    The work is done column-wise rather than with per-cell ``df.at``
    lookups, so it also works when the index holds duplicate labels.
    Missing values are left untouched.

    NOTE(review): the stopword list includes negations such as "not", so a
    sentence like "it will not help poison ivy" becomes "help poison ivy".
    Check whether that loss of meaning is acceptable for downstream use.

    Args:
        df: Frame holding the text columns; the columns are overwritten on
            this same object.
        cols: Iterable of column names to process.

    Returns:
        The same ``df`` object with stopwords removed from ``cols``. The
        total number of removed tokens is printed as a side effect.
    """
    stop_words = set(stopwords.words('english'))
    count_removed = 0

    def _strip_stopwords(sentence):
        nonlocal count_removed

        # Get list of tokens in sentence
        sentence_tokens = word_tokenize(sentence)
        kept_tokens = [token for token in sentence_tokens if token not in stop_words]
        count_removed += len(sentence_tokens) - len(kept_tokens)

        # Join list using spaces
        return " ".join(kept_tokens)

    for col in cols:
        # na_action='ignore' passes NaN/None through without tokenising.
        df[col] = df[col].map(_strip_stopwords, na_action='ignore')

    print(f"Removed {count_removed} stopwords from the dataset")

    return df

df = remove_stopwords(df, ['short_question', 'short_answer'])

Removed 2863411 stopwords from the dataset


In [182]:
df.sample(5, random_state=SEED)

Unnamed: 0,short_question,short_answer,tags,label
30265,nervous breakdown,nervous breakdown clinical term mean anything specific doctors people may use nervous breakdown describe episode severe emotional distress word breakdown may suggest person showed unusual behavior stopped performing expected duties home work people described nervous breakdown may may meet criteria formal psychiatric diagnosis,[mental breakdown],1
24640,get paranoid allergies death please help,obsessive last 20 years hardly seen case anaphylaxis due allergy rare calm wont get anaphylaxis,[paranoia],1
13125,husband took 25 mg imitrex really bad headache three hours ago also take vicodin,hi start ringing well could tinnitus look www earthclnic com natural stuff may find help think ears clogged try warm olive oil worked couple times fore smoking smoke form age 11 till 1996 resently bit tinnitus put using cutting disc next head ladder find tinnitus comes lots things load music ear phones worst hope helps good luck,"[headache, vitamin a]",0
41494,wife 72 diagnosed afib 3 years ago advisable statin medication health good general except age related issues like arthritis hearing loss never heart problems afib never high blood pressure high cholesterol anything else last cholesterol level 186 triglyceride 128 hdl 68 non hdl cholesterol 118 ldl 92 would like know recommendation person profile taking statin medication prophylactic measure,hi numbers good keep far away statins get sounds really healthy statins could reverse statins go www spacdoc com site doctors even site owner doctor makes interesting reading ok good luck,"[arthritis, hearing loss, heart, high blood pressure, cholesterol]",1
29503,im lower back pains im pains pelvic area leaking urine reason use restroom every hour feel sick every time wake idea whats going never kid dont know kind doctor need see,mupiricin antibiotic used skin infections help poison ivy sorry,"[pelvic area, nausea, vision, pain, back pain]",0


#### Saving DataFrame to file

In [None]:
# Write the preprocessed frame to the current working directory, omitting
# the DataFrame index (index=False).
# NOTE: `tags` holds Python lists at this point, so the CSV stores each one
# as its string repr (e.g. "['headache', 'vitamin a']"); it has to be parsed
# back into a list (e.g. with ast.literal_eval) after reloading.
df.to_csv('preprocessed-ai-medical-chatbot-data.csv', index=False)  