## Import Libraries

In [15]:
import pandas as pd
import numpy as np
import random

import matplotlib.pyplot as plt
import seaborn as sns

SEED = 42
pd.set_option('display.max_colwidth', None)

## Load and view data

In [20]:
path = 'data/ai-medical-chatbot.csv'
df = pd.read_csv(path)

In [17]:
df.sample(5, random_state=SEED)

ValueError: Cannot take a larger sample than population when 'replace=False'

## Data Exploration

#### Shape of the dataset

In [18]:
print(f"Number of rows is {df.shape[0]}")
print(f"Number of columns is {df.shape[1]}")

Number of rows is 2
Number of columns is 1


#### Bar Plot of the Most Commonly Occuring Symptoms

#### Distribution of description length

In [19]:
# Obtain the length of each Description, counting words by splitting on spaces
description_word_count = df['Description'].apply(lambda x: len(x.split()))

# Create the histogram
sns.histplot(description_word_count, bins=100)
plt.title('Distribution of Word Count in Descriptions')
plt.xlabel('Word Count')
plt.ylabel('Frequency')

min_length = 0
max_length = 40
plt.xlim(min_length, max_length) 

plt.show()


KeyError: 'Description'

In [6]:
outside_range_count = 0
for count in description_word_count:
    if count < min_length or count > max_length:
        outside_range_count += 1
        
print(f"There are {outside_range_count} desctiptions that are less than {min_length} and more than {max_length} words")

NameError: name 'description_word_count' is not defined

It can be seen that most of the questions are relatively short and have a length that is under 40 words with most lying between 0 and 20 words. Having a lot of short descriptions is helpful since it will easy the amount of computation and make it easier to match the user queries against the descriptions to obtain the closest match.

#### Distribution of Patient Question length

In [7]:
# Obtain the length of each Patient Question, counting words by splitting on spaces
patient_word_count = df['Patient'].apply(lambda x: len(x.split()))

# Create the histogram
sns.histplot(patient_word_count, bins=200, kde=True)
plt.title('Distribution of Word Count in Patient Questions')
plt.xlabel('Word Count')
plt.ylabel('Frequency')

min_length = 0
max_length = 300
plt.xlim(min_length, max_length) 

plt.show()

KeyError: 'Patient'

In [8]:
outside_range_count = 0
for count in patient_word_count:
    if count < min_length or count > max_length:
        outside_range_count += 1
        
print(f"There are {outside_range_count} patient questions that are less than {min_length} and more than {max_length} words")

NameError: name 'patient_word_count' is not defined

The patient questions are much longer than their descriptions. It can be seen that most of the questions asked by the patients tend to lie between 50 and 100 words

#### Distribution of Doctor Answer

In [23]:
# Obtain the length of each Patient Question, counting words by splitting on spaces
doctor_word_count = df['Doctor'].apply(lambda x: len(x.split()))

# Create the histogram
sns.histplot(doctor_word_count, bins=200, kde=True)
plt.title('Distribution of Word Count in Doctor Answers')
plt.xlabel('Word Count')
plt.ylabel('Frequency')

min_length = 0
max_length = 300
plt.xlim(min_length, max_length) 

plt.show()

KeyError: 'Doctor'

In [None]:
outside_range_count = 0
for count in doctor_word_count:
    if count < min_length or count > max_length:
        outside_range_count += 1
        
print(f"There are {outside_range_count} patient questions that are less than {min_length} and more than {max_length} words")

NameError: name 'doctor_word_count' is not defined

The doctor's answers to the patient's queries are understandably longer. The above disctibution seems close to being normally distributed. Most responses seem to lie in the range of 50-150 words. A peak in response length can be seen around 60 words. This will help our AI chatbot produce detailed answers.

Bar Plot (Distribution of Labels)

In [None]:


label_counts = df['Label'].value_counts()
plt.figure(figsize=(8, 4))
label_counts.plot(kind='bar')
plt.title('Distribution of Labels')
plt.xlabel('Label')
plt.ylabel('Frequency')
plt.xticks(rotation=0)
plt.grid(True)
plt.show()

KeyError: 'Label'

Scatter Plot (response length vs label)

In [None]:
#relationship between label distribution and other feature (i.e length of response)
df['Response Length'] = df['Response'].apply(len)

plt.figure(figsize=(10, 5))
plt.scatter(df['Response Length'], df['Label'], alpha=0.5)
plt.title('Scatter plot of Response Length vs Label')
plt.xlabel('Response Length')
plt.ylabel('Label')
plt.grid(True)
plt.show()

KeyError: 'Response'