In [35]:
import pandas as pd
import glob
import numpy as np
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from datetime import datetime
import math
# Suppress warnings
# NOTE(review): filterwarnings('ignore') with no category silences every
# warning for the rest of the session, including any that pandas emits about
# the cells below — consider narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')

In [36]:
# Load the raw OHAS dataset; the path is relative to the notebook's working directory
df = pd.read_csv("OHAS_Dataset.csv")
# Preview the first five rows of the raw frame
df.head()

Unnamed: 0,Disease,Disease_CUI,Symptoms,Symptom_CUI,Weight,Height,Intensity,Severity,Age,Gender,BMI_Level,Region,Season
0,influenza,C0162565,uncoordi162tion,C0039239,68,180,high,medium,24,female,27.9,southwest,Summer
1,influenza,C0162565,fever,C0000737,68,170,low,medium,23,male,33.77,southeast,Summer
2,influenza,C0162565,pleuritic pain,C0235704,68,162,low,low,24,male,33.0,southeast,Summer
3,influenza,C0162565,snuffle,C0030554,68,162,high,medium,34,male,22.705,northwest,Summer
4,influenza,C0162565,throat sore,C0030552,68,185,low,high,21,male,28.88,northwest,Winter


In [37]:
# Summarise column dtypes and non-null counts of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2129 entries, 0 to 2128
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Disease      2129 non-null   object 
 1   Disease_CUI  2119 non-null   object 
 2   Symptoms     2127 non-null   object 
 3   Symptom_CUI  2129 non-null   object 
 4   Weight       2129 non-null   int64  
 5   Height       2129 non-null   int64  
 6   Intensity    2127 non-null   object 
 7   Severity     2128 non-null   object 
 8   Age          2129 non-null   int64  
 9   Gender       2129 non-null   object 
 10  BMI_Level    2129 non-null   float64
 11  Region       2129 non-null   object 
 12  Season       2129 non-null   object 
dtypes: float64(1), int64(3), object(9)
memory usage: 216.4+ KB


In [38]:
# Diseases retained for this analysis. Each entry must match the raw `Disease`
# column verbatim, including spellings such as 'pulmo162ry' and 'uri162ry'.
# NOTE(review): the "162" looks like a corruption in the source CSV
# (apparently standing in for "na") — confirm upstream before relying on it.
Diseases_to_keep = [
    'acquired immuno-deficiency syndrome',
    'arthritis',
    'diabetes',
    'gastroenteritis',
    'hepatitis',
    'hepatitis B',
    'hepatitis C',
    'hemorrhoids',
    'hypertension pulmo162ry',
    'hypothyroidism',
    'gastroesophageal reflux disease',
    'hypoglycemia',
    'asthma',
    'migraine disorders',
    'pneumonia',
    'infection uri162ry tract',
    'degenerative polyarthritis',
]

# Keep only rows whose disease is in the list. The explicit .copy() makes
# `filtered_df` an independent frame, so later cells can modify it without
# chained-assignment ambiguity against `df` (and without SettingWithCopyWarning).
filtered_df = df[df['Disease'].isin(Diseases_to_keep)].copy()

In [39]:
# Mapping of dataset disease names (keys, matched exactly) to the names used
# from here on (values). Names not listed are left unchanged.
# NOTE(review): 'hypertension pulmo162ry' is relabelled as plain
# 'hypertension' — confirm that merging these labels is intended.
Disease_mapping = {
    'degenerative polyarthritis': 'osteoarthritis',
    'infection uri162ry tract': 'urinary tract infection',
    'hepatitis': 'hepatitis A',
    'acquired immuno-deficiency syndrome': 'AIDS',
    'hypertension pulmo162ry': 'hypertension',
    'gastroesophageal reflux disease': 'GERD',
    'migraine disorders': 'migraine'
}

# Replace the diseases with the new names. A new frame is built with .assign
# instead of writing into a column of `filtered_df`, because `filtered_df` was
# produced by selecting rows from `df`; assigning into it directly is the
# chained-assignment pattern that pandas warns about.
filtered_df = filtered_df.assign(
    Disease=filtered_df['Disease'].replace(Disease_mapping)
)

In [40]:
# Preview the first five rows after filtering and renaming
filtered_df.head()

Unnamed: 0,Disease,Disease_CUI,Symptoms,Symptom_CUI,Weight,Height,Intensity,Severity,Age,Gender,BMI_Level,Region,Season
95,hypothyroidism,C0024790,shortness of breath,C0019054,398,162,low,low,23,female,37.62,southeast,Summer
96,hypothyroidism,C0024790,prostatism,C0008031,398,187,low,high,24,female,30.8,southwest,Summer
97,hypothyroidism,C0024790,drowsiness,C0027497,398,187,high,medium,28,male,38.28,southeast,Summer
98,hypothyroidism,C0024790,sleepy,C0011168,398,187,low,high,30,male,19.95,northeast,Summer
99,hypothyroidism,C0024790,hypo162tremia,C0242350,398,190,low,high,24,male,19.3,southwest,Winter


In [41]:
# Summarise dtypes and non-null counts of the filtered frame
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 219 entries, 95 to 2128
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Disease      219 non-null    object 
 1   Disease_CUI  219 non-null    object 
 2   Symptoms     218 non-null    object 
 3   Symptom_CUI  219 non-null    object 
 4   Weight       219 non-null    int64  
 5   Height       219 non-null    int64  
 6   Intensity    219 non-null    object 
 7   Severity     219 non-null    object 
 8   Age          219 non-null    int64  
 9   Gender       219 non-null    object 
 10  BMI_Level    219 non-null    float64
 11  Region       219 non-null    object 
 12  Season       219 non-null    object 
dtypes: float64(1), int64(3), object(9)
memory usage: 24.0+ KB


In [42]:
# Check for duplicates: count rows that exactly repeat an earlier row
filtered_df.duplicated().sum()

0

In [43]:
# Check for missing values: number of NaN entries in each column
filtered_df.isna().sum()

Disease        0
Disease_CUI    0
Symptoms       1
Symptom_CUI    0
Weight         0
Height         0
Intensity      0
Severity       0
Age            0
Gender         0
BMI_Level      0
Region         0
Season         0
dtype: int64

In [44]:
# Drop rows with a missing value in any column. The result is rebound rather
# than using inplace=True: `filtered_df` originates from a row selection of
# `df`, and in-place mutation of such a frame is what pandas flags with
# SettingWithCopyWarning. Rebinding yields the same rows.
filtered_df = filtered_df.dropna()

In [45]:
# Re-check per-column missing counts after the drop; every column should now be 0
filtered_df.isna().sum()

Disease        0
Disease_CUI    0
Symptoms       0
Symptom_CUI    0
Weight         0
Height         0
Intensity      0
Severity       0
Age            0
Gender         0
BMI_Level      0
Region         0
Season         0
dtype: int64

In [46]:
# Re-check row count and per-column non-null counts after dropping incomplete rows
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 218 entries, 95 to 2128
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Disease      218 non-null    object 
 1   Disease_CUI  218 non-null    object 
 2   Symptoms     218 non-null    object 
 3   Symptom_CUI  218 non-null    object 
 4   Weight       218 non-null    int64  
 5   Height       218 non-null    int64  
 6   Intensity    218 non-null    object 
 7   Severity     218 non-null    object 
 8   Age          218 non-null    int64  
 9   Gender       218 non-null    object 
 10  BMI_Level    218 non-null    float64
 11  Region       218 non-null    object 
 12  Season       218 non-null    object 
dtypes: float64(1), int64(3), object(9)
memory usage: 23.8+ KB


In [47]:
# Number of distinct disease names remaining after filtering and renaming
filtered_df["Disease"].nunique()

17

In [48]:
# Count the distinct (non-null) values in every column
filtered_df.nunique()

Disease         17
Disease_CUI     17
Symptoms       130
Symptom_CUI    144
Weight          17
Height          37
Intensity        3
Severity         3
Age             25
Gender           2
BMI_Level      177
Region           4
Season           2
dtype: int64

In [49]:
# Number of rows per gender value, most frequent first
filtered_df['Gender'].value_counts()

Gender
female    121
male       97
Name: count, dtype: int64

In [50]:
# Number of rows with a non-null Disease per region. This counts records,
# not distinct diseases.
disease_count_by_region = (
    filtered_df
    .groupby(by='Region')['Disease']
    .count()
)
print(disease_count_by_region)

Region
northeast    49
northwest    50
southeast    75
southwest    44
Name: Disease, dtype: int64


In [51]:
# Write the cleaned subset to disk with a header row and without the index column
filtered_df.to_csv(
    path_or_buf="OHAS_Cleaned_Dataset.csv",
    header=True,
    index=False,
)