In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from google.colab import files

# Kaggle API Dataset:

In [None]:
! pip install -q kaggle

#Upload your json token file
files.upload()

#Getting the tokens authorised
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

#Check the kaggle api is working!
! kaggle datasets list

Saving kaggle.json to kaggle.json
ref                                                         title                                              size  lastUpdated          downloadCount  
----------------------------------------------------------  ------------------------------------------------  -----  -------------------  -------------  
gpreda/reddit-vaccine-myths                                 Reddit Vaccine Myths                              237KB  2021-12-12 11:59:54          18710  
crowww/a-large-scale-fish-dataset                           A Large Scale Fish Dataset                          3GB  2021-04-28 17:03:01          11365  
imsparsh/musicnet-dataset                                   MusicNet Dataset                                   22GB  2021-02-18 14:12:19           5832  
dhruvildave/wikibooks-dataset                               Wikibooks Dataset                                   2GB  2021-10-22 10:48:21           3989  
nickuzmenkov/nih-chest-xrays-tfrecords    

In [None]:
# Fetch the two CSV files used below from the Kaggle dataset
# itachi9604/disease-symptom-description-dataset (-f selects a single file).
!kaggle datasets download itachi9604/disease-symptom-description-dataset -f dataset.csv
!kaggle datasets download itachi9604/disease-symptom-description-dataset -f Symptom-severity.csv

Downloading dataset.csv to /content
  0% 0.00/617k [00:00<?, ?B/s]
100% 617k/617k [00:00<00:00, 42.0MB/s]


In [None]:
# Load the disease/symptom records and the symptom lookup table.
raw_df = pd.read_csv('dataset.csv')
symptoms = pd.read_csv('Symptom-severity.csv')

# Keep a single row per symptom name. drop_duplicates returns a new frame,
# so the result has to be assigned back to take effect.
symptoms = symptoms.drop_duplicates(subset='Symptom')

raw_df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


# Functions:

In [None]:
# Work on a copy so raw_df stays available unchanged for later cells.
df = raw_df.copy()

# Every symptom column present in the data (Symptom_1 ... Symptom_N).
# Derived from the frame instead of hard-coded so that no symptom column is
# silently left out of the encoding.
SYM_COL = [col for col in df.columns if col.startswith('Symptom_')]

In [None]:
def replacer(data_series, value):
  '''
  Overwrite every strictly positive entry with a single constant.

  Input: data_series - numeric array-like (e.g. a DataFrame column)
         value       - the constant written wherever an entry is > 0
  return: a NumPy array of the same length; entries that are not > 0
          are passed through unchanged
  '''
  is_positive = data_series > 0
  return np.where(is_positive, value, data_series)

In [None]:
# One-hot encode each symptom column separately.
# Series.str.get_dummies splits every cell on ', ' and returns one 0/1
# indicator column per distinct token (columns sorted by name), keeping the
# row index of the input. Empty cells are labelled 'missing' beforehand.
encoded_dfs = {}
for col in SYM_COL:
    encoded_dfs[col] = df[col].fillna('missing').str.get_dummies(sep=', ')

# NOTE(review): symptom strings are used exactly as read from the CSV. If they
# carry stray leading/trailing spaces, one symptom can end up in two columns --
# confirm whether a .str.strip() is wanted before encoding.

# Merge the per-column encodings into a single frame.
# DataFrame.add aligns on column name; fill_value=0 treats a symptom that is
# absent from one side as 0, so each cell ends up holding the number of
# symptom columns in which that symptom was reported for the row.
onehot = None
for encoded_df in encoded_dfs.values():
    onehot = encoded_df if onehot is None else onehot.add(encoded_df, fill_value=0)

# Aligning with fill_value can produce floats; cast back to integers once.
onehot = onehot.astype(int)

#Check all is performing as planned
onehot.sample(1)

In [None]:
#replace the numeric values into 0/1 only
for symptom in onehot:
  onehot[symptom] = replacer(onehot[symptom], 1)

#joining the onehot flags to the "Disease" column of the original data
# The frame is rebuilt from raw_df (rather than dropping columns from df in
# place) so the cell gives the same result every time it is run. The raw
# symptom columns are simply not carried over, and the joined frame has no
# 'index' column that would need dropping.
df = raw_df[['Disease']].join(onehot, how='outer')

#Final check that it looks as planned
df.head()

Unnamed: 0,Disease,abdominal_pain,acidity,altered_sensorium,anxiety,back_pain,blackheads,bladder_discomfort,blister,bloody_stool,blurred_and_distorted_vision,breathlessness,brittle_nails,bruising,burning_micturition,chest_pain,chills,cold_hands_and_feets,constipation,continuous_feel_of_urine,continuous_sneezing,cough,cramps,dark_urine,dehydration,depression,diarrhoea,dischromic _patches,distention_of_abdomen,dizziness,drying_and_tingling_lips,enlarged_thyroid,excessive_hunger,extra_marital_contacts,family_history,fast_heart_rate,fatigue,fluid_overload,foul_smell_of urine,headache,...,puffy_face_and_eyes,pus_filled_pimples,red_sore_around_nose,restlessness,scurring,shivering,silver_like_dusting,skin_peeling,skin_rash,slurred_speech,small_dents_in_nails,spinning_movements,spotting_ urination,stiff_neck,stomach_pain,sunken_eyes,sweating,swelled_lymph_nodes,swelling_joints,swelling_of_stomach,swollen_blood_vessels,swollen_extremeties,swollen_legs,throat_irritation,toxic_look_(typhos),ulcers_on_tongue,unsteadiness,visual_disturbances,vomiting,watering_from_eyes,weakness_in_limbs,weakness_of_one_body_side,weight_gain,weight_loss,yellow_crust_ooze,yellow_urine,yellowing_of_eyes,yellowish_skin,itching,missing
0,Fungal infection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
1,Fungal infection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,Fungal infection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
3,Fungal infection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
4,Fungal infection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1


## Summed values model (Optional):

This optional step takes the cleaning one stage further: the rows are aggregated per disease, so that each diagnosis is linked to every symptom recorded for it.

In [None]:
# Collapse to one row per disease; each symptom column becomes the number of
# records of that disease in which the symptom was flagged.
res = df.groupby(by='Disease').sum()

Unnamed: 0_level_0,abdominal_pain,altered_sensorium,anxiety,blackheads,blister,bloody_stool,blurred_and_distorted_vision,breathlessness,bruising,burning_micturition,chest_pain,chills,cold_hands_and_feets,continuous_feel_of_urine,cough,dark_urine,dehydration,diarrhoea,dischromic _patches,dizziness,extra_marital_contacts,fatigue,foul_smell_of urine,headache,high_fever,hip_joint_pain,joint_pain,knee_pain,lethargy,loss_of_appetite,loss_of_balance,mood_swings,movement_stiffness,nausea,neck_pain,nodal_skin_eruptions,obesity,pain_in_anal_region,red_sore_around_nose,restlessness,scurring,silver_like_dusting,skin_peeling,spinning_movements,stomach_pain,sweating,swelling_joints,swelling_of_stomach,ulcers_on_tongue,vomiting,watering_from_eyes,weakness_of_one_body_side,weight_loss,yellowish_skin
Disease,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1
(vertigo) Paroymsal Positional Vertigo,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,102,0,0,0,0,0,0,0,0,0,18,0,0,0,0,0,0,0,0,0,0
AIDS,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,30,0,0,0,90,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Acne,0,0,0,90,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,30,0,0,0,0,0,0,0,0,0,0,0,0,0
Alcoholic hepatitis,102,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,18,0,0,0,0,0,0
Allergy,0,0,0,0,0,0,0,0,0,0,0,84,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,0,0,0


In [None]:
# (rows, columns) of the grouped table: one row per Disease group.
res.shape

(41, 57)

In [None]:
# Turn the per-disease counts into 0/1 flags: any positive count becomes 1.
for symptom in res.columns:
  res[symptom] = replacer(res[symptom], 1)

In [None]:
# Write the per-disease table to a pickle file in the working directory.
res.to_pickle('OtherModelOption.pkl')

# Saving Cleaned Data

In [None]:
# Write the cleaned per-record frame to a pickle file in the working directory.
df.to_pickle('Cleaned.pkl')