In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from joblib import dump, load
import json

## Preprocessing

In [18]:
data=pd.read_csv("dataset.csv")

In [19]:
print(data.head())
print(data.shape)

            Disease   Symptom_1              Symptom_2              Symptom_3  \
0  Fungal infection     itching              skin_rash   nodal_skin_eruptions   
1  Fungal infection   skin_rash   nodal_skin_eruptions    dischromic _patches   
2  Fungal infection     itching   nodal_skin_eruptions    dischromic _patches   
3  Fungal infection     itching              skin_rash    dischromic _patches   
4  Fungal infection     itching              skin_rash   nodal_skin_eruptions   

              Symptom_4 Symptom_5 Symptom_6 Symptom_7 Symptom_8 Symptom_9  \
0   dischromic _patches       NaN       NaN       NaN       NaN       NaN   
1                   NaN       NaN       NaN       NaN       NaN       NaN   
2                   NaN       NaN       NaN       NaN       NaN       NaN   
3                   NaN       NaN       NaN       NaN       NaN       NaN   
4                   NaN       NaN       NaN       NaN       NaN       NaN   

  Symptom_10 Symptom_11 Symptom_12 Symptom_13 Symp

### Clear the symptom names

In [20]:
for col in data.columns: 
    data[col] = data[col].str.replace('_',' ')
data.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,
1,Fungal infection,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin rash,dischromic patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin rash,nodal skin eruptions,,,,,,,,,,,,,,


In [21]:
data.fillna(0, inplace=True)

In [22]:
data

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin rash,nodal skin eruptions,dischromic patches,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Fungal infection,skin rash,nodal skin eruptions,dischromic patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Fungal infection,itching,nodal skin eruptions,dischromic patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Fungal infection,itching,skin rash,dischromic patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Fungal infection,itching,skin rash,nodal skin eruptions,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,(vertigo) Paroymsal Positional Vertigo,vomiting,headache,nausea,spinning movements,loss of balance,unsteadiness,0,0,0,0,0,0,0,0,0,0,0
4916,Acne,skin rash,pus filled pimples,blackheads,scurring,0,0,0,0,0,0,0,0,0,0,0,0,0
4917,Urinary tract infection,burning micturition,bladder discomfort,foul smell of urine,continuous feel of urine,0,0,0,0,0,0,0,0,0,0,0,0,0
4918,Psoriasis,skin rash,joint pain,skin peeling,silver like dusting,small dents in nails,inflammatory nails,0,0,0,0,0,0,0,0,0,0,0


In [23]:
data["list_of_symptoms"] = 0
for i in range(data.shape[0]):
    values = data.iloc[i].values
    values = values.tolist()
    data["list_of_symptoms"][i] = values[1:values.index(0)]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["list_of_symptoms"][i] = values[1:values.index(0)]


In [24]:
print(data.head())

            Disease   Symptom_1              Symptom_2              Symptom_3  \
0  Fungal infection     itching              skin rash   nodal skin eruptions   
1  Fungal infection   skin rash   nodal skin eruptions    dischromic  patches   
2  Fungal infection     itching   nodal skin eruptions    dischromic  patches   
3  Fungal infection     itching              skin rash    dischromic  patches   
4  Fungal infection     itching              skin rash   nodal skin eruptions   

              Symptom_4 Symptom_5 Symptom_6 Symptom_7 Symptom_8 Symptom_9  \
0   dischromic  patches         0         0         0         0         0   
1                     0         0         0         0         0         0   
2                     0         0         0         0         0         0   
3                     0         0         0         0         0         0   
4                     0         0         0         0         0         0   

  Symptom_10 Symptom_11 Symptom_12 Symptom_13 Symp

In [25]:
contains_zero = data['list_of_symptoms'].apply(lambda x: 0 in x)

print(data[contains_zero])

Empty DataFrame
Columns: [Disease, Symptom_1, Symptom_2, Symptom_3, Symptom_4, Symptom_5, Symptom_6, Symptom_7, Symptom_8, Symptom_9, Symptom_10, Symptom_11, Symptom_12, Symptom_13, Symptom_14, Symptom_15, Symptom_16, Symptom_17, list_of_symptoms]
Index: []


### Collect all the symptoms

In [26]:
all_symptoms = data[['Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4',
       'Symptom_5', 'Symptom_6', 'Symptom_7', 'Symptom_8', 'Symptom_9',
       'Symptom_10', 'Symptom_11', 'Symptom_12', 'Symptom_13', 'Symptom_14',
       'Symptom_15', 'Symptom_16', 'Symptom_17']].values.ravel()
print(all_symptoms)

['itching' ' skin rash' ' nodal skin eruptions' ... 0 0 0]


In [32]:
symptoms=pd.unique(all_symptoms)

mask = symptoms != 0
symptoms = symptoms[mask]

print(symptoms)
print(len(symptoms))

one_hot_data = pd.DataFrame(columns = symptoms,index = data.index)
one_hot_data['list_of_symptoms']=data['list_of_symptoms']

['itching' ' skin rash' ' nodal skin eruptions' ' dischromic  patches'
 ' continuous sneezing' ' shivering' ' chills' ' watering from eyes'
 ' stomach pain' ' acidity' ' ulcers on tongue' ' vomiting' ' cough'
 ' chest pain' ' yellowish skin' ' nausea' ' loss of appetite'
 ' abdominal pain' ' yellowing of eyes' ' burning micturition'
 ' spotting  urination' ' passage of gases' ' internal itching'
 ' indigestion' ' muscle wasting' ' patches in throat' ' high fever'
 ' extra marital contacts' ' fatigue' ' weight loss' ' restlessness'
 ' lethargy' ' irregular sugar level' ' blurred and distorted vision'
 ' obesity' ' excessive hunger' ' increased appetite' ' polyuria'
 ' sunken eyes' ' dehydration' ' diarrhoea' ' breathlessness'
 ' family history' ' mucoid sputum' ' headache' ' dizziness'
 ' loss of balance' ' lack of concentration' ' stiff neck' ' depression'
 ' irritability' ' visual disturbances' ' back pain' ' weakness in limbs'
 ' neck pain' ' weakness of one body side' ' altered sens

In [33]:
one_hot_data

Unnamed: 0,itching,skin rash,nodal skin eruptions,dischromic patches,continuous sneezing,shivering,chills,watering from eyes,stomach pain,acidity,...,foul smell of urine,continuous feel of urine,skin peeling,silver like dusting,small dents in nails,inflammatory nails,blister,red sore around nose,yellow crust ooze,list_of_symptoms
0,,,,,,,,,,,...,,,,,,,,,,"[itching, skin rash, nodal skin eruptions, ..."
1,,,,,,,,,,,...,,,,,,,,,,"[ skin rash, nodal skin eruptions, dischromi..."
2,,,,,,,,,,,...,,,,,,,,,,"[itching, nodal skin eruptions, dischromic ..."
3,,,,,,,,,,,...,,,,,,,,,,"[itching, skin rash, dischromic patches]"
4,,,,,,,,,,,...,,,,,,,,,,"[itching, skin rash, nodal skin eruptions]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,,,,,,,,,,,...,,,,,,,,,,"[ vomiting, headache, nausea, spinning move..."
4916,,,,,,,,,,,...,,,,,,,,,,"[ skin rash, pus filled pimples, blackheads,..."
4917,,,,,,,,,,,...,,,,,,,,,,"[ burning micturition, bladder discomfort, f..."
4918,,,,,,,,,,,...,,,,,,,,,,"[ skin rash, joint pain, skin peeling, silv..."


### Transfer the data into one-hot format

In [34]:
# Fill the data
for i in one_hot_data:
    one_hot_data[i] = data.apply(lambda x:1 if i in x.list_of_symptoms else 0, axis=1)
one_hot_data

Unnamed: 0,itching,skin rash,nodal skin eruptions,dischromic patches,continuous sneezing,shivering,chills,watering from eyes,stomach pain,acidity,...,foul smell of urine,continuous feel of urine,skin peeling,silver like dusting,small dents in nails,inflammatory nails,blister,red sore around nose,yellow crust ooze,list_of_symptoms
0,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4916,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4917,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
4918,0,1,0,0,0,0,0,0,0,0,...,0,0,1,1,1,1,0,0,0,0


In [35]:
one_hot_data['Disease']=data['Disease']
one_hot_data=one_hot_data.drop('list_of_symptoms',axis=1)

  one_hot_data['Disease']=data['Disease']


In [36]:
one_hot_data.rename(columns=lambda x: x.strip(), inplace=True)

In [37]:
one_hot_data

Unnamed: 0,itching,skin rash,nodal skin eruptions,dischromic patches,continuous sneezing,shivering,chills,watering from eyes,stomach pain,acidity,...,foul smell of urine,continuous feel of urine,skin peeling,silver like dusting,small dents in nails,inflammatory nails,blister,red sore around nose,yellow crust ooze,Disease
0,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
2,1,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
3,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,(vertigo) Paroymsal Positional Vertigo
4916,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Acne
4917,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,Urinary tract infection
4918,0,1,0,0,0,0,0,0,0,0,...,0,0,1,1,1,1,0,0,0,Psoriasis


### Check if we have the correct columns

In [42]:
symptoms_set = set(one_hot_data.columns)
print(symptoms_set)

{'prominent veins on calf', 'irritability', 'swollen extremeties', 'malaise', 'bloody stool', 'swelling of stomach', 'puffy face and eyes', 'rusty sputum', 'weight gain', 'blood in sputum', 'yellow urine', 'nausea', 'foul smell of urine', 'blackheads', 'dizziness', 'excessive hunger', 'depression', 'cold hands and feets', 'extra marital contacts', 'dehydration', 'anxiety', 'watering from eyes', 'spotting  urination', 'toxic look (typhos)', 'neck pain', 'receiving blood transfusion', 'shivering', 'cough', 'pain in anal region', 'diarrhoea', 'chest pain', 'stomach pain', 'itching', 'movement stiffness', 'unsteadiness', 'congestion', 'vomiting', 'yellowing of eyes', 'altered sensorium', 'brittle nails', 'headache', 'knee pain', 'internal itching', 'painful walking', 'abdominal pain', 'indigestion', 'increased appetite', 'irregular sugar level', 'fluid overload', 'continuous feel of urine', 'blister', 'muscle weakness', 'loss of appetite', 'constipation', 'history of alcohol consumption', 

## Training

In [45]:
X, y = train_test_split(one_hot_data, test_size=0.2, random_state=25)
X_train = X.drop("Disease",axis=1)
y_train = X["Disease"]
X_test = y.drop("Disease",axis=1)
y_test = y["Disease"]

In [46]:
RF_model = RandomForestClassifier()
RF_model.fit(X_train,y_train)

In [47]:
y_pred=RF_model.predict(X_test)

In [49]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(cross_val_score(RF_model,X_train,y_train,cv=10))

Accuracy: 1.0
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


### Save all symptoms in a text file

In [23]:
new_df = pd.DataFrame(X_test.iloc[302]).transpose()

print(new_df)

      itching  skin rash  nodal skin eruptions  dischromic  patches  \
1008        0          0                     0                    0   

      continuous sneezing  shivering  chills  watering from eyes  \
1008                    0          0       1                   0   

      stomach pain  acidity  ...  bladder discomfort  foul smell of urine  \
1008             0        0  ...                   0                    0   

      continuous feel of urine  skin peeling  silver like dusting  \
1008                         0             0                    0   

      small dents in nails  inflammatory nails  blister  red sore around nose  \
1008                     0                   0        0                     0   

      yellow crust ooze  
1008                  0  

[1 rows x 131 columns]


In [24]:
print(new_df.columns.tolist())

['itching', 'skin rash', 'nodal skin eruptions', 'dischromic  patches', 'continuous sneezing', 'shivering', 'chills', 'watering from eyes', 'stomach pain', 'acidity', 'ulcers on tongue', 'vomiting', 'cough', 'chest pain', 'yellowish skin', 'nausea', 'loss of appetite', 'abdominal pain', 'yellowing of eyes', 'burning micturition', 'spotting  urination', 'passage of gases', 'internal itching', 'indigestion', 'muscle wasting', 'patches in throat', 'high fever', 'extra marital contacts', 'fatigue', 'weight loss', 'restlessness', 'lethargy', 'irregular sugar level', 'blurred and distorted vision', 'obesity', 'excessive hunger', 'increased appetite', 'polyuria', 'sunken eyes', 'dehydration', 'diarrhoea', 'breathlessness', 'family history', 'mucoid sputum', 'headache', 'dizziness', 'loss of balance', 'lack of concentration', 'stiff neck', 'depression', 'irritability', 'visual disturbances', 'back pain', 'weakness in limbs', 'neck pain', 'weakness of one body side', 'altered sensorium', 'dark 

In [25]:
with open('symptoms.txt', 'w') as fp:
    for item in new_df.columns.tolist():
        fp.write("%s\n" % item)
    print('Done')

Done


In [50]:
predictions = RF_model.predict(new_df)
print(predictions)

['Typhoid']


## Exporting Modle

In [27]:
dump(RF_model, 'random_forest_model.joblib')

['random_forest_model.joblib']

In [28]:
loaded_rf_classifier = load('random_forest_model.joblib')

In [29]:
predictions = loaded_rf_classifier.predict(new_df)
print(predictions)

['Typhoid']


## Inferencing

In [30]:
symptoms_names = []

# open file and read the content in a list
with open('symptoms.txt', 'r') as fp:
    for line in fp:
        x = line[:-1]
        symptoms_names.append(x)

# display list
print(symptoms_names)

['itching', 'skin rash', 'nodal skin eruptions', 'dischromic  patches', 'continuous sneezing', 'shivering', 'chills', 'watering from eyes', 'stomach pain', 'acidity', 'ulcers on tongue', 'vomiting', 'cough', 'chest pain', 'yellowish skin', 'nausea', 'loss of appetite', 'abdominal pain', 'yellowing of eyes', 'burning micturition', 'spotting  urination', 'passage of gases', 'internal itching', 'indigestion', 'muscle wasting', 'patches in throat', 'high fever', 'extra marital contacts', 'fatigue', 'weight loss', 'restlessness', 'lethargy', 'irregular sugar level', 'blurred and distorted vision', 'obesity', 'excessive hunger', 'increased appetite', 'polyuria', 'sunken eyes', 'dehydration', 'diarrhoea', 'breathlessness', 'family history', 'mucoid sputum', 'headache', 'dizziness', 'loss of balance', 'lack of concentration', 'stiff neck', 'depression', 'irritability', 'visual disturbances', 'back pain', 'weakness in limbs', 'neck pain', 'weakness of one body side', 'altered sensorium', 'dark 

In [31]:
loaded_rf_classifier = load('random_forest_model.joblib')

In [32]:
name =  '{ "data":["itching", "skin rash", "nodal skin eruptions", "dischromic  patches", "continuous sneezing"]}'
json_object = json.loads(name)
print(type(json_object))
print(type(json_object["data"]))

test_list = ['itching', 'skin rash', 'nodal skin eruptions', 'dischromic  patches', 'continuous sneezing']
df = pd.DataFrame(columns=symptoms_names)
print(df)
new_row = []
for item in symptoms_names:
    if any(item in items for items in test_list):
        new_row.append(1)
    else:
        new_row.append(0) 
df.loc[len(df)] = new_row

print(df)

<class 'dict'>
<class 'list'>
Empty DataFrame
Columns: [itching, skin rash, nodal skin eruptions, dischromic  patches, continuous sneezing, shivering, chills, watering from eyes, stomach pain, acidity, ulcers on tongue, vomiting, cough, chest pain, yellowish skin, nausea, loss of appetite, abdominal pain, yellowing of eyes, burning micturition, spotting  urination, passage of gases, internal itching, indigestion, muscle wasting, patches in throat, high fever, extra marital contacts, fatigue, weight loss, restlessness, lethargy, irregular sugar level, blurred and distorted vision, obesity, excessive hunger, increased appetite, polyuria, sunken eyes, dehydration, diarrhoea, breathlessness, family history, mucoid sputum, headache, dizziness, loss of balance, lack of concentration, stiff neck, depression, irritability, visual disturbances, back pain, weakness in limbs, neck pain, weakness of one body side, altered sensorium, dark urine, sweating, muscle pain, mild fever, swelled lymph node

In [33]:
prediction = loaded_rf_classifier.predict(df)

In [34]:
print(prediction)

['Fungal infection']
