In [None]:
import pandas as pd
import glob

# Read all the csv files in the current directory
all_files = glob.glob('../data/*.csv')

for filename in all_files:
    df=pd.read_csv(filename)
    # get the name of the file
    name = filename.split('/')[-1].split('.')[0]
    print(f'{name}: has {len(df)} rows, {df.shape[1]} columns')


In [None]:
import pandas as pd

file = '../data/Symptom-severity.csv'

df = pd.read_csv(file)

# print(df.head())
# print the first column
print(df.iloc[:, 0].values)


In [None]:
import sys
sys.path.append('..')
sys.path.append('.')

from diagnostic_assistant.preprocess.load_data import *

df_symptoms = get_symptom_severity()
symptoms = df_symptoms.iloc[:, 0].values
# print(symptoms)

# convert the symptoms to labels, and the labels to symptoms
symptom_to_label = {symptom.strip(): i+1 for i, symptom in enumerate(symptoms)}
label_to_symptom = {i+1: symptom.strip() for i, symptom in enumerate(symptoms)}

print("symptom_to_label:")
print(symptom_to_label)
print("label_to_symptom:")
print(label_to_symptom)


df_disease = get_precautions()
diseases = df_disease.iloc[:, 0].values
# cluster the diseases
diseases = list(set(diseases))
# convert the diseases to labels, and the labels to diseases
disease_to_label = {disease.strip(): i+1 for i, disease in enumerate(diseases)}
label_to_disease = {i+1: disease.strip() for i, disease in enumerate(diseases)}

print(disease_to_label)
print(label_to_disease)

max_len = 1
idx = 0
df_data = get_dataset()
# iterate through the dataset and convert the symptoms and diseases to labels
for idx, row in df_data.iterrows():
    disease = row[0]
    # remove the trailing whitespace
    disease = disease.strip()
    # convert the disease to label
    label = disease_to_label[disease]
    # set the label
    row[0] = label
    # iterate through the symptoms
    for i in range(1, len(row)):
        symptom = row[i]
        if type(symptom) == float:
            row[i] = 0
            continue
        if i > max_len:
            max_len = i
            idx = idx
        # remove the trailing whitespace
        # print(symptom)
        symptom = symptom.strip()
        # convert the symptom to label
        label = symptom_to_label[symptom]
        # set the label
        row[i] = label

print(df_data.head())


In [None]:
import numpy as np

def conver_to_onhot(symptoms):
    onhot_symptoms = np.zeros(len(symptom_to_label))
    for symptom in symptoms:
        if symptom == 0:
            continue
        onhot_symptoms[symptom-1] = 1
        # print(symptom)
    return onhot_symptoms


def conver_dis_to_onhot(disease):
    onhot_disease = np.zeros(len(disease_to_label))
    onhot_disease[disease-1] = 1
    return onhot_disease

# test
# print(df_data.iloc[0, 1:].values)
# print(conver_to_onhot(df_data.iloc[0, 1:].values))

In [None]:
# a 5 layer neural network, sklearn 
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# split the data into train and test
X = df_data.iloc[:, 1:].values
y = df_data.iloc[:, 0].values

X_new = []
for x in X:
    # randomly drop N symptoms
    for i in range(len(x)):
        # randomly select N symptoms from x
        xn = []
        for j in range(len(x)):
            if np.random.rand() > 0.5:
                xn.append(x[j])
        if len(xn) == 0:
          continue
        else:
          X_new.append(xn)            
            
    

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0,2, random_state=42)

# convert the symptoms to onehot
X_train = np.array([conver_to_onhot(symptoms) for symptoms in X_train])
X_test = np.array([conver_to_onhot(symptoms) for symptoms in X_test])


# convert the diseases to onehot
y_train = np.array([conver_dis_to_onhot(disease) for disease in y_train])
y_test = np.array([conver_dis_to_onhot(disease) for disease in y_test])


# create the model
model = MLPClassifier(hidden_layer_sizes=(100, 100, 100, 100), max_iter=1000, alpha=0.0001,
                     solver='adam', verbose=10,  random_state=21,tol=0.000000001, batch_size=32)

# train the model
model.fit(X_train, y_train)

# test the model
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))


In [None]:
# save the model
import pickle
filename = './model/trained/finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))


In [None]:
# test 
test_sample = X_test[0]
test_sample_label = y_test[0]
print(test_sample)
print(test_sample_label)

# convert onehot to symptoms
def convert_to_symptoms(onehot_symptoms):
    symptoms = []
    for i in range(len(onehot_symptoms)):
        if onehot_symptoms[i] == 1:
            symptoms.append(label_to_symptom[i+1])
    return symptoms

# convert onehot to disease
def convert_to_disease(onehot_disease):
    for i in range(len(onehot_disease)):
        if onehot_disease[i] == 1:
            return label_to_disease[i+1]
        

print(convert_to_symptoms(test_sample))
print(convert_to_disease(test_sample_label))

# do the prediction, show the posibility of each disease
y_pred = model.predict_proba([test_sample])
print(y_pred)


In [85]:
# when some symptoms is mussed 
test_sample= [0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 0,0,0,1,1,0,0,0,0,0,0,0]

predict = model.predict([test_sample])
print(predict)
print(convert_to_disease(predict[0]))
print(convert_to_disease(y_test[0]))

test_sample= [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 0,0,0,1,1,0,0,0,0,0,0,0]

predict = model.predict([test_sample])
print(predict)
print(convert_to_disease(predict[0]))
print(convert_to_disease(y_test[0]))

test_sample= [0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 0,0,0,0,0,0,0,0,0,0,0,0]
print(convert_to_symptoms(test_sample))
predict = model.predict_proba([test_sample])
print(predict)
# get top3 diseases
top3 = np.argsort(predict[0])[-10:]
print(top3)
print([convert_to_disease(conver_dis_to_onhot(label)) for label in top3])
print(convert_to_disease(y_test[0]))


[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0]]
Acne
Acne
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0]]
Acne
Acne
['skin_rash']
[[7.56194219e-06 1.42070140e-08 1.16358050e-18 2.03740786e-09
  7.59171282e-22 2.32567819e-13 9.78935626e-13 3.05352365e-29
  2.80405128e-16 9.86519691e-11 8.69531482e-09 1.14975809e-07
  5.53931505e-12 1.63587075e-01 1.52891126e-19 1.06619683e-10
  8.66827434e-10 1.90693094e-16 1.16948699e-12 1.87887047e-14
  6.13794817e-22 9.80777269e-08 9.96339933e-09 1.16241622e-12
  6.53038091e-05 3.18825259e-18 1.64098460e-08 6.19849830e-17
  6.15027120e-07 1.88238886e-08 9.30160728e-18 2.16511059e-13
  7.73432469e-17 3.46189648e-17 4.69291496e-12 1.23214764e-14
  1.01510831e-05 2.26715487e-19 2.29240339e-10 2.14566846e-05
  1.29882409e-05]]
[29 21 11 28  0 36 40 39 24 13]
['Arthritis', 'hepatitis A', 'Osteoarthristis', 'GERD', 'Paralysis (brain hemorrhage)', 'Hepatitis B', 'Psoriasis', 'Mi