## Disease Prediction from Symptoms

Dataset source: raw data from the [Columbia DBMI Disease-Symptom Knowledge Base](http://people.dbmi.columbia.edu/~friedma/Projects/DiseaseSymptomKB/index.html).

In [1]:
# Import Dependencies
import csv
import xlrd
import pandas as pd
import numpy as np
from collections import defaultdict
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the raw disease/symptom spreadsheet; the path is relative to the
# notebook's working directory.
df = pd.read_excel('./data/raw_data.xlsx')

In [3]:
df.head()

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0011847_diabetes,1421.0,UMLS:C0032617_polyuria
1,,,UMLS:C0085602_polydypsia
2,,,UMLS:C0392680_shortness of breath
3,,,UMLS:C0008031_pain chest


In [4]:
# In the raw sheet only the first row of each disease carries the disease name
# and occurrence count; the following symptom rows leave those cells empty.
# Forward-fill so every symptom row is labelled with its disease and count.
# `DataFrame.ffill()` replaces `fillna(method='ffill')`, which is deprecated
# in recent pandas releases.
data = df.ffill()

In [5]:
data.head()

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0011847_diabetes,1421.0,UMLS:C0032617_polyuria
1,UMLS:C0011847_diabetes,1421.0,UMLS:C0085602_polydypsia
2,UMLS:C0011847_diabetes,1421.0,UMLS:C0392680_shortness of breath
3,UMLS:C0011847_diabetes,1421.0,UMLS:C0008031_pain chest


In [6]:
list(data)

['Disease', 'Count of Disease Occurrence', 'Symptom']

In [7]:
# Process Disease and Symptom Names
def process_data(data):
    """Extract the readable names from a raw knowledge-base entry.

    The entry is split on '_' (with '^' treated as an additional separator),
    and every second token is kept. For an entry such as
    'UMLS:C0011847_diabetes' this yields ['diabetes'].

    Parameters
    ----------
    data : str
        Raw entry text.

    Returns
    -------
    list of str
        The tokens found at odd positions (2nd, 4th, ...) after splitting;
        empty when the entry contains no separator.
    """
    tokens = data.replace('^', '_').split('_')
    # Tokens alternate identifier, name, identifier, name, ... so the names
    # are the items at indices 1, 3, 5, ...
    return tokens[1::2]

In [8]:
disease_list = []
disease_symptom_dict = defaultdict(list)
disease_symptom_count = {}
count = 0


def _has_value(cell):
    """Return True when a spreadsheet cell holds real text.

    Blank strings and non-breaking-space placeholders count as empty.
    Non-string cells (e.g. NaN left over from missing data) are also treated
    as empty so they are never handed to `process_data`, which needs a str.
    """
    if not isinstance(cell, str):
        return False
    # "\xc2\xa0" is the UTF-8 byte pair for a non-breaking space written as a
    # Python 2 style literal; on Python 3 it is a two-character string, so the
    # original comparison never matched a real NBSP ("\xa0"). The legacy value
    # is still rejected, and str.strip() takes care of "\xa0" and any other
    # whitespace-only cell.
    return cell != "\xc2\xa0" and cell.strip() != ""


for idx, row in data.iterrows():

    # Get the Disease Names (and remember the occurrence count that goes with them)
    if _has_value(row['Disease']):
        disease = row['Disease']
        disease_list = process_data(data=disease)
        count = row['Count of Disease Occurrence']

    # Get the Symptoms Corresponding to Diseases
    if _has_value(row['Symptom']):
        symptom = row['Symptom']
        symptom_list = process_data(data=symptom)
        for d in disease_list:
            # Only touch the defaultdict when there is something to add, so a
            # symptom cell that yields no names does not create an empty entry.
            if symptom_list:
                disease_symptom_dict[d].extend(symptom_list)
            disease_symptom_count[d] = count

In [9]:
# See that the data is Processed Correctly
disease_symptom_dict

defaultdict(list,
            {'diabetes': ['polyuria',
              'polydypsia',
              'shortness of breath',
              'pain chest']})

In [10]:
# Count of Disease Occurrence for each Disease
disease_symptom_count

{'diabetes': 1421.0}

In [11]:
# One row per disease; the 'Symptom' cell holds that disease's whole symptom list.
disease_rows = [(name, symptoms) for name, symptoms in disease_symptom_dict.items()]
df1 = pd.DataFrame(disease_rows, columns=['Disease', 'Symptom'])

In [12]:
df1.head()

Unnamed: 0,Disease,Symptom
0,diabetes,"[polyuria, polydypsia, shortness of breath, pa..."


In [13]:
# First row of df1 as a Series with the labels 'Disease' and 'Symptom'.
disease = df1.iloc[0]


In [14]:
# First row of df1: position 0 is the disease name, position 1 its symptom list.
sym1 = df1.iloc[0,:]
# Use .iloc for positional access: `disease` is labelled 'Disease'/'Symptom',
# and integer keys on a non-integer index are deprecated as positional lookups
# in recent pandas releases.
RealDisease = "You having the symptoms of "+disease.iloc[0]+""
# NOTE(review): the two messages below hard-code "diabetics" while RealDisease
# uses the disease name taken from the data - confirm this is intended.
IntermediateDisease = "You might have some symptoms on diabetics"
NoDisease = "You don't have any symptoms on diabetics"

In [17]:
def ask_question(qn):
    """Prompt the user with `qn` and interpret the reply as yes/no.

    Parameters
    ----------
    qn : str
        Prompt text passed to `input`.

    Returns
    -------
    bool
        True when the reply is "yes" or "y", ignoring letter case and
        surrounding whitespace; False for any other reply.
    """
    resp = input(qn)
    # Strip before comparing so replies such as "yes " or " Y" are not
    # silently counted as "no".
    return resp.strip().lower() in ("yes", "y")

def _response_for(symptom_answers):
    """Pick the message for one tuple of yes/no symptom answers.

    All "yes" -> RealDisease, all "no" -> NoDisease, any mixture ->
    IntermediateDisease.
    """
    if all(symptom_answers):
        return RealDisease
    if not any(symptom_answers):
        return NoDisease
    return IntermediateDisease


# Lookup table covering every yes/no combination of the four symptom answers.
# The hand-written table repeated several keys and left out four mixed
# combinations - (T,F,T,F), (T,F,F,T), (F,T,T,F), (F,T,F,T) - so looking those
# answers up raised KeyError. Generating the table keeps every previously
# listed key mapped to the same message and fills the gaps.
_YES_NO = (True, False)
responses = {
    (first, second, third, fourth): _response_for((first, second, third, fourth))
    for first in _YES_NO
    for second in _YES_NO
    for third in _YES_NO
    for fourth in _YES_NO
}

In [None]:
# Ask one yes/no question per symptom of the selected disease and collect the
# answers in order.
answers = []
# Position 1 of the row is the symptom list; .iloc is used because integer
# keys on a string-labelled Series are deprecated as positional lookups in
# recent pandas releases.
lst = sym1.iloc[1]
for q in lst:
    questions = "Do you feel "+q+" "
    answers.append(ask_question(questions))

In [None]:
# Look up the message for the collected answers. If the table has no entry for
# this exact combination (or for this number of answers), derive the message
# from the answers instead of failing with KeyError.
answer_key = tuple(answers)
if answer_key in responses:
    verdict = responses[answer_key]
elif answers and all(answers):
    verdict = RealDisease
elif not any(answers):
    verdict = NoDisease
else:
    verdict = IntermediateDisease
print(verdict)