In [3]:
# Dataset from http://people.dbmi.columbia.edu/~friedma/Projects/DiseaseSymptomKB/index.html
# Import Dependencies
import csv
import pandas as pd
import numpy as np
from collections import defaultdict
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
df = pd.read_excel('raw_data.xlsx')

In [5]:
df.head()

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,,,UMLS:C0392680_shortness of breath
2,,,UMLS:C0012833_dizziness
3,,,UMLS:C0004093_asthenia
4,,,UMLS:C0085639_fall


In [6]:
data = df.fillna(method='ffill')


In [7]:
data.head()


Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0392680_shortness of breath
2,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0012833_dizziness
3,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0004093_asthenia
4,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0085639_fall


In [9]:
list(data)


['Disease', 'Count of Disease Occurrence', 'Symptom']

In [10]:
def process_data(data):
    data_list = []
    data_name = data.replace('^','_').split('_')
    n = 1
    for names in data_name:
        if (n % 2 == 0):
            data_list.append(names)
        n += 1
    return data_list

In [11]:
disease_list = []
disease_symptom_dict = defaultdict(list)
disease_symptom_count = {}
count = 0

In [12]:
for idx, row in data.iterrows():
    
    # Get the Disease Names
    if (row['Disease'] !="\xc2\xa0") and (row['Disease'] != ""):
        disease = row['Disease']
        disease_list = process_data(data=disease)
        count = row['Count of Disease Occurrence']

    # Get the Symptoms Corresponding to Diseases
    if (row['Symptom'] !="\xc2\xa0") and (row['Symptom'] != ""):
        symptom = row['Symptom']
        symptom_list = process_data(data=symptom)
        for d in disease_list:
            for s in symptom_list:
                disease_symptom_dict[d].append(s)
            disease_symptom_count[d] = count

In [13]:
disease_symptom_dict


defaultdict(list,
            {"Alzheimer's disease": ['drool',
              'agitation',
              'nightmare',
              'rhonchus',
              'consciousness clear',
              'pin-point pupils',
              'bedridden',
              'bedridden',
              'frail',
              'tremor resting',
              'hyperkalemia',
              'facial paresis',
              'groggy',
              'muscle twitch',
              'wheelchair bound',
              'tremor',
              'cough',
              'fever'],
             'HIV': ['fever',
              'night sweat',
              'spontaneous rupture of membranes',
              'cough',
              '',
              'decreased body weight',
              'chill',
              'diarrhea',
              'pleuritic pain',
              'patient non compliance',
              'tachypnea',
              'productive cough',
              'muscle hypotonia',
              'hypotonic',
              'feeling

In [14]:
disease_symptom_count


{"Alzheimer's disease": 101.0,
 'HIV': 350.0,
 'Pneumocystis\xa0carinii\xa0pneumonia': 113.0,
 'accident\xa0cerebrovascular': 885.0,
 'acquired\xa0immuno-deficiency syndrome': 350.0,
 'adenocarcinoma': 166.0,
 'adhesion': 57.0,
 'affect labile': 45.0,
 'anemia': 544.0,
 'anxiety state': 390.0,
 'aphasia': 76.0,
 'arthritis': 179.0,
 'asthma': 835.0,
 'bacteremia': 142.0,
 'benign prostatic hypertrophy': 192.0,
 'biliary calculus': 61.0,
 'bipolar disorder': 241.0,
 'bronchitis': 172.0,
 'candidiasis': 99.0,
 'carcinoma': 269.0,
 'carcinoma breast': 152.0,
 'carcinoma colon': 94.0,
 'carcinoma of lung': 86.0,
 'carcinoma prostate': 163.0,
 'cardiomyopathy': 283.0,
 'cellulitis': 341.0,
 'cholecystitis': 66.0,
 'cholelithiasis': 61.0,
 'chronic alcoholic intoxication': 70.0,
 'chronic kidney failure': 280.0,
 'chronic obstructive airway disease': 524.0,
 'cirrhosis': 218.0,
 'colitis': 114.0,
 'confusion': 408.0,
 'coronary arteriosclerosis': 1284.0,
 'coronary heart disease': 1284.0,
 '

In [15]:
df1 = pd.DataFrame(list(disease_symptom_dict.items()), columns=['Disease','Symptom'])


In [16]:
df1.head()


Unnamed: 0,Disease,Symptom
0,failure heart congestive,"[shortness of breath, orthopnea, jugular venou..."
1,anxiety state,"[worry, feeling suicidal, suicidal, sleeplessn..."
2,incontinence,"[paraparesis, seizure, asthenia, urge incontin..."
3,emphysema pulmonary,"[behavior showing increased motor activity, sc..."
4,melanoma,"[mass of body structure, paraparesis, fever, g..."


In [17]:
for vals in disease_symptom_count.items():
    print(vals[1])

963.0
390.0
165.0
80.0
87.0
76.0
325.0
1337.0
297.0
147.0
82.0
350.0
354.0
66.0
290.0
68.0
57.0
92.0
56.0
354.0
445.0
140.0
103.0
92.0
93.0
1284.0
186.0
269.0
94.0
61.0
114.0
192.0
3363.0
68.0
67.0
1029.0
114.0
114.0
268.0
71.0
145.0
94.0
171.0
241.0
544.0
86.0
113.0
99.0
99.0
144.0
350.0
1284.0
163.0
105.0
68.0
95.0
135.0
119.0
140.0
226.0
169.0
208.0
122.0
76.0
104.0
283.0
311.0
67.0
164.0
133.0
126.0
166.0
341.0
408.0
163.0
128.0
138.0
630.0
218.0
68.0
123.0
140.0
294.0
186.0
61.0
165.0
179.0
311.0
108.0
71.0
101.0
90.0
228.0
87.0
68.0
56.0
84.0
172.0
96.0
759.0
597.0
61.0
160.0
685.0
524.0
1337.0
504.0
99.0
124.0
61.0
269.0
280.0
90.0
165.0
885.0
161.0
247.0
310.0
350.0
96.0
86.0
70.0
142.0
835.0
158.0
267.0
168.0
74.0
398.0
1421.0
42.0
152.0
405.0
101.0
165.0
76.0
311.0
56.0
297.0
45.0
143.0
80.0
152.0
71.0
111.0
85.0
142.0
94.0
61.0


In [18]:
df1.head()


Unnamed: 0,Disease,Symptom
0,failure heart congestive,"[shortness of breath, orthopnea, jugular venou..."
1,anxiety state,"[worry, feeling suicidal, suicidal, sleeplessn..."
2,incontinence,"[paraparesis, seizure, asthenia, urge incontin..."
3,emphysema pulmonary,"[behavior showing increased motor activity, sc..."
4,melanoma,"[mass of body structure, paraparesis, fever, g..."
