In [1]:
import pandas as pd
import numpy as np
from pgmpy.models import BayesianModel
from pgmpy.estimators import MaximumLikelihoodEstimator, BayesianEstimator, ParameterEstimator, BicScore, ConstraintBasedEstimator
from pgmpy.inference import VariableElimination
from pgmpy.independencies import Independencies
%matplotlib inline
import matplotlib.pyplot as plt
from IPython.display import Image

In [2]:

'''
Attribute Information:
   -- Only 14 used
      -- 1. #3  (age), age in years
      -- 2. #4  (sex), sex (1 = male; 0 = female)
      -- 3. #9  (cp), chest pain type
        -- Value 1: typical angina
        -- Value 2: atypical angina
        -- Value 3: non-anginal pain
        -- Value 4: asymptomatic
      -- 4. #10 (trestbps), resting blood pressure (in mm Hg on admission to the hospital)
      -- 5. #12 (chol), serum cholestoral in mg/dl
      -- 6. #16 (fbs), (fasting blood sugar > 120 mg/dl)  (1 = true; 0 = false)
      -- 7. #19 (restecg), restecg: resting electrocardiographic results
        -- Value 0: normal
        -- Value 1: having ST-T wave abnormality (T wave inversions and/or ST 
                    elevation or depression of > 0.05 mV)
        -- Value 2: showing probable or definite left ventricular hypertrophy
                    by Estes' criteria
      -- 8. #32 (thalach), maximum heart rate achieved
      -- 9. #38 (exang), exercise induced angina (1 = yes; 0 = no)
      -- 10. #40 (oldpeak), = ST depression induced by exercise relative to rest
      -- 11. #41 (slope), the slope of the peak exercise ST segment
        -- Value 1: upsloping
        -- Value 2: flat
        -- Value 3: downsloping   
      -- 12. #44 (ca), number of major vessels (0-3) colored by flourosopy
      -- 13. #51 (thal), 3 = normal; 6 = fixed defect; 7 = reversable defect 
      -- 14. #58 (num), (the predicted attribute) num: diagnosis of heart disease (angiographic disease status)
        -- Value 0: < 50% diameter narrowing
        -- Value 1: > 50% diameter narrowing
        (in any major vessel: attributes 59 through 68 are vessels)



10. Class Distribution:
        Database:      0   1   2   3   4 Total
          Cleveland: 164  55  36  35  13   303
          Hungarian: 188  37  26  28  15   294
        Switzerland:   8  48  32  30   5   123
      Long Beach VA:  51  56  41  42  10   200


TODO: * Check if there is a -9 in the dataset because it is stated that this would be the marker for a missing value.
      * Check for occurences of ?
'''



column_names = [
    'age',
    'sex',
    'cp',
    'trestbps',
    'chol',
    'fbs',
    'restecg',
    'thalach',
    'exang',
    'oldpeak',
    'slope',
    'ca',
    'thal', 
    'num'
]

df = pd.read_csv("data/processed.cleveland.data", header = None, names = column_names)

# Removing non-numeric values
for column in column_names:
    # pandas.to_numeric return type depends on input. Series if Series, otherwise ndarray
    # If ‘coerce’, then invalid parsing will be set as NaN
    df = df[pd.to_numeric(df[column], errors='coerce').notnull()]

df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
5,56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,3.0,0
6,62.0,0.0,4.0,140.0,268.0,0.0,2.0,160.0,0.0,3.6,3.0,2.0,3.0,3
7,57.0,0.0,4.0,120.0,354.0,0.0,0.0,163.0,1.0,0.6,1.0,0.0,3.0,0
8,63.0,1.0,4.0,130.0,254.0,0.0,2.0,147.0,0.0,1.4,2.0,1.0,7.0,2
9,53.0,1.0,4.0,140.0,203.0,1.0,2.0,155.0,1.0,3.1,3.0,0.0,7.0,1


In [3]:
for i in range(0,len(df)):
    if not i==87 and not i==166 and not i==192 and not i==266 and not i==287:
        if df.loc[i, 'num'] >1:
            df.loc[i, 'num']=1
df.loc[299, 'num']=1
df.loc[300, 'num']=1

df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
5,56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,3.0,0
6,62.0,0.0,4.0,140.0,268.0,0.0,2.0,160.0,0.0,3.6,3.0,2.0,3.0,1
7,57.0,0.0,4.0,120.0,354.0,0.0,0.0,163.0,1.0,0.6,1.0,0.0,3.0,0
8,63.0,1.0,4.0,130.0,254.0,0.0,2.0,147.0,0.0,1.4,2.0,1.0,7.0,1
9,53.0,1.0,4.0,140.0,203.0,1.0,2.0,155.0,1.0,3.1,3.0,0.0,7.0,1


In [4]:
counter0=0
counter1=0
for i in range(0,len(df)):
    if not i==87 and not i==166 and not i==192 and not i==266 and not i==287:
        if df.loc[i, 'num'] == 1:
            counter1+=1
        elif df.loc[i, 'num'] == 0:
            counter0+=1

if df.loc[299, 'num'] == 1:
    counter1+=1
elif df.loc[299, 'num'] == 0:
    counter0+=1
    
if df.loc[300, 'num'] == 1:
    counter1+=1
elif df.loc[300, 'num'] == 0:
    counter0+=1

if df.loc[301, 'num'] == 1:
    counter1+=1
elif df.loc[301, 'num'] == 0:
    counter0+=1
print "counter0 = "+ str(counter0)
print "counter1 = "+ str(counter1)

counter0 = 160
counter1 = 135


In [5]:
print "min age = " + str(min(df.iloc[:,0]))
print "max age = "+ str(max(df.iloc[:,0]))
print "min restbps = " + str(min(df.iloc[:,3]))
print "max trestbps = " + str(max(df.iloc[:,3]))
print "min chol = " + str(min(df.iloc[:,4]))
print "max chol = "+ str(max(df.iloc[:,4]))
print "min thalach = " + str(min(df.iloc[:,7]))
print "max thalach = "+str(max(df.iloc[:,7]))
print "min oldpeak = " + str(min(df.iloc[:,9]))
print "max oldpeak = "+str(max(df.iloc[:,9]))

min age = 29.0
max age = 77.0
min restbps = 94.0
max trestbps = 200.0
min chol = 126.0
max chol = 564.0
min thalach = 71.0
max thalach = 202.0
min oldpeak = 0.0
max oldpeak = 6.2


In [6]:
#age
out, bins =pd.qcut(df.iloc[:, 0], 4, labels=["Age: (29,48]", "Age: (48,56]","Age: (56,61]", "Age: (61,77]"], retbins= True)
df.iloc[:, 0] = out
#trestbps
out, bins = pd.qcut(df.iloc[:, 3], 4, labels=["trestbps: (94,120]", "trestbps: (120,130]","trestbps:(130,140]", "trestbsp: (140,200]"], retbins= True)
df.iloc[:, 3] = out
#chol
out, bins = pd.qcut(df.iloc[:, 4], 4, labels=["chol: (126,211]", "chol: (211,243]","chol: (243,276]", "chol: (276,564]"], retbins= True)
df.iloc[:, 4] = out
#thalach
out, bins = pd.qcut(df.iloc[:, 7], 4, labels=["thalach: (71,133]", "thalach: (133,153]","thalach: (153,166]", "thalach: (166,202]"], retbins= True)
df.iloc[:, 7] = out
#oldpeak
out, bins = pd.qcut(df.iloc[:, 9], 3, labels=["oldpeak: (0,0.1]", "oldpeak: (0.1,1.4]","oldpeak: (1.4,6.2]"], retbins= True)
df.iloc[:, 9] = out

In [7]:
#conversion check
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,"Age: (61,77]",1.0,1.0,"trestbsp: (140,200]","chol: (211,243]",1.0,2.0,"thalach: (133,153]",0.0,"oldpeak: (1.4,6.2]",3.0,0.0,6.0,0
1,"Age: (61,77]",1.0,4.0,"trestbsp: (140,200]","chol: (276,564]",0.0,2.0,"thalach: (71,133]",1.0,"oldpeak: (1.4,6.2]",2.0,3.0,3.0,1
2,"Age: (61,77]",1.0,4.0,"trestbps: (94,120]","chol: (211,243]",0.0,2.0,"thalach: (71,133]",1.0,"oldpeak: (1.4,6.2]",2.0,2.0,7.0,1
3,"Age: (29,48]",1.0,3.0,"trestbps: (120,130]","chol: (243,276]",0.0,0.0,"thalach: (166,202]",0.0,"oldpeak: (1.4,6.2]",3.0,0.0,3.0,0
4,"Age: (29,48]",0.0,2.0,"trestbps: (120,130]","chol: (126,211]",0.0,2.0,"thalach: (166,202]",0.0,"oldpeak: (0.1,1.4]",1.0,0.0,3.0,0
5,"Age: (48,56]",1.0,2.0,"trestbps: (94,120]","chol: (211,243]",0.0,0.0,"thalach: (166,202]",0.0,"oldpeak: (0.1,1.4]",1.0,0.0,3.0,0
6,"Age: (61,77]",0.0,4.0,"trestbps:(130,140]","chol: (243,276]",0.0,2.0,"thalach: (153,166]",0.0,"oldpeak: (1.4,6.2]",3.0,2.0,3.0,1
7,"Age: (56,61]",0.0,4.0,"trestbps: (94,120]","chol: (276,564]",0.0,0.0,"thalach: (153,166]",1.0,"oldpeak: (0.1,1.4]",1.0,0.0,3.0,0
8,"Age: (61,77]",1.0,4.0,"trestbps: (120,130]","chol: (243,276]",0.0,2.0,"thalach: (133,153]",0.0,"oldpeak: (0.1,1.4]",2.0,1.0,7.0,1
9,"Age: (48,56]",1.0,4.0,"trestbps:(130,140]","chol: (126,211]",1.0,2.0,"thalach: (153,166]",1.0,"oldpeak: (1.4,6.2]",3.0,0.0,7.0,1


In [8]:
#Model 1 is a 4 layer structure and the initial model
model1 = BayesianModel([('age', 'cp'),('age', 'trestbps'),('age', 'chol'), ('age', 'fbs'),('sex', 'cp') ,('sex', 'trestbps'), ('sex', 'chol'), ('sex','fbs'), ('cp', 'thalach'), ('cp', 'exang'), ('cp', 'oldpeak'), ('cp', 'slope'), ('cp', 'ca'),('cp', 'thal'),('cp', 'restecg'),('trestbps', 'restecg'),('trestbps', 'thalach'), ('trestbps','exang'), ('trestbps', 'oldpeak'), ('trestbps', 'slope'), ('trestbps', 'ca'), ('trestbps', 'thal'), ('chol', 'thalach'), ('chol', 'exang'), ('chol', 'oldpeak'), ('chol', 'slope'), ('chol', 'ca'), ('chol', 'thal'),('chol', 'restecg') ,('fbs', 'restecg'),('fbs', 'thalach'), ('fbs', 'exang'), ('fbs', 'oldpeak'), ('fbs', 'slope'), ('fbs', 'ca'), ('fbs', 'thal'),('thalach', 'num'), ('exang', 'num'), ('oldpeak', 'num'), ('slope', 'num'), ('ca', 'num'), ('thal', 'num'), ('restecg', 'num')])

model_from_hillclimbing = BayesianModel([('slope', 'oldpeak'), ('slope', 'thalach'), ('num', 'slope'), ('num', 'ca'), ('num', 'age'), ('num', 'thal'), ('cp', 'num'), ('cp', 'exang'), ('thal', 'sex')])

In [9]:
est = ConstraintBasedEstimator(df)
skel, sep_sets = est.estimate_skeleton()

TypeError: object of type 'dictionary-keyiterator' has no len()