# Importing all necessary dependencies

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn import cross_validation
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn import metrics



# Getting the data

In [2]:
data = pd.read_csv("/home/sakshi/Desktop/processed.cleveland.csv", names = ['age','sex','cp','BP','chol','fbs','ecg','maxhr','eiang','eist','slope','vessels','thal','diagnosis'], na_values=["?"])

In [3]:
data

Unnamed: 0,age,sex,cp,BP,chol,fbs,ecg,maxhr,eiang,eist,slope,vessels,thal,diagnosis
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
5,56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,3.0,0
6,62.0,0.0,4.0,140.0,268.0,0.0,2.0,160.0,0.0,3.6,3.0,2.0,3.0,3
7,57.0,0.0,4.0,120.0,354.0,0.0,0.0,163.0,1.0,0.6,1.0,0.0,3.0,0
8,63.0,1.0,4.0,130.0,254.0,0.0,2.0,147.0,0.0,1.4,2.0,1.0,7.0,2
9,53.0,1.0,4.0,140.0,203.0,1.0,2.0,155.0,1.0,3.1,3.0,0.0,7.0,1


# Checking for the count among all columns

In [4]:
data.count()

age          303
sex          303
cp           303
BP           303
chol         303
fbs          303
ecg          303
maxhr        303
eiang        303
eist         303
slope        303
vessels      299
thal         301
diagnosis    303
dtype: int64

We can clearly see that 'vessels' and 'thal' columns have NaN values

# Getting the rows with Null values in 'vessels'

In [5]:
data[data.vessels.isnull()]

Unnamed: 0,age,sex,cp,BP,chol,fbs,ecg,maxhr,eiang,eist,slope,vessels,thal,diagnosis
166,52.0,1.0,3.0,138.0,223.0,0.0,0.0,169.0,0.0,0.0,1.0,,3.0,0
192,43.0,1.0,4.0,132.0,247.0,1.0,2.0,143.0,1.0,0.1,2.0,,7.0,1
287,58.0,1.0,2.0,125.0,220.0,0.0,0.0,144.0,0.0,0.4,2.0,,7.0,0
302,38.0,1.0,3.0,138.0,175.0,0.0,0.0,173.0,0.0,0.0,1.0,,3.0,0


# Getting the rows with Null values in 'thal'

In [6]:
data[data.thal.isnull()]

Unnamed: 0,age,sex,cp,BP,chol,fbs,ecg,maxhr,eiang,eist,slope,vessels,thal,diagnosis
87,53.0,0.0,3.0,128.0,216.0,0.0,2.0,115.0,0.0,0.0,1.0,0.0,,0
266,52.0,1.0,4.0,128.0,204.0,1.0,0.0,156.0,1.0,1.0,2.0,0.0,,2


Now that we know that we have Null values in thal and vessels columns so, we can replace these values with their mean values in their respective classes

# Getting the classes with missing values

In [7]:
data.groupby('diagnosis').count()

Unnamed: 0_level_0,age,sex,cp,BP,chol,fbs,ecg,maxhr,eiang,eist,slope,vessels,thal
diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,164,164,164,164,164,164,164,164,164,164,164,161,163
1,55,55,55,55,55,55,55,55,55,55,55,54,55
2,36,36,36,36,36,36,36,36,36,36,36,36,35
3,35,35,35,35,35,35,35,35,35,35,35,35,35
4,13,13,13,13,13,13,13,13,13,13,13,13,13


We can see we have 3 missing values of 'vessels' in class0 and 1 in class1, similarly 'thal' has 1 missing value in class0 and 1 in class2. We will replace them with their means in respective classes.

In [8]:
# Considering class0, class1, class2 only, as only these classes have NaN values
class0 = data[data.diagnosis==0]
class1 = data[data.diagnosis==1]
class2 = data[data.diagnosis==2]

# Handling NaN values in 'vessels' columns

In [9]:
class0Mean_vessels = data[data.diagnosis==0].vessels.mean()
class0Mean_vessels

0.2732919254658385

In [10]:
class1Mean_vessels = data[data.diagnosis==1].vessels.mean()
class1Mean_vessels

0.7407407407407407

We see for class0 we have mean as 0.268293 and for class1 it is 0.727273, we can get their rounded off values as vessels stores the record for number of major vessels colored by fluoroscopy whose value lies between [0, 3]

In [11]:
vesseslsVal0_vessels = round(class0Mean_vessels)
vesseslsVal0_vessels

0.0

In [12]:
vesseslsVal1_vessels = round(class1Mean_vessels)
vesseslsVal1_vessels

1.0

We get rounded values as 0 and 1 for class0 and class1 respectively, now we got to update our dataset with these values

In [13]:
# Updating class0 and class1 with the new rounded off mean calculated
class0.vessels.fillna(vesseslsVal0_vessels, axis=0, inplace=True)
class1.vessels.fillna(vesseslsVal1_vessels, axis=0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


# Handling NaN values in 'thal' columns

In [14]:
class0Mean_thal = data[data.diagnosis==0].thal.mean()
class0Mean_thal

3.7975460122699385

In [15]:
class2Mean_thal = data[data.diagnosis==2].thal.mean()
class2Mean_thal

6.0285714285714285

We see for class0 we have mean as 3.797546 and for class2 it is 6.028571, we can get their rounded off values, as  variable "thal" refers to a Thalium Stress Test and is represented by the integers 3, 6, and 7(3=Normal; 6=Fixed Defect; and 7=Reversible Defect).

In [16]:
thalVal0_thal = int(class0Mean_thal)
thalVal0_thal

3

In [17]:
thalVal2_thal = round(class2Mean_thal)
thalVal2_thal

6.0

We get rounded values as 3 and 6 for class0 and class2 respectively, now we got to update our dataset with these values

In [18]:
# Updating class0 and class1 with the new rounded off mean calculated
class0.thal.fillna(thalVal0_thal, axis=0, inplace=True)
class2.thal.fillna(thalVal2_thal, axis=0, inplace=True)

# Removing classes with NaN values from dataset

In [19]:
data = data[data.diagnosis != 0] 
data = data[data.diagnosis != 1] 
data = data[data.diagnosis != 2]

# Changing the target values that are 2, 3 or 4 to 1
data.diagnosis[data.diagnosis==2] = 1
data.diagnosis[data.diagnosis==3] = 1
data.diagnosis[data.diagnosis==4] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


# Adding processed classes values to our dataset

In [20]:
data = pd.concat([class0, class1, class2, data])

In [21]:
data

Unnamed: 0,age,sex,cp,BP,chol,fbs,ecg,maxhr,eiang,eist,slope,vessels,thal,diagnosis
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
5,56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,3.0,0
7,57.0,0.0,4.0,120.0,354.0,0.0,0.0,163.0,1.0,0.6,1.0,0.0,3.0,0
10,57.0,1.0,4.0,140.0,192.0,0.0,0.0,148.0,0.0,0.4,2.0,0.0,6.0,0
11,56.0,0.0,2.0,140.0,294.0,0.0,2.0,153.0,0.0,1.3,2.0,0.0,3.0,0
13,44.0,1.0,2.0,120.0,263.0,0.0,0.0,173.0,0.0,0.0,1.0,0.0,7.0,0
14,52.0,1.0,3.0,172.0,199.0,1.0,0.0,162.0,0.0,0.5,1.0,0.0,7.0,0
15,57.0,1.0,3.0,150.0,168.0,0.0,0.0,174.0,0.0,1.6,1.0,0.0,3.0,0


In [22]:
# Sorting the data by it's index
data = data.sort_index()

# Dividing data into training and testing portions

In [23]:
#from sklearn.cross_validation import train_test_split
x_data = data[['age', 'sex', 'cp', 'BP', 'chol', 'fbs', 'ecg', 'maxhr', 'eiang', 'eist',  'slope', 'vessels', 'thal']]
y_data = data['diagnosis']

x_train, x_test, y_train, y_test = cross_validation.train_test_split(x_data, y_data, test_size=0.2)
x_train.shape, x_test.shape

((242, 13), (61, 13))

# Creating the classifier

In [24]:
#Normalizing the data
min_max_scaler = preprocessing.MinMaxScaler().fit(x_train)
x_train_norm = min_max_scaler.transform(x_train)
x_test_norm = min_max_scaler.transform(x_test)

In [25]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

In [26]:
clf.fit(x_train_norm, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [27]:
y = clf.predict(x_test_norm)

In [28]:
metrics.accuracy_score(y_test, y)

0.72131147540983609