https://www.kaggle.com/sulianova/cardiovascular-disease-dataset/download

Features:

Age | Objective Feature | age | int (days) |

Height | Objective Feature | height | int (cm) |

Weight | Objective Feature | weight | float (kg) |

Gender | Objective Feature | gender | categorical code |

Systolic blood pressure | Examination Feature | ap_hi | int |

Diastolic blood pressure | Examination Feature | ap_lo | int |

Cholesterol | Examination Feature | cholesterol | 1: normal, 2: above normal, 3: well above normal |

Glucose | Examination Feature | gluc | 1: normal, 2: above normal, 3: well above normal |

Smoking | Subjective Feature | smoke | binary |

Alcohol intake | Subjective Feature | alco | binary |

Physical activity | Subjective Feature | active | binary |

Presence or absence of cardiovascular disease | Target Variable | cardio | binary |


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.dummy import DummyClassifier        
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [None]:
# load the dataset from the Excel file (path is relative to the working directory)
data_path = "Desktop/Python_ML/cardio.xlsx"
df = pd.read_excel(data_path)
# preview three random rows
df.sample(3)

In [None]:
# check the number of rows and columns (df.shape is a (rows, columns) tuple)
df.shape

In [None]:
# inspect the column dtypes, the non-null counts (to spot missing values)
# and the memory usage of the frame
df.info()

In [None]:
# optimize memory usage: store the binary / small-level columns as 8-bit integers
small_int_columns = ['smoke', 'alco', 'active', 'cardio', 'cholesterol', 'gluc']
for column in small_int_columns:
    df[column] = df[column].astype('int8')
# re-check dtypes and memory usage after the downcast
df.info()

In [None]:
# create a new variable: age expressed in years (365-day years), rounded to 2 decimals
age_in_years = df['age'] / 365
df['age_years'] = age_in_years.round(2)
df.sample(5)

In [None]:
# use the 'id' column as the row index
df = df.set_index('id')

In [None]:
# rename columns to more descriptive names.
# The mapping is by column name rather than a positional list, so labels stay
# correct even if the column order changes or a column is added earlier in the
# notebook. Columns not listed here (gender, height, weight, cholesterol,
# active, age_years) keep their names. Re-running the cell is a no-op.
df.rename(
    columns={
        'age': 'age_days',
        'ap_hi': 'pressure_sys',
        'ap_lo': 'pressure_dias',
        'gluc': 'glucose',
        'smoke': 'smoking',
        'alco': 'alcohol',
        'cardio': 'cardio_target',
    },
    inplace=True,
)
df.head()

In [None]:
# check if there are any strange (negative) values in the pressure columns
negative_sys = df['pressure_sys'] < 0
negative_dias = df['pressure_dias'] < 0
pressure_neg = df[negative_sys | negative_dias]
# number of affected observations
len(pressure_neg.index)

In [None]:
# delete observations with unrealistic (negative) pressure values
invalid_pressure = (df['pressure_sys'] < 0) | (df['pressure_dias'] < 0)
df.drop(index=df[invalid_pressure].index, inplace=True)
# confirm the new number of rows
df.shape

In [None]:
# check if there are categorical (object-dtype) variables which need, for example, encoding.
# The dtype is given as the string 'object' because the `np.object` alias was
# removed from NumPy (1.24+) and raises AttributeError there.
cat_feats = df.select_dtypes(include=['object']).columns
cat_feats

In [None]:
#correlation - looking for patterns, inspo for feature engineering, plots
plt.rcParams['figure.figsize'] = (20, 10)
# Correlate numeric columns only: once the string column 'height_bins' has been
# added further down, re-running this cell with a plain df.corr() fails on
# pandas >= 2.0, which no longer drops non-numeric columns silently.
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, vmax=1., vmin=-1., annot=True, linewidths=.8, cmap="Greens");

In [None]:
# histogram of the age feature (in years)
age_ax = df['age_years'].plot(kind='hist', figsize=(8, 5))

age_ax.set_title('Age distribution')
age_ax.set_ylabel('Number of patients')
age_ax.set_xlabel('Age')

plt.show()

In [None]:
#create bins for easier interpretation
def height_bins(row):
    """Return the label of the height bin that ``row['height']`` falls into.

    Bins are half-open on the left and closed on the right, e.g. a height of
    exactly 90 is labelled '0-90' and 91 is labelled '90-125'.

    Parameters
    ----------
    row : mapping-like
        Anything indexable with the key 'height' (a DataFrame row, a dict).

    Returns
    -------
    str
        One of '0-90', '90-125', '125-160', '160-195', '195-230', '230-250'.
        '230-250' is also the fallback for every value that matches no other
        bin (non-positive heights, heights above 250, NaN).
    """
    height = row['height']
    if 0 < height <= 90:
        return '0-90'
    if 90 < height <= 125:
        # label fixed: this range was previously mislabelled as '90-80'
        return '90-125'
    if 125 < height <= 160:
        return '125-160'
    if 160 < height <= 195:
        return '160-195'
    if 195 < height <= 230:
        return '195-230'
    return '230-250'

# label every observation with its height bin (height_bins is applied row by row)
height_only = df[['height']]
df['height_bins'] = height_only.apply(height_bins, axis='columns')

df.sample(3)

In [None]:
# bar plot of cardio_target per height bin, split by gender
# (author's observation: quite strong corr)
plt.figure(figsize=(15, 5))
sns.barplot(
    data=df,
    x='height_bins',
    y="cardio_target",
    hue='gender',
    color='green',
)
plt.xticks(rotation=90);

In [None]:
# bar plot of cardio_target per glucose level, split by cholesterol level
# (author's observation: quite strong corr)
plt.figure(figsize=(15, 5))
sns.barplot(
    data=df,
    x="glucose",
    y="cardio_target",
    hue='cholesterol',
    color='green',
)
plt.xticks(rotation=90);

In [None]:
# find the mean value of the chosen features for observations grouped by height bin.
# The columns are selected with a list (double brackets): selecting several
# columns from a groupby with a bare tuple of names was removed in pandas 2.0.
df.groupby('height_bins')[['alcohol', 'glucose', 'pressure_sys', 'pressure_dias']].mean().round(decimals=2)

In [None]:
#define function in order to reuse it when building different models
def train_and_predict_model(X_train, X_test, y_train, y_test, model, success_metric=accuracy_score):
    """Fit ``model`` on the training split and score it on the test split.

    The model is fitted on ``(X_train, y_train)`` and used to predict
    ``X_test``. The number of predictions per predicted label is printed,
    and the score computed by ``success_metric(y_test, predictions)`` is
    returned.

    Parameters
    ----------
    X_train, X_test : feature data passed to ``model.fit`` / ``model.predict``.
    y_train, y_test : target values for the two splits.
    model : object exposing ``fit(X, y)`` and ``predict(X)``.
    success_metric : callable taking ``(y_true, y_pred)``; defaults to
        ``accuracy_score``.

    Returns
    -------
    Whatever ``success_metric`` returns for the test split.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # show how often each label was predicted
    print("Distribution:")
    label_counts = pd.Series(predictions).value_counts()
    print(label_counts)

    score = success_metric(y_test, predictions)
    return score

In [None]:
# define the model input and split the observations into train and test sets
feature_columns = ['pressure_sys', 'active', 'weight', 'smoking', 'cholesterol', 'gender']
X = df[feature_columns].values
y = df['cardio_target'].values
# 30% of the observations are held out for testing; random_state is fixed so the
# split (and therefore every model score below) is reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print("Train:", X_train.shape, y_train.shape)
print("Test:", X_test.shape, y_test.shape)

In [None]:
# first model: DummyClassifier with default settings, scored on the test split
train_and_predict_model(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    model=DummyClassifier(),
)

In [None]:
# second model: logistic regression (lbfgs solver, up to 300 iterations), scored on the test split
train_and_predict_model(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    model=LogisticRegression(solver='lbfgs', max_iter=300),
)

In [None]:
# third model: decision tree with default settings, scored on the test split
train_and_predict_model(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    model=DecisionTreeClassifier(),
)