In [2]:
# Import all the tools we need

# Regular EDA (exploratory data analysis) and plotting libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

%matplotlib inline 

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [7]:
# Load the dataset from heart.csv; the path is relative to the kernel's working directory.
df = pd.read_csv("heart.csv")

In [8]:
# Separate the label column ("target") from the feature columns
y = df["target"]

X = df.drop(columns="target")

In [9]:
# Split data into train and test sets (80% train / 20% test)
# The global seed is kept so code that draws from NumPy's global RNG stays seeded.
np.random.seed(42)

# An explicit random_state pins the shuffle to this call, so the split no longer
# depends on whatever state the global NumPy RNG is in when the cell is (re)run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Scaling the data
# The scaler is fitted on the training split only, so no test-set statistics
# leak into the features; the same fitted transform is then applied to both splits.
scalar = StandardScaler().fit(X_train)

# NOTE: these assignments overwrite the unscaled splits; from here on
# X_train / X_test hold the standardized values returned by transform().
X_train = scalar.transform(X_train)
X_test = scalar.transform(X_test)

In [11]:
# Baseline classifiers to compare, keyed by display name.
# Every estimator is created with its default hyperparameters.
models = dict([
    ("Logistic Regression", LogisticRegression()),
    ("KNN", KNeighborsClassifier()),
    ("Random Forest", RandomForestClassifier()),
])

# Create a function to fit and score models

def fit_and_score(models, X_train, X_test, y_train, y_test):
    """
    Fit each model on the training split and score it on the test split.

    models : dict mapping a display name to an object exposing fit(X, y) and score(X, y)
    X_train, y_train : features and labels passed to each model's fit()
    X_test, y_test : features and labels passed to each model's score()

    Returns a dict mapping each name to the value returned by that model's score().
    Side effects: the objects in `models` are fitted in place, and NumPy's global
    seed is reset to 42.
    """
    # Reseed so models that draw from NumPy's global RNG behave the same on every call
    np.random.seed(42)
    results = {}
    for label in models:
        estimator = models[label]
        estimator.fit(X_train, y_train)
        results[label] = estimator.score(X_test, y_test)
    return results

In [12]:
# Fit every baseline model on the training split and collect its score on the test split.
model_scores = fit_and_score(models,X_train,X_test,y_train,y_test)
# Bare name as the last expression so the notebook displays the dict.
model_scores

{'Logistic Regression': 0.7951219512195122,
 'KNN': 0.8341463414634146,
 'Random Forest': 0.9853658536585366}

In [13]:
# Let's tune KNN

# Candidate values for n_neighbors
neighbors = range(1,21)

# One estimator is reused for the whole sweep; only n_neighbors changes between fits
knn = KNeighborsClassifier()

# Scores on each split, one entry per candidate in `neighbors`
train_scores = []
test_scores = []

# NOTE(review): the next cell picks n_neighbors from test_scores, so the test
# split doubles as a validation set and later test metrics are optimistic.
for k in neighbors:
    # set_params() and fit() both return the estimator, so they can be chained
    knn.set_params(n_neighbors=k).fit(X_train, y_train)
    train_scores.append(knn.score(X_train, y_train))
    test_scores.append(knn.score(X_test, y_test))


In [14]:
# Refit KNN with the candidate that scored best on the test split.
# Indexing into `neighbors` keeps this correct if the candidate range is ever
# changed (different start or step); argmax returns the first maximum on ties.
# NOTE(review): the choice is made on the test split, so the test metrics
# reported afterwards are optimistic.
best_n_neighbors = neighbors[int(np.argmax(test_scores))]
knn.set_params(n_neighbors=best_n_neighbors)
knn.fit(X_train, y_train)

In [17]:
# Predict labels for the standardized test features with the refitted KNN.
y_preds = knn.predict(X_test)

In [18]:
# Per-class precision / recall / F1 on the test split; the report is a plain
# string, hence print().
# NOTE(review): n_neighbors was chosen using this same test split, so these
# numbers are likely optimistic.
print(classification_report(y_test,y_preds))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99       102
           1       1.00      0.97      0.99       103

    accuracy                           0.99       205
   macro avg       0.99      0.99      0.99       205
weighted avg       0.99      0.99      0.99       205



In [22]:
# Accuracy of the refitted KNN on the test split (the split also used to pick n_neighbors).
knn.score(X_test,y_test)

0.9853658536585366

In [19]:
# Cross-validated accuracy
# KNN is distance-based and its n_neighbors was tuned on standardized features,
# so the raw X must be standardized too. Doing it inside a pipeline makes each
# fold fit the scaler on its own training part only. cross_val_score works on a
# clone of the estimator, so the fitted `knn` itself is left untouched.
cv_acc = cross_val_score(make_pipeline(StandardScaler(), knn),
                         X,
                         y,
                         cv=5,
                         scoring="accuracy").mean()

In [20]:
# Cross-validated precision
# KNN is distance-based and its n_neighbors was tuned on standardized features,
# so the raw X must be standardized too. Doing it inside a pipeline makes each
# fold fit the scaler on its own training part only. cross_val_score works on a
# clone of the estimator, so the fitted `knn` itself is left untouched.
cv_precision = cross_val_score(make_pipeline(StandardScaler(), knn),
                               X,
                               y,
                               cv=5,
                               scoring="precision").mean()

In [21]:
# Cross-validated recall
# KNN is distance-based and its n_neighbors was tuned on standardized features,
# so the raw X must be standardized too. Doing it inside a pipeline makes each
# fold fit the scaler on its own training part only. cross_val_score works on a
# clone of the estimator, so the fitted `knn` itself is left untouched.
cv_recall = cross_val_score(make_pipeline(StandardScaler(), knn),
                            X,
                            y,
                            cv=5,
                            scoring="recall").mean()

In [22]:
# Cross-validated f1-score
# KNN is distance-based and its n_neighbors was tuned on standardized features,
# so the raw X must be standardized too. Doing it inside a pipeline makes each
# fold fit the scaler on its own training part only. cross_val_score works on a
# clone of the estimator, so the fitted `knn` itself is left untouched.
cv_f1 = cross_val_score(make_pipeline(StandardScaler(), knn),
                        X,
                        y,
                        cv=5,
                        scoring="f1").mean()

In [23]:
# Train a logistic-regression classifier on the standardized training split
# NOTE(review): this C value looks like the result of a hyperparameter search
# that is not shown in this notebook - confirm where it came from.
clf = LogisticRegression(solver='liblinear',
                         C=0.20433597178569418)
# Trailing semicolon suppresses the estimator repr in the cell output
clf.fit(X_train, y_train);

In [24]:
# Inspect the fitted coefficients: one value per feature column.
# The model was fitted on standardized features, so each value is expressed
# per standard deviation of its feature.
clf.coef_

array([[-0.02804057, -0.7582299 ,  0.81995224, -0.2878802 , -0.39242627,
        -0.05166252,  0.132902  ,  0.59462252, -0.41030036, -0.70969781,
         0.32732507, -0.78450042, -0.61866499]])

In [25]:
# Match coefficients to feature columns

# X holds exactly the feature columns the model was trained on (df without
# "target"), so the pairing no longer relies on "target" being df's last column.
assert len(X.columns) == clf.coef_.shape[1], "feature names and coefficients are out of sync"
feature_dict = dict(zip(X.columns, clf.coef_[0]))
feature_dict

{'age': -0.028040567757253672,
 'sex': -0.7582298959063809,
 'cp': 0.819952243370463,
 'trestbps': -0.28788020076644266,
 'chol': -0.3924262743547424,
 'fbs': -0.051662521587816146,
 'restecg': 0.13290200370443309,
 'thalach': 0.5946225238370315,
 'exang': -0.4103003642159617,
 'oldpeak': -0.709697813142871,
 'slope': 0.32732507374660147,
 'ca': -0.7845004209027502,
 'thal': -0.6186649893680723}

In [26]:
# Count the rows for every (sex, target) combination.
pd.crosstab(df.sex,df.target)

target,0,1
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
0,86,226
1,413,300


In [27]:
# Save the model
# The KNN was fitted on standardized features, so pickling it alone would leave
# whoever loads the file without the matching scaler. Bundle the already-fitted
# scaler and classifier (nothing is refitted here) so the unpickled object takes
# raw feature rows in predict().
model_filename = 'model.pkl'
deployable_model = make_pipeline(scalar, knn)
with open(model_filename, 'wb') as file:
    pickle.dump(deployable_model, file)
print('Model Saved Succesfully!')

Model Saved Succesfully!
