### Importing libs

In [1]:
import pickle

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import normalize
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

### Loading the data and shuffling the rows

In [2]:
# Load the CSV (path is relative to the working directory) and preview the first rows.
# NOTE(review): in the preview below, BMI equals weight / (2 * height in metres)
# (e.g. 62.0 / (2 * 1.68) ~= 18.45), not the usual weight / height**2 --
# confirm how this column was derived upstream.
df = pd.read_csv(filepath_or_buffer="new_cvd_data.csv")
df.head(n=5)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,BMI
0,50,0,168,62.0,110,80,1,1,0,0,1,0,18.45
1,55,1,156,85.0,140,90,3,1,0,0,1,1,27.24
2,51,1,165,64.0,130,70,3,1,0,0,0,1,19.39
3,48,0,169,82.0,150,100,1,1,0,0,1,1,24.26
4,47,1,156,56.0,100,60,1,1,0,0,0,0,17.95


In [3]:
# Shuffle every row (sampling 100% without replacement); the fixed seed makes
# the resulting order reproducible from a fresh kernel.
# Note: `df` is rebound here, so re-running this cell alone reshuffles the
# already-shuffled frame.
df = df.sample(frac=1, random_state=100)
df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,BMI
36199,52,1,170,65.0,120,80,1,1,0,0,1,0,19.12
60802,56,1,157,81.0,130,80,1,1,0,0,1,1,25.8
56240,45,0,172,87.0,140,90,1,1,1,1,1,0,25.29
27679,50,1,160,84.0,115,80,1,1,0,0,1,0,26.25
16663,44,0,166,99.0,140,90,1,1,0,0,0,1,29.82


### Splitting data into features (x) and target (y)

In [4]:
# Target: the `cardio` column. Features: every remaining column.
y = df["cardio"]
# drop() returns a new frame, so `df` itself keeps the `cardio` column.
x = df.drop(columns="cardio")
x.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,BMI
36199,52,1,170,65.0,120,80,1,1,0,0,1,19.12
60802,56,1,157,81.0,130,80,1,1,0,0,1,25.8
56240,45,0,172,87.0,140,90,1,1,1,1,1,25.29
27679,50,1,160,84.0,115,80,1,1,0,0,1,26.25
16663,44,0,166,99.0,140,90,1,1,0,0,0,29.82


### Splitting data into train & test sets and normalizing

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1812
)

# Rescale each sample (row) to unit L2 norm. The transform is stateless, so
# applying it separately to train and test does not leak information.
# NOTE(review): this is row-wise normalization, not per-feature scaling --
# confirm that is the intent. Any record passed to a model trained on
# `x_train` must be normalized the same way first.
x_train = normalize(x_train, norm="l2", axis=1)
x_test = normalize(x_test, norm="l2", axis=1)
x = normalize(x, norm="l2", axis=1)

### Building model

In [6]:
# L1-penalised logistic regression; liblinear is used as the solver for the
# L1 penalty, and the seed is fixed for reproducibility.
lr_model = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    max_iter=200,
    random_state=0,
)
lr_model.fit(x_train, y_train)

# Score on the held-out test split (displayed as the cell output).
lr_model.score(x_test, y_test)



0.7228744779765188

In [7]:
# Hand-written sample record; it is not referenced by the other cells shown here.
# NOTE(review): the keys 'cholestrol' and 'bmi' differ from the training
# columns 'cholesterol' and 'BMI' -- confirm which spelling the consumer of
# this dict expects before using it as model input.
# NOTE(review): 'bmi' here is weight / height_m**2 (166 / 1.35**2 ~= 91.08),
# while the BMI column in the df.head() output matches weight / (2 * height_m).
abc = {
    'age': 44,
    'gender': 1,
    'height': 135,
    'weight': 166.0,
    'ap_hi': 120,
    'ap_lo': 80,
    'cholestrol': 1,
    'gluc': 1,
    'smoke': 0,
    'alco': 0,
    'active': 0,
    'bmi': 91.08,
}

In [8]:
# Persist the fitted logistic-regression model to disk.
# The context manager guarantees the file handle is flushed and closed even if
# pickling raises; a bare open() passed straight to pickle.dump is never closed.
# Only unpickle this file from a trusted location: pickle.load can execute
# arbitrary code.
with open('logit_model.pkl', 'wb') as model_file:
    pickle.dump(lr_model, model_file)

### Comparing multiple models

In [10]:
# Candidate classifiers, all trained and scored on the same train/test split.
# (Their imports live in the import cell at the top of the notebook.)
# `lr` repeats the hyper-parameters of `lr_model` so logistic regression
# appears in the comparison table.
lr = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    max_iter=200,
    random_state=0,
)
dec = DecisionTreeClassifier(criterion="entropy", random_state=0)
ran = RandomForestClassifier(n_estimators=10, criterion="entropy", random_state=0)
knn = KNeighborsClassifier(n_neighbors=5)
svm = SVC(kernel="linear", random_state=0)
naive = GaussianNB()

models = {
    "Decision tree": dec,
    "Random forest": ran,
    "KNN": knn,
    "SVM": svm,
    "Naive bayes": naive,
    "Logistic Regression": lr,
}

# Fit each estimator on the training split and record its score on the test
# split; fit() returns the estimator itself, so the calls can be chained.
scores = {}
for name, model in models.items():
    scores[name] = model.fit(x_train, y_train).score(x_test, y_test)

# One row per model, best score first. Built without in-place mutation so
# re-running the cell always yields the same frame.
scores_frame = (
    pd.DataFrame(scores, index=["Accuracy Score"])
    .T
    .sort_values(by="Accuracy Score", ascending=False)
)
scores_frame



Unnamed: 0,Accuracy Score
Logistic Regression,0.722874
SVM,0.708691
Random forest,0.696793
Naive bayes,0.69569
KNN,0.687259
Decision tree,0.645654
