## Save Model

In [47]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import ComplementNB, BernoulliNB, MultinomialNB
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from utils import classification_evaluation
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
)
import random
import numpy as np
import joblib

In [36]:
# Load the dataset CSV into a DataFrame.

dataset_path = "../data/cleaned_dataset_2.csv"
dataframe = pd.read_csv(dataset_path)

In [37]:
# Summarise the loaded frame: row count, columns, non-null counts and dtypes.
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45304 entries, 0 to 45303
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            45304 non-null  int64  
 1   age           45304 non-null  int64  
 2   ap_hi         45304 non-null  int64  
 3   ap_lo         45304 non-null  int64  
 4   cholesterol   45304 non-null  int64  
 5   gluc          45304 non-null  int64  
 6   smoke         45304 non-null  int64  
 7   alco          45304 non-null  int64  
 8   active        45304 non-null  int64  
 9   cardio        45304 non-null  int64  
 10  age_years     45304 non-null  int64  
 11  bmi           45304 non-null  float64
 12  gender_women  45304 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 4.5 MB


In [38]:
# Hold out a small random sample of rows and remove them from the training frame.
# The generator is seeded so that "Restart & Run All" always selects the same rows.
# NOTE(review): a test_samples.csv written before this seed existed came from a
# different draw; regenerate it once so it matches the rows removed here.
SEED = 42
N_TEST_SAMPLES = 100

num_rows = dataframe.shape[0]

rng = np.random.default_rng(SEED)
random_rows = rng.choice(num_rows, N_TEST_SAMPLES, replace=False)  # row positions
test_samples = dataframe.iloc[random_rows]

# `random_rows` holds positions, whereas `drop` matches index labels. Dropping by
# the labels of the selected rows stays correct even if the index is not 0..n-1.
dataframe = dataframe.drop(index=test_samples.index)

# Guard against leakage: exactly the held-out rows must be gone.
assert len(dataframe) == num_rows - N_TEST_SAMPLES, "held-out rows were not removed exactly once"

In [39]:
# Re-check the frame after the sampled rows were dropped.
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45204 entries, 0 to 45303
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            45204 non-null  int64  
 1   age           45204 non-null  int64  
 2   ap_hi         45204 non-null  int64  
 3   ap_lo         45204 non-null  int64  
 4   cholesterol   45204 non-null  int64  
 5   gluc          45204 non-null  int64  
 6   smoke         45204 non-null  int64  
 7   alco          45204 non-null  int64  
 8   active        45204 non-null  int64  
 9   cardio        45204 non-null  int64  
 10  age_years     45204 non-null  int64  
 11  bmi           45204 non-null  float64
 12  gender_women  45204 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 4.8 MB


In [40]:
# Inspect the sampled rows that were set aside in `test_samples`.
test_samples.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 42748 to 4861
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            100 non-null    int64  
 1   age           100 non-null    int64  
 2   ap_hi         100 non-null    int64  
 3   ap_lo         100 non-null    int64  
 4   cholesterol   100 non-null    int64  
 5   gluc          100 non-null    int64  
 6   smoke         100 non-null    int64  
 7   alco          100 non-null    int64  
 8   active        100 non-null    int64  
 9   cardio        100 non-null    int64  
 10  age_years     100 non-null    int64  
 11  bmi           100 non-null    float64
 12  gender_women  100 non-null    int64  
dtypes: float64(1), int64(12)
memory usage: 10.9 KB


In [41]:
# Commented out to avoid saving the file over and over.
# test_samples.to_csv('../data/test_samples.csv', index=False)

----

## Train the best model on the dataset

- All my chosen models seemed to arrive at approximately the same result.
- They all returned 72 or 73 percent accuracy, about 79 percent on recall and around 72 percent on precision.
- I believe further research and hyperparameter tuning could improve the results, as could domain knowledge of the dataset itself for better data processing.
- Due to time constraints I am stopping here; the results are nonetheless much better than the ones I had before.
- That being said, Logistic Regression had a 1% advantage over the other models when I used StandardScaler to scale my data.
- So I will continue to use Logistic Regression for this project.

In [43]:
# Define the standard scaler. It is placed inside the pipeline below so that the
# exact scaling learned during training is stored with the model and re-applied
# automatically at prediction time.
scaler = StandardScaler()

# Build the final estimator: scaling followed by Logistic Regression with the best
# hyperparameters found earlier. Previously the scaler was created but never
# applied, so the model was fit on raw features; the "saga" solver is sensitive to
# feature scale, which makes the scaling step part of the model rather than optional.
regression_model = Pipeline(
    steps=[
        ("scaler", scaler),
        (
            "classifier",
            LogisticRegression(
                class_weight="balanced",
                l1_ratio=0.6326530612244897,
                max_iter=10000,
                multi_class="ovr",
                penalty="elasticnet",
                solver="saga",
            ),
        ),
    ]
)


In [45]:
# Separate the predictors from the target column.
# NOTE(review): the `id` column stays among the predictors here; it looks like a
# row identifier, so confirm whether it should really be a feature.

target_column = "cardio"
X = dataframe.drop(columns=[target_column])
y = dataframe[target_column]
X.shape, y.shape

((45204, 12), (45204,))

In [46]:
# Fit the model on every row of X / y. No train/test split is made in this cell;
# the only rows excluded are the ones dropped from `dataframe` earlier.

regression_model.fit(X,y)

In [49]:
# Persist the fitted model to disk with joblib (compressed).

model_path = '../models/model.pkl'
joblib.dump(regression_model, model_path, compress=True)

['../models/model.pkl']