In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [7]:
df = pd.read_csv("../Labb/Disease_prediction/cardio_train.csv", sep=";")

df["age"] = round(df["age"]/365).astype(int) 

df['BMI'] = df['weight'] / (df['height']/100)**2

# remove outliers that are below 15 and above 50
df = df[df['BMI'] > 15]
df = df[df['BMI'] < 50]

df['BMI'].min(), df['BMI'].max()

df['BMI_category'] = df['BMI'].apply(lambda x: 1 if x < 25 else 2 if x < 30 else 3)

# removing outliers
# set the limits for systolic blood pressure to 90-200 and for diastolic blood pressure to 60-145
# From what I can find, you have hypotension (low blood pressure) if you go below 90/60.
df = df[df['ap_hi'] > 90]
df = df[df['ap_hi'] < 200] # set it to 200 since the next highest recorded value in the dataset is 197 for systolic blood pressure
# and systolic pressure above 180 is potentially life-threatening, which means not alot of people will have a systolic blood pressure above 200

# diastolic blood pressure limits
# From what I can find, you have hypotension (low blood pressure) if you go below 60 diastolic blood pressure.
df = df[df['ap_lo'] > 60]
df = df[df['ap_lo'] < 145] # set the limit to 145 since the highest recorded value in the dataset is 140 for diastolic blood pressure


df = df[df['ap_hi'] > df['ap_lo']] # removes all rows where the diastolic blood pressure is higher than the systolic blood pressure

for index, row in df.iterrows():
    sys_bp = row['ap_hi']
    dia_bp = row['ap_lo']

    new_col = 'BP_category'

    # Categorize the blood pressure according to the standard guidelines from wikipedia
    if sys_bp < 120 and dia_bp < 80:
        df.at[index, new_col] = 1
    elif sys_bp < 130 and dia_bp < 80:
        df.at[index, new_col] = 2
    elif sys_bp < 140 or dia_bp < 90:
        df.at[index, new_col] = 3
    elif sys_bp < 180 or dia_bp < 120:
        df.at[index, new_col] = 4
    elif sys_bp > 180 or dia_bp > 120:
        df.at[index, new_col] = 5

df_1 = df.drop(['ap_hi', 'ap_lo', 'height', 'weight', 'BMI'], axis=1)
df_2 = df.drop(['height', 'weight', 'BMI_category', 'BP_category'], axis=1)

In [9]:
df_2.drop('id', axis=1, inplace=True)

In [28]:
df_2

Unnamed: 0,age,gender,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,BMI
0,50,2,110,80,1,1,0,0,1,0,21.967120
1,55,1,140,90,3,1,0,0,1,1,34.927679
2,52,1,130,70,3,1,0,0,0,1,23.507805
3,48,2,150,100,1,1,0,0,1,1,28.710479
5,60,1,120,80,2,2,0,0,0,0,29.384676
...,...,...,...,...,...,...,...,...,...,...,...
69994,58,1,150,80,1,1,0,0,1,1,29.384757
69995,53,2,120,80,1,1,1,0,1,0,26.927438
69997,52,2,180,90,3,1,0,1,0,1,31.353579
69998,61,1,135,80,1,2,0,0,0,1,27.099251


In [33]:
from sklearn.model_selection import train_test_split

# splitting the data into train, validation and test sets
X_train, X_test, y_train, y_test = train_test_split(df_2.drop('cardio', axis=1), df_2['cardio'], test_size=0.15, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1765, random_state=42)


In [34]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# standardizing the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# normalizing the features
normalizer = MinMaxScaler()
X_train = normalizer.fit_transform(X_train)
X_val = normalizer.transform(X_val)
X_test = normalizer.transform(X_test)

In [35]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression

# hyperparameters for SVM
# not chnge C
#svm_params = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'gamma': ['scale', 'auto']}

LinearRegression()

lin_reg_params = {'fit_intercept': [True, False], 'normalize': [True, False], 'copy_X': [True, False]}

# hyperparameters for RF
#rf_params = {'n_estimators': [50, 100, 200], 'max_depth': [5, 10, 20, None], 'min_samples_split': [2, 5, 10]}

# dictionary of hyperparameters for each model
param_grids = {'SVM': svm_params, 'RF': rf_params}


In [32]:
# dictionary of models to train
models = {"LR": LinearRegression()}

# perform grid search for each model
for model_name, model in models.items():
    grid = GridSearchCV(model, param_grid=param_grids[model_name], scoring='accuracy', cv=5)
    grid.fit(X_train, y_train)
    print(model_name + ': Best Parameters:', grid.best_params_)
    print(model_name + ': Best Score:', grid.best_score_)


Traceback (most recent call last):
  File "c:\Users\b9ivn\.virtualenvs\Machine-learning-Mosm6azX\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\Users\b9ivn\.virtualenvs\Machine-learning-Mosm6azX\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "c:\Users\b9ivn\.virtualenvs\Machine-learning-Mosm6azX\lib\site-packages\sklearn\metrics\_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "c:\Users\b9ivn\.virtualenvs\Machine-learning-Mosm6azX\lib\site-packages\sklearn\utils\_param_validation.py", line 192, in wrapper
    return func(*args, **kwargs)
  File "c:\Users\b9ivn\.virtualenvs\Machine-learning-Mosm6azX\lib\site-packages\sklearn\metrics\_classification.py", line 221, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
  File "c:\Users\b9ivn\.virtualenvs\Machine-

LR: Best Parameters: {'copy_X': True, 'fit_intercept': True}
LR: Best Score: nan


Traceback (most recent call last):
  File "c:\Users\b9ivn\.virtualenvs\Machine-learning-Mosm6azX\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\Users\b9ivn\.virtualenvs\Machine-learning-Mosm6azX\lib\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "c:\Users\b9ivn\.virtualenvs\Machine-learning-Mosm6azX\lib\site-packages\sklearn\metrics\_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "c:\Users\b9ivn\.virtualenvs\Machine-learning-Mosm6azX\lib\site-packages\sklearn\utils\_param_validation.py", line 192, in wrapper
    return func(*args, **kwargs)
  File "c:\Users\b9ivn\.virtualenvs\Machine-learning-Mosm6azX\lib\site-packages\sklearn\metrics\_classification.py", line 221, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
  File "c:\Users\b9ivn\.virtualenvs\Machine-

- Best parameters for SVM: "Gamma": "scale", "kernel": "rbf"
- Tested: "Gamma": ["auto", "scale"]
- Tested: "kernel": ['linear', 'poly', 'rbf', 'sigmoid']
---------------------------------------------------------------------------------------------------
- Best parameters for RF: 'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 100
- Tested: 'n_estimators': [50, 100, 200]
- Tested: 'max_depth': [5, 10, 20, None]
- Tested: 'min_samples_split': [2, 5, 10]

In [24]:
from sklearn.metrics import accuracy_score

# use best hyperparameters to make predictions on validation data
svm = SVC(C=1, kernel='rbf', gamma='scale')
svm.fit(X_train, y_train)
svm_preds = svm.predict(X_val)

rf = RandomForestClassifier(max_depth=10, min_samples_split=10, n_estimators=100)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_val)

# evaluate performance using accuracy
svm_acc = accuracy_score(y_val, svm_preds)
rf_acc = accuracy_score(y_val, rf_preds)

print('SVM Validation Accuracy:', svm_acc)
print('RF Validation Accuracy:', rf_acc)


SVM Validation Accuracy: 0.7250076585316042
RF Validation Accuracy: 0.7263351373430001


In [16]:
# calculate test accuracy for SVM
svm_test_preds = svm.predict(X_test)
svm_test_acc = accuracy_score(y_test, svm_test_preds)
print('SVM Test Accuracy:', svm_test_acc)

# calculate test accuracy for RF
rf_test_preds = rf.predict(X_test)
rf_test_acc = accuracy_score(y_test, rf_test_preds)
print('RF Test Accuracy:', rf_test_acc)


SVM Test Accuracy: 0.7255642937391482
RF Test Accuracy: 0.727198447553876


In [17]:
# check best parameters for SVM
svm_best_params = grid.best_params_
print('SVM Best Parameters:', svm_best_params)

# check best parameters for RF
rf_best_params = grid.best_params_
print('RF Best Parameters:', rf_best_params)


SVM Best Parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 100}
RF Best Parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 100}
