In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [5]:
# Load the raw dataset; the file is semicolon-separated.
df = pd.read_csv("../Labb/Disease_prediction/cardio_train.csv", sep=";")

# Divide age by 365 and round to the nearest whole number
# (presumably converting days to years -- confirm against the dataset description).
df["age"] = (df["age"] / 365).round().astype(int)

# Body-mass index: weight / (height / 100)^2
# (assumes weight is in kg and height in cm -- TODO confirm).
df['BMI'] = df['weight'] / (df['height'] / 100) ** 2

# Remove outliers: keep only rows with 15 < BMI < 50 (both bounds exclusive).
df = df[(df['BMI'] > 15) & (df['BMI'] < 50)]

# Bucket BMI into three ordinal classes:
#   1 -> BMI < 25
#   2 -> 25 <= BMI < 30
#   3 -> BMI >= 30
# np.select uses the first condition that matches, so the order of the list matters.
df['BMI_category'] = np.select([df['BMI'] < 25, df['BMI'] < 30], [1, 2], default=3)

# Blood-pressure outlier removal.
# Systolic pressure (ap_hi) must lie strictly between 90 and 200, and diastolic
# pressure (ap_lo) strictly between 60 and 145. The lower bounds follow the
# author's note that readings below 90/60 count as hypotension (low blood pressure).
# NOTE(review): the comparisons are strict, so readings of exactly 90 systolic or
# 60 diastolic are dropped as well -- confirm that this is intended.
#
# Author's rationale for the upper bounds: the highest remaining systolic value in
# the dataset is reported as 197, and systolic pressure above 180 is potentially
# life-threatening, so very few people exceed 200; the highest remaining diastolic
# value is reported as 140, hence the cut-off at 145.
systolic_in_range = df['ap_hi'].gt(90) & df['ap_hi'].lt(200)
diastolic_in_range = df['ap_lo'].gt(60) & df['ap_lo'].lt(145)

# Only keep readings where systolic is strictly greater than diastolic;
# rows where diastolic is equal to or higher than systolic are discarded.
systolic_above_diastolic = df['ap_hi'].gt(df['ap_lo'])

df = df[systolic_in_range & diastolic_in_range & systolic_above_diastolic]

# Categorize the blood pressure into five ordinal classes stored in 'BP_category':
#   1 -> normal:               systolic < 120 and diastolic < 80
#   2 -> elevated:             systolic 120-129 and diastolic < 80
#   3 -> hypertension stage 1: systolic 130-139 or diastolic 80-89
#   4 -> hypertension stage 2: systolic >= 140 or diastolic >= 90
#   5 -> hypertensive crisis:  systolic > 180 or diastolic > 120
#
# When the two readings fall into different classes, the more severe class wins.
# The conditions are therefore listed from most to least severe, and np.select
# assigns the first one that matches. Every row receives a class (default = 1),
# so the column has an integer dtype and no missing values.
sys_bp = df['ap_hi']
dia_bp = df['ap_lo']

bp_conditions = [
    (sys_bp > 180) | (dia_bp > 120),   # class 5
    (sys_bp >= 140) | (dia_bp >= 90),  # class 4
    (sys_bp >= 130) | (dia_bp >= 80),  # class 3
    sys_bp >= 120,                     # class 2 (diastolic is < 80 once the rows above are excluded)
]
df['BP_category'] = np.select(bp_conditions, [5, 4, 3, 2], default=1)

# Variant 1: drop the raw blood-pressure, height, weight and BMI columns,
# keeping the derived BMI_category / BP_category columns.
raw_measurement_cols = ['ap_hi', 'ap_lo', 'height', 'weight', 'BMI']
df_1 = df.drop(columns=raw_measurement_cols)

# Variant 2: drop height, weight and the two derived category columns,
# keeping the raw blood-pressure readings and the numeric BMI.
category_and_body_cols = ['height', 'weight', 'BMI_category', 'BP_category']
df_2 = df.drop(columns=category_and_body_cols)

In [None]:
# Preprocess the data
# Drop the id column. The frame is rebound instead of mutated in place, and
# errors='ignore' makes the step a no-op when 'id' is already gone, so this
# cell can be re-run without raising a KeyError.
df = df.drop(columns='id', errors='ignore')

# Separate the features from the target column 'cardio'.
X = df.drop(columns='cardio')
y = df['cardio']

# Split the dataset into train, validation, and test sets.
# Step 1: hold out 20% of all rows as the test set.
# Step 2: take 25% of the remaining 80% as the validation set,
#         which gives a 60% / 20% / 20% train / validation / test split.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42
)

# Prepare two alternative scalings of the same feature matrix:
#   scaler1 -> feature standardization (StandardScaler)
#   scaler2 -> normalization (MinMaxScaler)
# Both scalers are fitted on the training split only and are then applied
# to the training and validation splits.
scaler1 = StandardScaler().fit(X_train)
scaler2 = MinMaxScaler().fit(X_train)

# Standardized features
X_train_scaled1 = scaler1.transform(X_train)
X_val_scaled1 = scaler1.transform(X_val)

# Min-max normalized features
X_train_scaled2 = scaler2.transform(X_train)
X_val_scaled2 = scaler2.transform(X_val)

# Hyperparameter grids, one per model. They are only defined here; nothing in
# this step consumes them.
lr_param_grid = {'C': [0.01, 0.1, 1, 10]}
dt_param_grid = {'max_depth': [None, 5, 10, 20]}
rf_param_grid = {'n_estimators': [100, 500, 1000]}
svm_param_grid = {'C': [0.01, 0.1, 1, 10], 'kernel': ['linear', 'rbf']}
knn_param_grid = {'n_neighbors': [3, 5, 10, 20]}

# Instantiate each model with its default hyperparameters.
# The tree-based models are randomized, so they get the same fixed seed (42)
# that the train/validation/test splits use; this keeps results reproducible
# between runs.
lr = LogisticRegression()
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)
svm = SVC()
knn = KNeighborsClassifier()

# Baseline fit: train every model on the standardized training features.
for model in (lr, dt, rf, svm, knn):
    model.fit(X_train_scaled1, y_train)

# Use GridSearchCV to tune the hyperparameters and choose the best model
lr_grid
