In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import joblib
import seaborn as sns
import matplotlib.pyplot as plt


df = pd.read_csv("heart.csv")

In [16]:
df['RestingBP'] = df['RestingBP'].replace(0, df['RestingBP'].median())
df['Cholesterol'] = df['Cholesterol'].replace(0, df['Cholesterol'].median())

In [17]:
X = df.drop("HeartDisease", axis=1)
y = df["HeartDisease"]


In [18]:
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns
categorical_cols = X.select_dtypes(include="object").columns
# odvajamo numericke i logicke kolone - jer cemo skalirati numericke i kodirati kategoricke kolone

In [19]:
# skaliranje i kodiranje
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


In [20]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# 80% ulaznih podataka za treniranje modela
# 20% ulaznih podataka za testiranje modela

In [21]:
# 🏗️ Fit transform train, transform test
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)


# fit transfrom uci kako da skalira i kodira podatke
# transform koristi naucene postavke da isto transformise test podatke


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd

X_train_transformed = preprocessor.fit_transform(X_train)


ohe = preprocessor.named_transformers_['cat']
ohe_features = ohe.get_feature_names_out(categorical_cols)


feature_names = list(numeric_cols) + list(ohe_features)

X_vif = pd.DataFrame(X_train_transformed, columns=feature_names)


vif_data = pd.DataFrame()
vif_data["Feature"] = X_vif.columns
vif_data["VIF"] = [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])]


print(vif_data.sort_values("VIF", ascending=False))


# VIF < 5
# nema ozbiljne multikolinearnosti medju atributima


              Feature       VIF
14        ST_Slope_Up  4.875019
13      ST_Slope_Flat  4.541284
6               Sex_M  4.408214
10  RestingECG_Normal  3.943031
12   ExerciseAngina_Y  2.538123
11      RestingECG_ST  1.972557
7   ChestPainType_ATA  1.770659
4               MaxHR  1.546242
8   ChestPainType_NAP  1.540727
5             Oldpeak  1.454105
0                 Age  1.382737
9    ChestPainType_TA  1.155632
3           FastingBS  1.113203
1           RestingBP  1.102688
2         Cholesterol  1.064634


In [22]:
import joblib

joblib.dump(df, "heart_clean_df.pkl")
joblib.dump((X_train_processed, X_test_processed, y_train, y_test), "train_test_processed.pkl")


['train_test_processed.pkl']