# Data Understanding

In [31]:
import pandas as pd
file_path = "heart_disease_uci(in).csv"
df = pd.read_csv(file_path)

In [33]:
print("Data Shape:", df.shape)
print(df.info())
print(df.describe())

Data Shape: (920, 16)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB
None
               id         age    trestbps        chol      thalch     oldpeak  \
count  920.000000  92

In [35]:
print("Missing Values:")
print(df.isnull().sum())

Missing Values:
id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64


# Preprocessing

In [38]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [42]:
df.dropna(inplace=True)

In [46]:
categorical_features = ["sex", "cp", "restecg", "slope", "thal"]
for feature in categorical_features:
    le = LabelEncoder()
    df[feature] = le.fit_transform(df[feature])

In [52]:
X = df.drop(columns=["num", "id", "dataset"])
y = df["num"]

In [54]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [58]:
import pickle
with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

# Modeling

In [61]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [67]:
models = {"Logistic regression": LogisticRegression(),
          "Random forest": RandomForestClassifier(), "Support Vector Machine": SVC()}
best_model = None
best_f1 = 0

In [69]:
for model_name, model in models.items():
    grid_params = {}
    if model_name == "Logistic Regression":
        grid_params = {"C": [0.1, 1, 10]}
    elif model_name == "Random forest":
        grid_params = {"n_estimators": [100, 200], "max_depth": [10, 20, None]}
    elif model_name == "Support Vector machine":
        grid_params = {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]}

In [71]:
grid_search = GridSearchCV(model, grid_params, cv=5, scoring="f1_macro")
grid_search.fit(X_train, y_train)

In [73]:
y_pred = grid_search.best_estimator_.predict(X_test)
f1 = f1_score(y_test, y_pred, average="macro")

In [75]:
print(f"Model: {model_name}")
print(f"Best Params: {grid_search.best_params_}")
print(f"F1 Score: {f1}")
if f1 > best_f1:
    best_f1 = f1
    best_model = grid_search.best_estimator_

Model: Support Vector Machine
Best Params: {}
F1 Score: 0.21353383458646613


In [79]:
with open('best_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

# Evaluation

In [82]:
y_pred = best_model.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [86]:
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Confusion Matrix:
[[33  1  1  0  0]
 [ 5  2  2  4  0]
 [ 2  1  0  2  0]
 [ 1  2  1  0  0]
 [ 1  0  1  1  0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.94      0.86        35
           1       0.33      0.15      0.21        13
           2       0.00      0.00      0.00         5
           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00         3

    accuracy                           0.58        60
   macro avg       0.22      0.22      0.21        60
weighted avg       0.53      0.58      0.55        60

