<a href="https://colab.research.google.com/github/AkuuAlyaaa/UAS_AI/blob/main/UAS_AI_Kel3_Retesting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# List the working directory (including hidden entries, human-readable sizes).
# The data-loading step reads heart.csv from this directory.
!ls -lah


total 56K
drwxr-xr-x 1 root root 4.0K Jan 11 23:03 .
drwxr-xr-x 1 root root 4.0K Jan 11 23:02 ..
drwxr-xr-x 4 root root 4.0K Dec  9 14:41 .config
-rw-r--r-- 1 root root  38K Jan 11 23:03 heart.csv
drwxr-xr-x 1 root root 4.0K Dec  9 14:42 sample_data


In [6]:
# Read the heart-disease CSV from the working directory and take a first look.
import pandas as pd

df = pd.read_csv("heart.csv")

# Quick sanity check: dimensions and column names.
print("df.shape =", df.shape)
print(list(df.columns))

# Rich preview of the first five rows.
df.head(5)

df.shape = (1025, 14)
['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [7]:
# Check for missing values: count NaNs per column and show the (up to) 20
# columns with the most missing entries first.
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False).head(20)


Unnamed: 0,0
age,0
sex,0
cp,0
trestbps,0
chol,0
fbs,0
restecg,0
thalach,0
exang,0
oldpeak,0


In [8]:
# Cleaning: drop exact duplicate rows, then drop rows with any missing value.
# Both calls return new frames, so the raw `df` is left untouched.
before = len(df)
df0 = df.drop_duplicates().dropna()
after = len(df0)

print("Before:", before, "After:", after, "Removed:", before - after)


Before: 1025 After: 302 Removed: 723


In [9]:
# Feature selection as in the journal: remove these three columns.
drop_cols = ["trestbps", "chol", "fbs"]

# errors="ignore" skips any listed column that is not present in df0.
df1 = df0.drop(columns=drop_cols, errors="ignore")

print("df1.shape =", df1.shape)
df1.head()


df1.shape = (302, 11)


Unnamed: 0,age,sex,cp,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,1,168,0,1.0,2,2,3,0
1,53,1,0,0,155,1,3.1,0,0,3,0
2,70,1,0,1,125,1,2.6,0,0,3,0
3,61,1,0,1,161,0,0.0,2,1,3,0
4,62,0,0,1,106,0,1.9,1,3,2,0


In [19]:
# Separate the feature matrix X from the label vector y.
X = df1.drop("target", axis=1)
y = df1["target"]

print("X shape:", X.shape)

# Class balance of the label.
class_counts = y.value_counts()
print("Distribusi kelas:\n", class_counts)


X shape: (302, 10)
Distribusi kelas:
 target
1    164
0    138
Name: count, dtype: int64


In [20]:
# 80:20 train/test split, stratified on the label so both parts keep the
# same class proportions; the fixed seed makes the split reproducible.
from sklearn.model_selection import train_test_split

split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Train:", X_train.shape, "Test:", X_test.shape)


Train: (241, 10) Test: (61, 10)


In [21]:
# Preprocessing (one-hot encoding + standardisation), meant to be used as the
# first step of a Pipeline.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# Following the usual convention for the heart-disease dataset: these three
# columns are treated as continuous, every other feature as categorical.
num_cols = ["age", "thalach", "oldpeak"]
cat_cols = [c for c in X.columns if c not in num_cols]

preprocess = ColumnTransformer(
    transformers=[
        # Zero mean / unit variance for the continuous features.
        ("num", StandardScaler(), num_cols),
        # handle_unknown="ignore": a category absent from the fitting data is
        # encoded as all zeros instead of raising at transform time.
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ],
    # Always emit a dense array. With the default threshold (0.3) the output
    # silently becomes a sparse matrix once the one-hot block is sparse
    # enough, and GaussianNB (one of the compared models) rejects sparse input.
    sparse_threshold=0,
)

(num_cols, cat_cols)

(['age', 'thalach', 'oldpeak'],
 ['sex', 'cp', 'restecg', 'exang', 'slope', 'ca', 'thal'])

In [22]:
# Define the four classifiers to compare.
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Display name -> unfitted estimator. Insertion order is preserved, so loops
# over `models` visit the classifiers in the order they are added here.
models = {}
models["Gradient Boosting"] = GradientBoostingClassifier(random_state=42)
models["Random Forest"] = RandomForestClassifier(random_state=42)
models["Naive Bayes"] = GaussianNB()
models["Logistic Regression"] = LogisticRegression(max_iter=2000)

In [14]:
# Training: 10-fold stratified cross-validation on the training split only.
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_validate

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Output column -> key under which cross_validate reports that metric.
column_to_score = {
    "cv_accuracy": "test_accuracy",
    "cv_precision": "test_precision",
    "cv_recall": "test_recall",
    "cv_f1": "test_f1",
}

cv_rows = []
for model_name, estimator in models.items():
    # Preprocessing lives inside the pipeline, so it is re-fitted on the
    # training part of every fold.
    pipe = Pipeline([("prep", preprocess), ("model", estimator)])
    scores = cross_validate(
        pipe, X_train, y_train, cv=cv,
        scoring=["accuracy", "precision", "recall", "f1"]
    )

    # One summary row per model: the mean of each metric over the folds.
    row = {"model": model_name}
    for column, score_key in column_to_score.items():
        row[column] = float(np.mean(scores[score_key]))
    cv_rows.append(row)

cv_table = pd.DataFrame(cv_rows).sort_values("cv_accuracy", ascending=False)
cv_table

Unnamed: 0,model,cv_accuracy,cv_precision,cv_recall,cv_f1
1,Random Forest,0.863167,0.868027,0.892857,0.876929
3,Logistic Regression,0.850167,0.86555,0.877473,0.864814
2,Naive Bayes,0.816667,0.852274,0.838462,0.832903
0,Gradient Boosting,0.813333,0.813084,0.862637,0.834675


In [15]:
# Testing: fit every model on the full training split, predict the held-out
# test split, and collect the confusion matrix plus summary metrics.
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Rows are gathered in a list and turned into a DataFrame once at the end, so
# `test_table` is only ever bound to a DataFrame.
test_rows = []
for name, clf in models.items():
    pipe = Pipeline([("prep", preprocess), ("model", clf)])
    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_test)

    # labels=[0, 1] pins the matrix to 2x2 in a fixed order, so ravel() always
    # yields exactly [TN, FP, FN, TP] -- even if only one class happens to
    # appear among the true and predicted labels.
    cm = confusion_matrix(y_test, pred, labels=[0, 1])
    test_rows.append({
        "model": name,
        "accuracy": accuracy_score(y_test, pred),
        # zero_division=0: report 0 instead of warning when a metric's
        # denominator is zero (e.g. no positive predictions).
        "precision": precision_score(y_test, pred, zero_division=0),
        "recall": recall_score(y_test, pred, zero_division=0),
        "f1": f1_score(y_test, pred, zero_division=0),
        "TN_FP_FN_TP": cm.ravel().tolist()
    })

test_table = pd.DataFrame(test_rows).sort_values("accuracy", ascending=False)
test_table

Unnamed: 0,model,accuracy,precision,recall,f1,TN_FP_FN_TP
3,Logistic Regression,0.852459,0.875,0.848485,0.861538,"[24, 4, 5, 28]"
2,Naive Bayes,0.819672,0.823529,0.848485,0.835821,"[22, 6, 5, 28]"
1,Random Forest,0.754098,0.78125,0.757576,0.769231,"[21, 7, 8, 25]"
0,Gradient Boosting,0.737705,0.757576,0.757576,0.757576,"[20, 8, 8, 25]"


In [17]:
# Optional: open the test results in an interactive Google Sheet.
# `google.colab` only exists inside the Colab runtime, so the import is
# guarded to keep the notebook runnable end-to-end elsewhere.
try:
    from google.colab import sheets
except ImportError:
    sheets = None

if sheets is not None:
    sheet = sheets.InteractiveSheet(df=test_table)
else:
    sheet = None
    print("google.colab is not available; skipping the interactive sheet.")

https://docs.google.com/spreadsheets/d/1XlToP-wpC4X3K13eAvl3x1Va5ELAgoV-L_WLzwWMqqw/edit#gid=0


In [18]:
# Display the held-out test metrics table again (one row per model).
test_table

Unnamed: 0,model,accuracy,precision,recall,f1,TN_FP_FN_TP
3,Logistic Regression,0.852459,0.875,0.848485,0.861538,"[24, 4, 5, 28]"
2,Naive Bayes,0.819672,0.823529,0.848485,0.835821,"[22, 6, 5, 28]"
1,Random Forest,0.754098,0.78125,0.757576,0.769231,"[21, 7, 8, 25]"
0,Gradient Boosting,0.737705,0.757576,0.757576,0.757576,"[20, 8, 8, 25]"
