# Import Library

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

: 

# Input Data

In [None]:
# 1. Load heart.csv (semicolon-delimited) from the working directory.
data = pd.read_csv("heart.csv", delimiter=";")

# Report the (rows, columns) shape, then render the first five rows.
print("Ukuran data:", data.shape)
print("5 data teratas:")
first_rows = data.head()
display(first_rows)

# Informasi Data

In [None]:
# 2. Check column data types and missing values.
# DataFrame.info() prints its summary directly, so it is not wrapped in print().
print("\nInfo dataset:")
data.info()

# Missing Value

In [None]:
# Count the null entries in every column and print the per-column totals.
missing_per_column = data.isna().sum()
print("\nCek missing value:")
print(missing_per_column)

## Melihat Distribusi pada data target

In [None]:
# 4. Check the target distribution: number of rows per HeartDisease value.
target_counts = data["HeartDisease"].value_counts()
print("\nDistribusi target (HeartDisease):")
print(target_counts)

In [None]:
# matplotlib and seaborn are imported again here; the first cell of the notebook
# already imports both under the same aliases.
import matplotlib.pyplot as plt
import seaborn as sns

# Set the seaborn "whitegrid" style so the plots look tidier.
sns.set(style="whitegrid")

In [None]:
# Numeric columns to profile; the later boxplot cells iterate over this same list.
numerical_cols = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']

# One histogram with a KDE overlay per column, laid out on a 2x3 grid
# (five columns, so the last grid slot stays empty).
plt.figure(figsize=(12, 8))
for panel, col in enumerate(numerical_cols, start=1):
    plt.subplot(2, 3, panel)
    sns.histplot(
        data[col],
        kde=True,
        bins=20,
        color='skyblue',
        edgecolor='black',
    )
    plt.title(f'Distribusi {col}')
plt.tight_layout()
plt.show()


In [None]:
# Horizontal boxplot per numeric column on a 2x3 grid, used to eyeball outliers.
plt.figure(figsize=(12, 8))
for panel, col in enumerate(numerical_cols, start=1):
    plt.subplot(2, 3, panel)
    column_values = data[col]
    sns.boxplot(x=column_values, color='lightcoral')
    plt.title(f'Deteksi Outlier: {col}')
plt.tight_layout()
plt.show()

In [None]:
# Bar chart of how many rows fall into each HeartDisease class.
# NOTE(review): recent seaborn releases appear to warn when `palette` is passed
# without `hue` - confirm against the installed seaborn version.
plt.figure(figsize=(5, 4))
ax = sns.countplot(data=data, x='HeartDisease', palette='Set2')
ax.set_title("Distribusi Kelas Target (Heart Disease)")
ax.set_xlabel("HeartDisease (0 = Tidak, 1 = Ya)")
ax.set_ylabel("Jumlah")
plt.show()

In [None]:
# Count how often each value of the 'HeartDisease' column occurs.
heartdisease = data['HeartDisease'].value_counts()

# Draw the class shares as a pie chart, labelled with the class values
# and annotated with one-decimal percentages.
plt.figure(figsize=(10, 6))
plt.pie(
    heartdisease,
    labels=heartdisease.index,
    autopct='%1.1f%%',
    startangle=140,
)
plt.title('Pie Chart dari Kategori Heart Disease')
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

In [None]:
# Compare each numeric column across the HeartDisease classes:
# one boxplot panel per column on a 2x3 grid.
plt.figure(figsize=(12, 8))
for panel, col in enumerate(numerical_cols, start=1):
    plt.subplot(2, 3, panel)
    sns.boxplot(
        data=data,
        x='HeartDisease',
        y=col,
        palette='pastel',
    )
    plt.title(f'{col} vs HeartDisease')
plt.tight_layout()
plt.show()


In [None]:
# Pairwise relationships between variables, coloured by HeartDisease.
# corner=True draws only the lower triangle; the diagonal shows KDE curves.
sns.pairplot(
    data,
    hue='HeartDisease',
    diag_kind='kde',
    corner=True,
    palette='husl',
)
# y=1.02 places the figure title slightly above the top edge of the grid.
plt.suptitle("Hubungan Antar Variabel Berdasarkan HeartDisease", y=1.02)
plt.show()

In [None]:
# Summary statistics, transposed so each described column becomes a row.
data.describe().transpose()

In [None]:
# Ordinal-encode the categorical columns with an explicit category order per column.
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd

# Column -> ordered list of its categories. The position of a category in the
# list is the numeric code the encoder assigns to it.
category_orders = {
    'Sex': ['F', 'M'],
    'ChestPainType': ['NAP', 'ATA', 'TA', 'ASY'],
    'RestingECG': ['Normal', 'ST', 'LVH'],
    'ExerciseAngina': ['N', 'Y'],
    'ST_Slope': ['Up', 'Flat', 'Down'],
}
categorical_cols = list(category_orders)

encoder = OrdinalEncoder(categories=list(category_orders.values()))

# Work on a copy so the raw `data` frame stays untouched.
data_encoded = data.copy()
data_encoded[categorical_cols] = encoder.fit_transform(data_encoded[categorical_cols])

# Mark the encoded columns as pandas 'category' dtype.
data_encoded = data_encoded.astype({col: 'category' for col in categorical_cols})

# Check the result.
print(data_encoded.dtypes)


In [None]:
# Show the first five rows of the encoded frame as the cell's rich output.
print("lima data teratas setelah encoding:")
data_encoded.head(5)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# ========== 3. Scale the numeric features with MinMaxScaler ==========
scaler = MinMaxScaler()

# Numeric feature columns to scale (the target column is excluded).
num_cols = ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']

# Fit on the untouched columns of `data` instead of on `data_encoded`.
# `data_encoded` starts as a copy of `data` and only its categorical columns are
# re-encoded, so on a first run the values are identical. Reading from `data`
# keeps this cell idempotent: re-running it no longer re-fits the scaler on
# already-scaled values, which would turn the scaler that is pickled later
# into a near-identity transform.
# NOTE(review): the scaler is fit on every row, before the train/test split done
# later in the notebook, so test rows influence the learned min/max. Consider
# fitting on the training rows only.
data_encoded[num_cols] = scaler.fit_transform(data[num_cols])

# ========== 4. Check the result ==========
print(data_encoded.head())
print("\nRentang nilai setelah scaling:")
print(data_encoded[num_cols].describe().T[['min', 'max']])


In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature columns.
y = data_encoded['HeartDisease']
X = data_encoded.drop(columns='HeartDisease')

# Hold out 20% of the rows for testing; stratify on the target so both splits
# keep the same class proportions, and fix the seed for repeatable splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print("Ukuran data latih:", X_train.shape)
print("Ukuran data uji:", X_test.shape)

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier with default settings, trained on the training split.
# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression().fit(X_train, y_train)

# Leave the fitted estimator as the cell's displayed output.
model


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score the logistic-regression model on the held-out test split.
y_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
per_class_report = classification_report(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", test_accuracy)
print(per_class_report)
print(confusion)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# The original cell also ran `model = RandomForestClassifier()` here. That
# estimator was never used, and it replaced the fitted logistic-regression
# `model` with an unfitted one, so re-running the earlier evaluation cell
# afterwards would fail. The assignment has been removed.

# Hyperparameter grid: 3 * 3 * 3 * 3 * 2 = 162 combinations.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# Exhaustive search with 5-fold cross-validation (162 * 5 = 810 fits), scored
# by accuracy and run on all available cores. The seed is fixed on the
# estimator so the search is repeatable.
grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

grid.fit(X_train, y_train)
print("Best parameters:", grid.best_params_)
print("Best accuracy:", grid.best_score_)

In [None]:
# Use the best parameters found by the grid search.
# They are taken from `grid.best_params_` rather than typed in by hand, so this
# cell cannot drift out of sync when the grid, the data or the split changes.
best_rf = RandomForestClassifier(**grid.best_params_, random_state=0)

# Train the model on the whole training split.
best_rf.fit(X_train, y_train)

# Predict the test split.
y_pred = best_rf.predict(X_test)


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Evaluate the current `y_pred` against the test labels.
test_accuracy = accuracy_score(y_test, y_pred)
per_class_report = classification_report(y_test, y_pred)
print("Accuracy (Test):", test_accuracy)
print("\nClassification Report:\n", per_class_report)

# Confusion matrix rendered as an annotated heatmap with integer cell labels.
confusion = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5,4))
ax = sns.heatmap(confusion, annot=True, fmt='d', cmap='Blues')
ax.set_title("Confusion Matrix - Random Forest")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()


In [None]:
import os
import pickle

# Create the 'model' folder if it does not exist yet.
os.makedirs("model", exist_ok=True)

# Everything needed to reproduce the training-time feature pipeline at
# inference: the classifier, the numeric scaler and the categorical encoder.
# The encoder was previously not saved, so the ordinal codes the model was
# trained on could not be rebuilt from the pickled files alone.
artifacts = {
    "model/random_forest_model.pkl": best_rf,
    "model/scaler.pkl": scaler,
    "model/encoder.pkl": encoder,
}

for artifact_path, artifact in artifacts.items():
    with open(artifact_path, "wb") as file:
        pickle.dump(artifact, file)

# NOTE: pickle files must only be loaded from trusted sources, because
# unpickling can execute arbitrary code.
print("âœ… Model dan scaler berhasil disimpan!")

In [None]:
!pip install streamlit --no-deps
!pip install blinker cachetools click protobuf requests