# Step 1: Import Dataset

In [1]:
import pandas as pd
from IPython.display import display

# Load the dataset
df = pd.read_csv('heart_2020_cleaned.csv')

# Tampilkan beberapa baris pertama dari data untuk memastikan data sudah diimpor dengan benar
print("First 5 rows of the dataset:")
display(df.head())

# Lihat nama kolom yang ada di dalam dataset
print("Columns in the dataset:")
print(df.columns)


First 5 rows of the dataset:


Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


Columns in the dataset:
Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',
       'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',
       'Asthma', 'KidneyDisease', 'SkinCancer'],
      dtype='object')


# Step 2: Handle Missing Values

In [2]:
# Drop or fill missing values
df = df.dropna()

# Tampilkan jumlah baris setelah menghapus missing values
print("Number of rows after dropping missing values:", df.shape[0])


Number of rows after dropping missing values: 319795


# Step 3: Transform and Normalize

In [3]:
from sklearn.preprocessing import StandardScaler

# Pilih fitur numerik yang akan dinormalisasi
numerical_features = ['MentalHealth', 'PhysicalHealth', 'SleepTime']
scaler = StandardScaler()

# Normalisasi fitur numerik yang sudah dipilih
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Tampilkan data setelah normalisasi untuk memastikan berhasil
print("Data after normalization:")
display(df[numerical_features].head(10))


Data after normalization:


Unnamed: 0,MentalHealth,PhysicalHealth,SleepTime
0,3.281069,-0.046751,-1.460354
1,-0.490039,-0.42407,-0.067601
2,3.281069,2.091388,0.628776
3,-0.490039,-0.42407,-0.763977
4,-0.490039,3.097572,0.628776
5,-0.490039,0.330568,3.414282
6,-0.490039,1.462524,-2.15673
7,-0.490039,0.204795,1.325152
8,-0.490039,-0.42407,-1.460354
9,-0.490039,-0.42407,2.021529


# Step 4: Encoding

In [4]:
from sklearn.preprocessing import LabelEncoder

# Pilih fitur yang akan digunakan
features = ['Smoking', 'AlcoholDrinking', 'MentalHealth', 'PhysicalHealth', 'SleepTime']
target = 'HeartDisease'

# Encoding untuk variabel kategori (Smoking dan AlcoholDrinking)
label_encoder = LabelEncoder()
df['Smoking'] = label_encoder.fit_transform(df['Smoking'])
df['AlcoholDrinking'] = label_encoder.fit_transform(df['AlcoholDrinking'])
df['HeartDisease'] = label_encoder.fit_transform(df['HeartDisease'])

# Tampilkan DataFrame dengan format tabel yang rapi
print("Data after encoding:")
display(df[features + [target]].head(10))


Data after encoding:


Unnamed: 0,Smoking,AlcoholDrinking,MentalHealth,PhysicalHealth,SleepTime,HeartDisease
0,1,0,3.281069,-0.046751,-1.460354,0
1,0,0,-0.490039,-0.42407,-0.067601,0
2,1,0,3.281069,2.091388,0.628776,0
3,0,0,-0.490039,-0.42407,-0.763977,0
4,0,0,-0.490039,3.097572,0.628776,0
5,1,0,-0.490039,0.330568,3.414282,1
6,0,0,-0.490039,1.462524,-2.15673,0
7,1,0,-0.490039,0.204795,1.325152,0
8,0,0,-0.490039,-0.42407,-1.460354,0
9,0,0,-0.490039,-0.42407,2.021529,0


# Step 5: Build Models

In [5]:
# Pisahkan fitur (X) dan target (y)
X = df[features]
y = df[target]

# Print beberapa data X dan y
print("Features (X):")
display(X.head(10))
print("Target (y):")
display(y.head(10))


Features (X):


Unnamed: 0,Smoking,AlcoholDrinking,MentalHealth,PhysicalHealth,SleepTime
0,1,0,3.281069,-0.046751,-1.460354
1,0,0,-0.490039,-0.42407,-0.067601
2,1,0,3.281069,2.091388,0.628776
3,0,0,-0.490039,-0.42407,-0.763977
4,0,0,-0.490039,3.097572,0.628776
5,1,0,-0.490039,0.330568,3.414282
6,0,0,-0.490039,1.462524,-2.15673
7,1,0,-0.490039,0.204795,1.325152
8,0,0,-0.490039,-0.42407,-1.460354
9,0,0,-0.490039,-0.42407,2.021529


Target (y):


0    0
1    0
2    0
3    0
4    0
5    1
6    0
7    0
8    0
9    0
Name: HeartDisease, dtype: int64

# Estimasi

In [6]:
# Estimasi bisa dilakukan dengan model regresi atau analisis statistik.
# Contoh: menggunakan regresi logistik
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Split data menjadi training dan testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Inisialisasi model regresi logistik
model = LogisticRegression()

# Latih model
model.fit(X_train, y_train)

# Tampilkan koefisien model
print("Model coefficients:", model.coef_)


Model coefficients: [[ 0.70381312 -0.67859913 -0.07526721  0.40422601  0.05589452]]


# Prediction

In [7]:
# Buat prediksi pada data test
predictions = model.predict(X_test)

# Tampilkan hasil prediksi
print("Predictions on test data:")
display(pd.DataFrame({'Actual': y_test, 'Predicted': predictions}))


Predictions on test data:


Unnamed: 0,Actual,Predicted
271884,0,0
270361,0,0
219060,0,0
24010,0,0
181930,0,0
...,...,...
181387,0,0
13791,0,0
180164,0,0
94526,0,0


# Klasifikasi

In [8]:
from sklearn.metrics import classification_report

# Tampilkan laporan klasifikasi
print("Classification Report:")
print(classification_report(y_test, predictions, zero_division=0))
#Zero_divison bila data itu merusak target atau tak di target manapun.

Classification Report:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     58367
           1       0.00      0.00      0.00      5592

    accuracy                           0.91     63959
   macro avg       0.46      0.50      0.48     63959
weighted avg       0.83      0.91      0.87     63959

