# ABIDZAR_202231015

In [2]:
# Naive Bayes on Diabetes Dataset

# Import the required libraries
import pandas as pd  # Data manipulation and analysis
from sklearn.model_selection import train_test_split  # Splits the data into train and test sets
from sklearn.naive_bayes import GaussianNB  # Gaussian Naive Bayes model
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report  # Model evaluation

In [3]:
# Read the diabetes CSV into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
file_path = 'K02_diabetes.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

In [4]:
# Display basic information about the dataset
print("Dataset Info:")  # Header line printed ahead of the summary
data.info()  # Prints the index range, per-column non-null counts and dtypes, and memory usage

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [5]:
# Preview the first five records; sep="\n" keeps the header and the table on separate lines
print("\nFirst 5 rows:", data.head(), sep="\n")


First 5 rows:
   gender   age  hypertension  heart_disease smoking_history    bmi  \
0  Female  80.0             0              1           never  25.19   
1  Female  54.0             0              0         No Info  27.32   
2    Male  28.0             0              0           never  27.32   
3  Female  36.0             0              0         current  23.45   
4    Male  76.0             1              1         current  20.14   

   HbA1c_level  blood_glucose_level  diabetes  
0          6.6                  140         0  
1          6.6                   80         0  
2          5.7                  158         0  
3          5.0                  155         0  
4          4.8                  155         0  


In [6]:
# One-hot encode the two text columns ('gender' and 'smoking_history').
# drop_first=True drops the first level of each encoded column, which avoids
# the dummy variable trap (one indicator being fully implied by the others).
data_encoded = pd.get_dummies(
    data,
    columns=['gender', 'smoking_history'],
    drop_first=True,
)

In [7]:
# Split the encoded frame into the target and the feature matrix
y = data_encoded['diabetes']  # Target: the 'diabetes' column
X = data_encoded.drop(columns=['diabetes'])  # Features: every column except 'diabetes'

In [8]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# stratify=y keeps the class proportions of the target the same in both splits;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [9]:
# Create a Gaussian Naive Bayes classifier and fit it on the training split
nb_model = GaussianNB()
nb_model.fit(X=X_train, y=y_train)

In [10]:
# Predict the class label of every row in the held-out test split
y_pred = nb_model.predict(X=X_test)

In [11]:
# Accuracy: the share of test rows whose predicted label equals the true label
print("Model Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

Model Accuracy: 0.8649


In [12]:
# Confusion matrix: counts of correct and incorrect predictions per class
# (rows are the true classes, columns are the predicted classes)
print("\nConfusion Matrix:", confusion_matrix(y_true=y_test, y_pred=y_pred), sep="\n")


Confusion Matrix:
[[23927  3523]
 [  530  2020]]


In [13]:
# Classification report: precision, recall, f1-score and support for each class
print("\nClassification Report:", classification_report(y_true=y_test, y_pred=y_pred), sep="\n")


Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.87      0.92     27450
           1       0.36      0.79      0.50      2550

    accuracy                           0.86     30000
   macro avg       0.67      0.83      0.71     30000
weighted avg       0.93      0.86      0.89     30000

