## Import library

In [496]:
import pandas as pd
import numpy as np
from IPython.display import display
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler

## Import data

In [497]:
df = pd.read_csv('Dataset of Diabetes .csv')
df.head()

Unnamed: 0,ID,No_Pation,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS
0,502,17975,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
1,735,34221,M,26,4.5,62,4.9,3.7,1.4,1.1,2.1,0.6,23.0,N
2,420,47975,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
3,680,87656,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
4,504,34223,M,33,7.1,46,4.9,4.9,1.0,0.8,2.0,0.4,21.0,N


## Pembersihan Data

In [498]:
# apakah ada nilai null
null_count = df.isnull().sum()
print(null_count)

ID           0
No_Pation    0
Gender       0
AGE          0
Urea         0
Cr           0
HbA1c        0
Chol         0
TG           0
HDL          0
LDL          0
VLDL         0
BMI          0
CLASS        0
dtype: int64


In [499]:
# melihat apakah tipe datnya sudah sesuai
print(df.dtypes)

ID             int64
No_Pation      int64
Gender        object
AGE            int64
Urea         float64
Cr             int64
HbA1c        float64
Chol         float64
TG           float64
HDL          float64
LDL          float64
VLDL         float64
BMI          float64
CLASS         object
dtype: object


In [500]:
# melihat nilai unik dari kolom kategorikal
print('CLASS sebelum diubah : ', df['CLASS'].unique())
print('Gender sebelum diubah', df['Gender'].unique())

# menghilangkan spasi dalam kolom CLASS agar data konsisten
df['CLASS'] = df['CLASS'].str.strip()

# membuat value gender menjadi uppercase agar data konsisten
df['Gender'] = df['Gender'].str.upper()

print('\nCLASS sesudah diubah', df['CLASS'].unique())
print('Gender sesudah diubah', df['Gender'].unique())

CLASS sebelum diubah :  ['N' 'N ' 'P' 'Y' 'Y ']
Gender sebelum diubah ['F' 'M' 'f']

CLASS sesudah diubah ['N' 'P' 'Y']
Gender sesudah diubah ['F' 'M']


In [501]:
# # Hitung Z-score untuk kolom 'urea'
# mean_urea = np.mean(df['Urea'])
# std_urea = np.std(df['Urea'])
# df['Z-score urea'] = (df['Urea'] - mean_urea) / std_urea

# mean_age = np.mean(df['AGE'])
# std_age = np.std(df['AGE'])
# df['Z-score AGE'] = (df['AGE'] - mean_age) / std_age

# threshold = 3

# # df['Outlier urea'] = np.abs(df['Z-score urea']) > threshold
# # hasil = df[df['Outlier urea'] == 1]
# # print (hasil['Urea'])

# df['Outlier age'] = np.abs(df['Z-score AGE']) > threshold
# hasil = df[df['Outlier age'] == 1]
# print (hasil['AGE'])

In [502]:
# konversi label Gender menjadi numerik menggunakan label encoding
df['Gender'] = df['Gender'].map({'M': 1, 'F': 0})
display(df['Gender'])

0      0
1      1
2      0
3      0
4      1
      ..
995    1
996    1
997    1
998    1
999    1
Name: Gender, Length: 1000, dtype: int64

In [503]:
# mengubah label class menjadi numerik menggunakan one hot encoding
df = pd.get_dummies(df, columns=['CLASS'], drop_first=True)

display(df_encoded[0:10])

Unnamed: 0,ID,No_Pation,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS_P,CLASS_Y
0,502,17975,0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,False,False
1,735,34221,1,26,4.5,62,4.9,3.7,1.4,1.1,2.1,0.6,23.0,False,False
2,420,47975,0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,False,False
3,680,87656,0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,False,False
4,504,34223,1,33,7.1,46,4.9,4.9,1.0,0.8,2.0,0.4,21.0,False,False
5,634,34224,0,45,2.3,24,4.0,2.9,1.0,1.0,1.5,0.4,21.0,False,False
6,721,34225,0,50,2.0,50,4.0,3.6,1.3,0.9,2.1,0.6,24.0,False,False
7,421,34227,1,48,4.7,47,4.0,2.9,0.8,0.9,1.6,0.4,24.0,False,False
8,670,34229,1,43,2.6,67,4.0,3.8,0.9,2.4,3.7,1.0,21.0,False,False
9,759,34230,0,32,3.6,28,4.0,3.8,2.0,2.4,3.8,1.0,24.0,False,False


In [504]:
# hapus attribute yang tidak perlu
df = df.drop(['ID', 'No_Pation'], axis=1)
df.head()

Unnamed: 0,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS_P,CLASS_Y
0,0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,False,False
1,1,26,4.5,62,4.9,3.7,1.4,1.1,2.1,0.6,23.0,False,False
2,0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,False,False
3,0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,False,False
4,1,33,7.1,46,4.9,4.9,1.0,0.8,2.0,0.4,21.0,False,False


## Split data

In [505]:
# Memisah attribute dengan label
X = df.drop(['CLASS_P', 'CLASS_Y'], axis=1)  # attribute (fitur)
y = df[['CLASS_P', 'CLASS_Y']]# label

# Pembagian data: 80% untuk pelatihan dan 20% untuk pengujian
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Menampilkan ukuran dataset setelah pembagian
print(f'Size of training set: {X_train.shape[0]}')
print(f'Size of test set: {X_test.shape[0]}')

Size of training set: 800
Size of test set: 200


## Melatih Model

In [506]:
# inisialisasi model RandomForestClassifier
model = RandomForestClassifier()

# Melatih model
model.fit(X_train, y_train)

## Menguji model

In [507]:
# Menguji model
y_pred = model.predict(X_test)

# Menghitung akurasi
accuracy = accuracy_score(y_test, y_pred)
print(f'Akurasi model Random Forest: {accuracy:.2f}\n\n')


# Menampilkan laporan klasifikasi
print(classification_report(y_test, y_pred))



Akurasi model Random Forest: 0.99


              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      0.99      1.00       173

   micro avg       1.00      0.99      1.00       179
   macro avg       1.00      1.00      1.00       179
weighted avg       1.00      0.99      1.00       179
 samples avg       0.89      0.89      0.89       179



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
