Nama: Afif Pristantio


NIM : A11.2020.12993

In [1]:
# Mengimpor library yang diperlukan
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Mengimpor dataset dari file CSV
df = pd.read_csv('Breast Cancer Wisconsin (Diagnostic).csv')

# Menampilkan dataset
print(df)

           id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0      842302         M        17.99         10.38          122.80     1001.0   
1      842517         M        20.57         17.77          132.90     1326.0   
2    84300903         M        19.69         21.25          130.00     1203.0   
3    84348301         M        11.42         20.38           77.58      386.1   
4    84358402         M        20.29         14.34          135.10     1297.0   
..        ...       ...          ...           ...             ...        ...   
564    926424         M        21.56         22.39          142.00     1479.0   
565    926682         M        20.13         28.25          131.20     1261.0   
566    926954         M        16.60         28.08          108.30      858.1   
567    927241         M        20.60         29.33          140.10     1265.0   
568     92751         B         7.76         24.54           47.92      181.0   

     smoothness_mean  compa

In [2]:
# Menghapus kolom yang tidak relevan
df.drop(['id'], axis=1, inplace=True)

# Mengubah label kelas menjadi numerik
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})

# Memisahkan fitur dan label
X = df.drop('diagnosis', axis=1)
y = df['diagnosis']

# Menampilkan fitur dan label
print(X)
print(y)

     radius_mean  texture_mean  perimeter_mean  area_mean  smoothness_mean  \
0          17.99         10.38          122.80     1001.0          0.11840   
1          20.57         17.77          132.90     1326.0          0.08474   
2          19.69         21.25          130.00     1203.0          0.10960   
3          11.42         20.38           77.58      386.1          0.14250   
4          20.29         14.34          135.10     1297.0          0.10030   
..           ...           ...             ...        ...              ...   
564        21.56         22.39          142.00     1479.0          0.11100   
565        20.13         28.25          131.20     1261.0          0.09780   
566        16.60         28.08          108.30      858.1          0.08455   
567        20.60         29.33          140.10     1265.0          0.11780   
568         7.76         24.54           47.92      181.0          0.05263   

     compactness_mean  concavity_mean  concave points_mean  sym

In [3]:
# Membagi dataset menjadi data pelatihan dan data pengujian
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Mengisi nilai yang hilang menggunakan imputer
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Melakukan normalisasi menggunakan StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Memilih fitur terbaik menggunakan SelectKBest dan ANOVA F-value
feature_selector = SelectKBest(f_classif, k=10)
X_train = feature_selector.fit_transform(X_train, y_train)
X_test = feature_selector.transform(X_test)

# Menampilkan fitur terbaik
print(X_train)
print(X_test)

[[-1.44075296 -1.36208497 -1.1391179  ... -0.97396758  4.67282796
   0.9320124 ]
 [ 1.97409619  2.09167167  1.85197292 ...  2.1374055   1.92862053
   2.6989469 ]
 [-1.39998202 -1.34520926 -1.10978518 ... -0.99971493 -0.64491059
  -0.97023893]
 ...
 [ 0.04880192 -0.06512547 -0.06142266 ... -0.33927781 -1.12796759
  -1.23903365]
 [-0.03896885 -0.03137406 -0.15477966 ... -0.01355621  0.63253231
   1.05001236]
 [-0.54860557 -0.60350155 -0.5518995  ... -0.58052694 -0.76538799
  -0.61102866]]
[[-0.46649743 -0.44421138 -0.48646498 ... -0.35109337 -0.03583041
  -0.19435087]
 [ 1.36536344  1.30551088  1.34147086 ...  1.7441412  -0.02818105
   0.99177862]
 [ 0.38006578  0.40410139  0.26659607 ...  0.49204412  0.4996247
   0.57035018]
 ...
 [-0.73547237 -0.74138839 -0.69969126 ... -0.67928389 -0.45033003
  -0.27741059]
 [ 0.02898271  0.0274851  -0.0859606  ... -0.19925456  0.15205697
  -0.49027026]
 [ 1.87216885  1.80354992  2.15658189 ...  4.50580885  0.33277307
   0.7925579 ]]


In [4]:
# Naive Bayes
naive_bayes = GaussianNB()
naive_bayes.fit(X_train, y_train)

# Melakukan prediksi pada data pengujian dengan Naive Bayes
y_pred_nb = naive_bayes.predict(X_test)

# Evaluasi performa model Naive Bayes
accuracy_nb = accuracy_score(y_test, y_pred_nb)
precision_nb = precision_score(y_test, y_pred_nb)
recall_nb = recall_score(y_test, y_pred_nb)
f1_nb = f1_score(y_test, y_pred_nb)
roc_auc_nb = roc_auc_score(y_test, y_pred_nb)

# Menampilkan metrik evaluasi Naive Bayes
print("Naive Bayes - Accuracy:", accuracy_nb)
print("Naive Bayes - Precision:", precision_nb)
print("Naive Bayes - Recall:", recall_nb)
print("Naive Bayes - F1 Score:", f1_nb)
print("Naive Bayes - ROC AUC Score:", roc_auc_nb)

Naive Bayes - Accuracy: 0.9736842105263158
Naive Bayes - Precision: 1.0
Naive Bayes - Recall: 0.9302325581395349
Naive Bayes - F1 Score: 0.963855421686747
Naive Bayes - ROC AUC Score: 0.9651162790697674


In [5]:
# KNN
knn = KNeighborsClassifier()
# Melakukan tuning hyperparameter menggunakan GridSearchCV untuk KNN
knn_params = {'n_neighbors': [3, 5, 7], 'p': [1, 2]}
grid_search_knn = GridSearchCV(knn, knn_params)
grid_search_knn.fit(X_train, y_train)

# Melakukan prediksi pada data pengujian dengan KNN yang sudah dituning
y_pred_knn = grid_search_knn.predict(X_test)

# Evaluasi performa model KNN yang sudah dituning
accuracy_knn = accuracy_score(y_test, y_pred_knn)
precision_knn = precision_score(y_test, y_pred_knn)
recall_knn = recall_score(y_test, y_pred_knn)
f1_knn = f1_score(y_test, y_pred_knn)
roc_auc_knn = roc_auc_score(y_test, y_pred_knn)

# Menampilkan metrik evaluasi KNN yang sudah dituning
print("KNN - Accuracy:", accuracy_knn)
print("KNN - Precision:", precision_knn)
print("KNN - Recall:", recall_knn)
print("KNN - F1 Score:", f1_knn)
print("KNN - ROC AUC Score:", roc_auc_knn)

KNN - Accuracy: 0.9736842105263158
KNN - Precision: 0.9545454545454546
KNN - Recall: 0.9767441860465116
KNN - F1 Score: 0.9655172413793104
KNN - ROC AUC Score: 0.9742875859810023


In [6]:
# Decision Tree
dt = DecisionTreeClassifier()
# Melakukan tuning hyperparameter menggunakan GridSearchCV untuk Decision Tree
dt_params = {'max_depth': [None, 5, 10, 15]}
grid_search_dt = GridSearchCV(dt, dt_params)
grid_search_dt.fit(X_train, y_train)

# Melakukan prediksi pada data pengujian dengan Decision Tree yang sudah dituning
y_pred_dt = grid_search_dt.predict(X_test)

# Evaluasi performa model Decision Tree yang sudah dituning
accuracy_dt = accuracy_score(y_test, y_pred_dt)
precision_dt = precision_score(y_test, y_pred_dt)
recall_dt = recall_score(y_test, y_pred_dt)
f1_dt = f1_score(y_test, y_pred_dt)
roc_auc_dt = roc_auc_score(y_test, y_pred_dt)

# Menampilkan metrik evaluasi Decision Tree yang sudah dituning
print("Decision Tree - Accuracy:", accuracy_dt)
print("Decision Tree - Precision:", precision_dt)
print("Decision Tree - Recall:", recall_dt)
print("Decision Tree - F1 Score:", f1_dt)
print("Decision Tree - ROC AUC Score:", roc_auc_dt)

Decision Tree - Accuracy: 0.9210526315789473
Decision Tree - Precision: 0.9473684210526315
Decision Tree - Recall: 0.8372093023255814
Decision Tree - F1 Score: 0.8888888888888888
Decision Tree - ROC AUC Score: 0.9045201441205373


In [7]:
# Menampilkan hasil prediksi klasifikasi untuk setiap data dari ketiga algoritma yang sudah dituning
prediction_results = pd.DataFrame({'Actual': y_test, 'Naive Bayes': y_pred_nb, 'KNN': y_pred_knn, 'Decision Tree': y_pred_dt})
print(prediction_results)

     Actual  Naive Bayes  KNN  Decision Tree
204       0            0    0              0
70        1            1    1              1
131       1            1    1              1
431       0            0    0              0
540       0            0    0              0
..      ...          ...  ...            ...
486       0            0    0              0
75        1            1    1              1
249       0            0    0              0
238       0            0    0              0
265       1            1    1              1

[114 rows x 4 columns]


In [8]:
# Menghitung jumlah hasil yang benar (akurat) dari ketiga algoritma
accurate_count_nb = (y_test == y_pred_nb).sum()
accurate_count_knn = (y_test == y_pred_knn).sum()
accurate_count_dt = (y_test == y_pred_dt).sum()

# Menampilkan jumlah hasil yang akurat dari ketiga algoritma
print("Jumlah Prediksi yang Akurat - Naive Bayes  :", accurate_count_nb)
print("Jumlah Prediksi yang Akurat - KNN          :", accurate_count_knn)
print("Jumlah Prediksi yang Akurat - Decision Tree:", accurate_count_dt)

# Menentukan algoritma terbaik berdasarkan jumlah hasil yang akurat
best_algorithm = ""
if accurate_count_nb >= accurate_count_knn and accurate_count_nb >= accurate_count_dt:
    best_algorithm = "Naive Bayes"
elif accurate_count_knn >= accurate_count_dt:
    best_algorithm = "KNN"
else:
    best_algorithm = "Decision Tree"

print("Algoritma terbaik adalah", best_algorithm)

Jumlah Prediksi yang Akurat - Naive Bayes  : 111
Jumlah Prediksi yang Akurat - KNN          : 111
Jumlah Prediksi yang Akurat - Decision Tree: 105
Algoritma terbaik adalah Naive Bayes
