<a href="https://colab.research.google.com/github/ChaisarAbi/MLSQL/blob/Machine-Learning/Breast_Cancer_SVM_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<target directory>:<URL-encoded download URL>" entries
# (this is how the download loop below splits it). The embedded URL is a signed
# link carrying X-Goog-Date=20240901T032647Z and X-Goog-Expires=259200, so it is
# presumably only valid for a limited time after that date.
DATA_SOURCE_MAPPING = 'breast-cancer-wisconsin-data:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F180%2F408%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240901%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240901T032647Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Db9f2dfecec7cfe1c6e93cf8faaf5ad41836243b4ff63c855fc7211beb409cda0393fba81bf64e9138f8c74402756f21fc90eb9734e1994ee1db2426f87e336ffe18bdec405c7a6057a1192661472d675ff0724e04501f7506c9edbd3caac175d9075f8089f579b040e30bbcc353332e46a41d0b6f742e4ab24875ce8815da6dfa7ca700fec733af48c61dea734e058d265835e857bae0c1235f776a2a88a55ffd8075aa0b27c5c3300e0406148f821d61ef6a7ed0e18cad7864e83cc96bb4796ede659989aaa28b335615a161c0842435f8e2d00198bd7aee3dc321f4a57a22f0d175c0d1d8eddcdb3304b51da7be8e2a921b15d85d5b4dd7d192a0a7ac9fbfa'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING
# and unpack the archive into KAGGLE_INPUT_PATH/<directory>.
# A failed download is reported and skipped; the loop moves on to the next entry.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a stray unencoded ':' in the URL part
    # cannot break the two-value unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header lookup yields None when the server sends no
            # Content-Length; int(None) would raise a TypeError that neither
            # handler below catches, so fall back to "size unknown".
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while data:
                dl += len(data)
                tfile.write(data)
                # Progress bar is 50 characters wide; stays empty when the
                # total size is unknown and never overflows its width.
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by
            # name and would otherwise see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` here
                # would replace the module and break the next iteration.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is handled first because it is a subclass of OSError.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


Downloading breast-cancer-wisconsin-data, 49796 bytes compressed
Downloaded and uncompressed: breast-cancer-wisconsin-data
Data source import complete.


In [2]:
#import library
import pandas as pd
import numpy as np

#library visualisasi
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Data splitting
from sklearn.model_selection import train_test_split

# data modeling
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# performa data
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report
from sklearn import metrics

# Warnings: install an 'ignore' filter with no category restriction, so
# warnings raised by later cells are not shown.
import warnings
warnings.simplefilter(action='ignore')

In [3]:
# Print the full path of every file found below /kaggle/input.
import os
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

/kaggle/input/breast-cancer-wisconsin-data/data.csv


In [None]:
# Load the dataset CSV into `df` and show its first rows.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/breast-cancer-wisconsin-data/data.csv')
df.head()

In [None]:
# DataFrame summary as printed by DataFrame.info() for the freshly loaded data.
df.info()

In [None]:
# Remove the 'id' and 'Unnamed: 32' columns from `df` in place.
df.drop(columns=['id', 'Unnamed: 32'], inplace=True)

In [None]:
# Check the data type of every column
print(df.dtypes)


In [None]:
# Encode the diagnosis column numerically: B -> 0, M -> 1.
# Any value other than 'B' or 'M' becomes NaN under Series.map.
df['diagnosis'] = df.diagnosis.map({'B': 0, 'M': 1})

# Show the first five encoded values.
print(df['diagnosis'].head(n=5))


In [None]:
# Re-inspect the DataFrame summary after the preceding column changes.
df.info()

In [None]:
# Per-column quartiles and interquartile range.
Q1, Q3 = df.quantile(0.25), df.quantile(0.75)
IQR = Q3 - Q1

# A value is an outlier when it falls outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
# of its own column.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outliers = (df < lower_fence) | (df > upper_fence)

# Rows that contain at least one outlying value.
has_outlier = outliers.any(axis=1)
outlier_data = df[has_outlier]
outlier_data

In [None]:
# Horizontal boxplot of every column of `df`, used to spot outliers visually.
fig, ax = plt.subplots(figsize=(15, 10))
sns.boxplot(data=df, orient='h', palette="Set2", ax=ax)
ax.set_title('Boxplot Deteksi Outlier')
plt.show()

In [None]:
# Work on a copy so `df` itself keeps its original values.
df_outliers_replaced = df.copy()

# Flag values outside the IQR fences (reusing Q1, Q3 and IQR computed earlier)
# and replace each flagged value with the median of its column.
is_outlier = (
    (df_outliers_replaced < (Q1 - 1.5 * IQR))
    | (df_outliers_replaced > (Q3 + 1.5 * IQR))
)
column_medians = df_outliers_replaced.median()
df_outliers_replaced = df_outliers_replaced.mask(is_outlier, other=column_medians, axis=1)


In [None]:
# Boxplot of the outlier-replaced data; the figure is also written to a PNG file.
fig, ax = plt.subplots(figsize=(15, 10))
sns.boxplot(data=df_outliers_replaced, orient='h', palette="Set2", ax=ax)
ax.set_title('Boxplot Setelah Penanganan Outlier')
fig.savefig('boxplot_setelah_penanganan_outlier.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Boxplot of the original (unmodified) `df`, for comparison.
fig, ax = plt.subplots(figsize=(15, 10))
sns.boxplot(data=df, orient='h', palette="Set2", ax=ax)
ax.set_title('Boxplot untuk Deteksi Outlier')
plt.show()

In [None]:
# Summary of the rows that contain at least one IQR outlier.
outlier_data.info()

In [None]:
# Count the rows for each value of the diagnosis column
df['diagnosis'].value_counts()

In [None]:
# Correlation of every column with `diagnosis`, strongest positive first.
# The series includes `diagnosis` itself.
corr_matrix = df.corr()
correlation = corr_matrix['diagnosis'].sort_values(ascending=False)

# Display the correlations
print(correlation)

In [None]:
# Bar chart of the per-feature correlation with diagnosis.
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x=correlation.index, y=correlation.values, ax=ax)

# Title and axis labels; feature names are rotated so they stay readable.
ax.set_title('Korelasi Fitur dengan Diagnosis')
ax.set_xlabel('Fitur')
ax.set_ylabel('Nilai Korelasi')
plt.xticks(rotation=90)
plt.show()


In [None]:

# Compute descriptive statistics for `df`.
descriptive_stats = df.describe()

# Save them to an Excel workbook in the current directory.
descriptive_stats.to_excel(excel_writer='descriptive_statistics.xlsx')


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Box plot of `concave points_worst` for each diagnosis class.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='diagnosis', y='concave points_worst', data=df, ax=ax)
ax.set_title('Distribusi Concave Points Worst terhadap Diagnosis')
ax.set_xlabel('Diagnosis (0=B, 1=M)')
ax.set_ylabel('Concave Points Worst')
plt.show()


In [None]:
import pandas as pd

# Sum `perimeter_worst` per diagnosis class.
# NOTE(review): the variable name mentions concave points, but the column
# actually aggregated here is `perimeter_worst` (matching the chart title).
concave_points_by_diagnosis = df.groupby('diagnosis')['perimeter_worst'].sum()

# Draw the pie chart; each slice is one diagnosis class's share of the summed values
plt.figure(figsize=(8, 8))
concave_points_by_diagnosis.plot(kind='pie', autopct='%1.1f%%')
plt.title('Pie Chart perimeter_worst vs Diagnosis')
plt.ylabel('')  # Remove the y-axis label

plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Same `concave points_worst` vs diagnosis boxplot, with a different title
# and x-axis label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='diagnosis', y='concave points_worst', data=df, ax=ax)
ax.set_title('Boxplot Concave Points Worst vs Diagnosis')
ax.set_xlabel('Diagnosis (B=0, M=1)')
ax.set_ylabel('Concave Points Worst')
plt.show()


In [None]:
import seaborn as sns

# 2x2 correlation matrix between `concave points_worst` and `diagnosis`.
# This rebinds `correlation`, replacing the series computed in an earlier cell.
selected = df[['concave points_worst', 'diagnosis']]
correlation = selected.corr()

# Annotated heatmap with the colour scale fixed to [-1, 1].
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Heatmap Korelasi Concave Points Worst vs Diagnosis')
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Columns to analyse
columns = [
    'perimeter_worst',
    'concave points_mean',
    'radius_worst',
    'perimeter_mean',
    'area_worst',
    'radius_mean',
    'area_mean',
    'concavity_mean',
    'concavity_worst',
]

# One figure per column: distribution of that column for each diagnosis class.
for col in columns:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.boxplot(x='diagnosis', y=col, data=df, ax=ax)
    ax.set_title(f'Distribusi {col} terhadap Diagnosis')
    ax.set_xlabel('Diagnosis (0=Benign, 1=Malignant)')
    ax.set_ylabel(col)
    plt.show()


In [None]:
# Final look at the DataFrame summary before it is split into features and target.
df.info()

In [None]:
from sklearn.model_selection import train_test_split

# Target is the encoded `diagnosis` column; every other column of `df` is a feature.
y = df['diagnosis']
X = df.drop('diagnosis', axis=1)

# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Report the number of rows in each part.
print(f"Ukuran Data Latih: {X_train.shape[0]}")
print(f"Ukuran Data Uji: {X_test.shape[0]}")


In [None]:
# Pipeline and StandardScaler are only imported in a later cell of this
# notebook; import them here so the cell also works when cells run in order.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Pipeline that standardises the features and then trains an SVM.
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Feature scaling
    ('svm', SVC(kernel='linear', C=1, random_state=42))  # SVM with a linear kernel
])


In [None]:
# Train the pipeline on the training split.
pipeline.fit(X_train, y_train)

# Predict on the same training data (internal sanity check, not a
# generalisation estimate).
y_train_pred = pipeline.predict(X_train)

# Training accuracy, rounded to two decimals in the output.
accuracy_train = accuracy_score(y_true=y_train, y_pred=y_train_pred)
print(f'Akurasi Model pada Data Latih: {accuracy_train:.2f}')

# Classification report for the training predictions.
train_report = classification_report(y_train, y_train_pred)
print(train_report)


In [None]:
# ConfusionMatrixDisplay is only imported in a later cell of this notebook;
# import it here so the cell also works when cells run in order.
from sklearn.metrics import ConfusionMatrixDisplay

# Compute and display the confusion matrix for the training predictions.
cm = confusion_matrix(y_train, y_train_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=pipeline.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix pada Data Latih')
plt.savefig('datalatih.png')  # Saved before show() so the written file is not blank
plt.show()

# Accuracy (three decimals) and classification report on the training data.
accuracy_train = accuracy_score(y_train, y_train_pred)
print(f'Akurasi Model pada Data Latih: {accuracy_train:.3f}')
print(classification_report(y_train, y_train_pred))


In [None]:
# ConfusionMatrixDisplay is only imported in a later cell of this notebook;
# import it here so this cell does not depend on that import.
from sklearn.metrics import ConfusionMatrixDisplay

# Predict the test split with the best estimator found by the grid search.
# NOTE: `grid_search` is created and fitted in a later cell of this notebook;
# that cell must have been executed before this one, otherwise this line
# raises NameError.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Compute and display the confusion matrix for the test predictions.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_model.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix SVM Terbaik')
plt.show()


In [None]:
# Linear-kernel SVM trained directly on the training split, with no scaling
# step in front of it. fit() returns the estimator itself, so construction
# and training are chained.
model_svm = SVC(kernel='linear', random_state=42).fit(X_train, y_train)

# Predict the held-out test data.
y_pred = model_svm.predict(X_test)

# Test accuracy, rounded to two decimals in the output.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Akurasi Model SVM: {accuracy:.2f}')

# Classification report on the test data.
test_report = classification_report(y_test, y_pred)
print(test_report)


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

# Scaling + SVM pipeline used for hyper-parameter tuning. This rebinds
# `pipeline`; the SVC starts from its defaults because the grid search
# supplies the hyper-parameters.
scaling_step = ('scaler', StandardScaler())
svm_step = ('svm', SVC())
pipeline = Pipeline([scaling_step, svm_step])

# Search space. The 'svm__' prefix routes each parameter to the 'svm' step.
param_grid = dict(
    svm__C=[0.1, 1, 10, 100],                               # Regularization parameter
    svm__gamma=['scale', 'auto', 0.001, 0.01, 0.1, 1],      # Kernel coefficient
    svm__kernel=['rbf', 'linear'],                          # Kernel type
)


In [None]:
# Exhaustive search over `param_grid` with 5-fold cross-validation, scored by
# accuracy; n_jobs=-1 runs the fits in parallel.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination and its mean cross-validation score.
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.2f}")


In [None]:
# Predict the test split with the best estimator from the grid search.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Test accuracy (three decimals) followed by the classification report.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Akurasi Model Terbaik pada Data Uji: {accuracy:.3f}')
best_report = classification_report(y_test, y_pred)
print(best_report)


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Confusion matrix of the test labels against the current `y_pred`.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Plot it with the class labels of the best model and write the figure to disk.
disp = ConfusionMatrixDisplay(cm, display_labels=best_model.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix SVM')
plt.savefig('Matrix.png')  # Saved before show() so the written file is not blank
plt.show()
