## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split  
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## Importing The Dataset

In [101]:
url = "datasets/breast-cancer-data.csv"
data = pd.read_csv(url)

## Pre-processing Dataset

In [102]:
data.head()

Unnamed: 0,age,menopause,tumer-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiate,class
0,40-49',premeno',15-19',0-2',yes',3',right',left_up',no',recurrence-events'
1,50-59',ge40',15-19',0-2',no',1',right',central',no',no-recurrence-events'
2,50-59',ge40',35-39',0-2',no',2',left',left_low',no',recurrence-events'
3,40-49',premeno',35-39',0-2',yes',3',right',left_low',yes',no-recurrence-events'
4,40-49',premeno',30-34',3-5',yes',2',left',right_up',no',recurrence-events'


In [88]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   age          286 non-null    object
 1   menopause    286 non-null    object
 2   tumer-size   286 non-null    object
 3   inv-nodes    286 non-null    object
 4   node-caps    278 non-null    object
 5   deg-malig    286 non-null    object
 6   breast       286 non-null    object
 7   breast-quad  285 non-null    object
 8   irradiate    286 non-null    object
 9   class        286 non-null    object
dtypes: object(10)
memory usage: 22.5+ KB


In [104]:
print(data.isna().sum())

age            0
menopause      0
tumer-size     0
inv-nodes      0
node-caps      8
deg-malig      0
breast         0
breast-quad    1
irradiate      0
class          0
dtype: int64


In [103]:
print("Duplicae Data: ", data.duplicated().sum())

Duplicae Data:  14


In [105]:
data.drop_duplicates(inplace=True)

In [106]:
data = data.dropna()

In [113]:
print(data.isna().sum())

age                       0
tumer-size                0
inv-nodes                 0
node-caps                 0
deg-malig                 0
breast                    0
irradiate                 0
class                     0
menopause_ge40'           0
menopause_lt40'           0
menopause_premeno'        0
breast-quad_central'      0
breast-quad_left_low'     0
breast-quad_left_up'      0
breast-quad_right_low'    0
breast-quad_right_up'     0
dtype: int64


In [108]:
print("Duplicae Data: ", data.duplicated().sum())

Duplicae Data:  0


In [None]:
# df['Marital Status'] = df['Marital Status'].replace(['Married','Divorced', 'Single ', 'Widowed', 'Separated'], [1,0,2,3,4])

In [109]:
columns_to_clean = ['age', 'tumer-size', 'inv-nodes', 'deg-malig', 'node-caps', 'breast', 'irradiate', 'class']

for column in columns_to_clean:
    data[column] = data[column].str.replace("'", "")


In [110]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_features = ['age', 'tumer-size', 'inv-nodes']
ordinal_encoder = OrdinalEncoder()
data[ordinal_features] = ordinal_encoder.fit_transform(data[ordinal_features])

In [111]:
data = pd.get_dummies(data, columns=['menopause','breast-quad'])

In [112]:
data['node-caps'] = data['node-caps'].map({'yes': 1, 'no': 0})
data['breast'] = data['breast'].map({'right': 1, 'left': 0})
data['irradiate'] = data['irradiate'].map({'yes': 1, 'no': 0})
data['class'] = data['class'].map({'recurrence-events': 1, 'no-recurrence-events': 0})


In [114]:
data = data.astype(int)

In [115]:
data.head()

Unnamed: 0,age,tumer-size,inv-nodes,node-caps,deg-malig,breast,irradiate,class,menopause_ge40',menopause_lt40',menopause_premeno',breast-quad_central',breast-quad_left_low',breast-quad_left_up',breast-quad_right_low',breast-quad_right_up'
0,2,2,0,1,3,1,0,1,0,0,1,0,0,1,0,0
1,3,2,0,0,1,1,0,0,1,0,0,1,0,0,0,0
2,3,6,0,0,2,0,0,1,1,0,0,0,1,0,0,0
3,2,6,0,1,3,1,1,0,0,0,1,0,1,0,0,0
4,2,5,4,1,2,0,0,1,0,0,1,0,0,0,0,1


In [116]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 263 entries, 0 to 285
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   age                     263 non-null    int32
 1   tumer-size              263 non-null    int32
 2   inv-nodes               263 non-null    int32
 3   node-caps               263 non-null    int32
 4   deg-malig               263 non-null    int32
 5   breast                  263 non-null    int32
 6   irradiate               263 non-null    int32
 7   class                   263 non-null    int32
 8   menopause_ge40'         263 non-null    int32
 9   menopause_lt40'         263 non-null    int32
 10  menopause_premeno'      263 non-null    int32
 11  breast-quad_central'    263 non-null    int32
 12  breast-quad_left_low'   263 non-null    int32
 13  breast-quad_left_up'    263 non-null    int32
 14  breast-quad_right_low'  263 non-null    int32
 15  breast-quad_right_up'   263 

## Splitting The Dataset (Base)

In [140]:
x = data.drop(columns=['class'], axis=1).values
y = data['class'].values 

In [142]:
print("Type of x:", type(x))
print("Shape of x:", x.shape)
print("Data type of x elements:", x.dtype)
print("First 5 elements of x:\n", x[:5])

Type of x: <class 'numpy.ndarray'>
Shape of x: (263, 15)
Data type of x elements: int32
First 5 elements of x:
 [[2 2 0 1 3 1 0 0 0 1 0 0 1 0 0]
 [3 2 0 0 1 1 0 1 0 0 1 0 0 0 0]
 [3 6 0 0 2 0 0 1 0 0 0 1 0 0 0]
 [2 6 0 1 3 1 1 0 0 1 0 1 0 0 0]
 [2 5 4 1 2 0 0 0 0 1 0 0 0 0 1]]


In [143]:
print("Type of y:", type(y))
print("Shape of y:", y.shape)
print("Data type of y elements:", y.dtype)
print("First 5 elements of y:\n", y[:5])

Type of y: <class 'numpy.ndarray'>
Shape of y: (263,)
Data type of y elements: int32
First 5 elements of y:
 [1 0 1 0 1]


In [144]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [145]:
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (210, 15)
X_test shape: (53, 15)
y_train shape: (210,)
y_test shape: (53,)


In [146]:
print("\nX_train sample:")
print(X_train[:1])
print("\nX_test sample:")
print(X_test[:1])
print("\ny_train sample:")
print(y_train[:1])
print("\ny_test sample:")
print(y_test[:1])


X_train sample:
[[2 4 0 0 2 1 0 0 0 1 0 0 1 0 0]]

X_test sample:
[[4 5 0 0 2 0 0 1 0 0 0 0 1 0 0]]

y_train sample:
[0]

y_test sample:
[0]


## Train XGBoost Model On Base Dataset

In [147]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

param_grid_xgb = {
    'n_estimators': [10, 50, 100, 200], 
    'max_depth': [3, 5, 7], 
    'learning_rate': [0.01, 0.1, 0.2], 
    'subsample': [0.6, 0.8, 1.0]
}

grid_search_xgb = GridSearchCV(xgb.XGBClassifier(eval_metric='logloss'), param_grid_xgb, cv=5)
grid_search_xgb.fit(X_train, y_train)
best_params_xgb = grid_search_xgb.best_params_
print(f'Best XGB parameters: {best_params_xgb}')

Best XGB parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.6}


In [148]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

xgb_model = xgb.XGBClassifier(
    n_estimators=50,
    max_depth=3,
    learning_rate=0.1,
    subsample=0.6,
    eval_metric='logloss'
)

xgb_model.fit(X_train, y_train)

## Base Model Evaluation (XGBoost)

In [149]:
y_pred = xgb_model.predict(X_test) 
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print('XGBoost Model Performance On Original Dataset')
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-score: {f1:.2f}')

Accuracy: 0.70
Precision: 0.36
Recall: 0.42
F1-score: 0.38


## Scaling The Dataset

In [150]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
features = data.iloc[:, 2:].values
features_scaled = scaler.fit_transform(features)

In [165]:
print("\nfeatures_scaled sample:")
print(features_scaled[:1])
print("\ny sample:")
print(y[:1])


features_scaled sample:
[[-0.5544002   1.94468974  1.28245888  1.03482662 -0.54952725  1.55421505
  -0.88832181 -0.13921151  0.92309949 -0.28688766 -0.77695466  1.39823324
  -0.30956959 -0.37219368]]

y sample:
[1]


## PCA

In [154]:
print("Features scaled shape:", features_scaled.shape)
print("Target shape:", y.shape)
print("Any NaN in features scaled:", np.isnan(features_scaled).any())
print("Any NaN in target:", np.isnan(y).any())

Features scaled shape: (263, 14)
Target shape: (263,)
Any NaN in features scaled: False
Any NaN in target: False


In [166]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# Gunakan parameter XGBoost terbaik yang ditemukan
xgb_base_model = xgb.XGBClassifier(**best_params_xgb, eval_metric='logloss')

# Definisikan pipeline dengan PCA dan XGBoost
pipeline_pca = Pipeline([
    ('pca', PCA()),
    ('xgb', xgb_base_model)
])

# Definisikan parameter grid untuk PCA
param_grid_pca = {
    'pca__n_components': [2, 4, 6, 8, 10, 12]
}

# Lakukan Grid Search dengan pipeline
grid_search_pca = GridSearchCV(pipeline_pca, param_grid_pca, cv=5)
grid_search_pca.fit(features_scaled, y)

# Dapatkan parameter terbaik
best_params_pca = grid_search_pca.best_params_
print(f'Best PCA parameters: {best_params_pca}')

Best PCA parameters: {'pca__n_components': 12}


In [167]:
from sklearn.decomposition import PCA

# Inisialisasi dan transformasi dengan PCA
pca_model = PCA(n_components=12)
features_reduced_pca = pca_model.fit_transform(features_scaled)

# Memeriksa bentuk hasil transformasi
print(f'Original shape: {features.shape}')
print(f'PCA reduced shape: {features_reduced_pca.shape}')

Original shape: (263, 14)
PCA reduced shape: (263, 12)


## Splitting The Dataset (After Feature Reduced With PCA)

In [168]:
X_pca_train, X_pca_test, y_pca_train, y_pca_test = train_test_split(features_reduced_pca, y, test_size=0.2, random_state=42)

In [169]:
print("X_pca_train shape:", X_pca_train.shape)
print("X_pca_test shape:", X_pca_test.shape)
print("y_pca_train shape:", y_pca_train.shape)
print("y_pca_test shape:", y_pca_test.shape)

X_pca_train shape: (210, 12)
X_pca_test shape: (53, 12)
y_pca_train shape: (210,)
y_pca_test shape: (53,)


In [170]:
print("\nX_pca_train sample:")
print(X_pca_train[:1])
print("\nX_pca_test sample:")
print(X_pca_test[:1])
print("\ny_pca_train sample:")
print(y_pca_train[:1])
print("\ny_pca_test sample:")
print(y_pca_test[:1])


X_pca_train sample:
[[-0.68019712 -1.2369195  -1.83656196 -0.96940049 -0.25729639  0.30005558
  -0.2136405   0.06955802 -0.3469221   0.24330157  0.45754579 -0.05024788]]

X_pca_test sample:
[[-1.36228081  1.54068221 -0.74555062 -1.01010504  0.24019156 -0.23964818
  -0.04326973  0.46725204 -0.61603149 -1.01299528 -0.10662146  0.08937166]]

y_pca_train sample:
[0]

y_pca_test sample:
[0]


## Train The Model (After Feature Reduced With PCA) 

In [171]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

xgb_model_pca = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=3,
    learning_rate=0.1,
    subsample=0.6,
    eval_metric='logloss'
)

xgb_model_pca.fit(X_pca_train, y_pca_train)

In [172]:
y_pca_pred = xgb_model_pca.predict(X_pca_test) 
accuracy = accuracy_score(y_pca_test, y_pca_pred)
precision = precision_score(y_pca_test, y_pca_pred)
recall = recall_score(y_pca_test, y_pca_pred)
f1 = f1_score(y_pca_test, y_pca_pred)

print('XGBoost Model Performance On PCA Reduced Dataset')
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-score: {f1:.2f}')

XGBoost Model Performance On PCA Reduced Dataset
Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1-score: 1.00


## UMAP

In [173]:
import umap.umap_ as umap
from sklearn.pipeline import Pipeline

# Gunakan parameter XGBoost terbaik yang ditemukan
xgb_base_model = xgb.XGBClassifier(**best_params_xgb, eval_metric='logloss')

# Definisikan pipeline dengan UMAP dan XGBoost
pipeline_umap = Pipeline([
    ('umap', umap.UMAP()),
    ('xgb', xgb_base_model)
])

# Definisikan parameter grid untuk UMAP dan XGBoost
param_grid_umap = {
    'umap__n_neighbors': [5, 10, 15, 20],
    'umap__n_components': [2, 5, 12],
    'umap__min_dist': [0.1, 0.3, 0.5],
    'umap__metric': ['euclidean', 'manhattan'],
    'umap__n_epochs': [200, 400],
    'umap__init': ['random']  
}

# Lakukan Grid Search dengan pipeline
grid_search_umap = GridSearchCV(pipeline_umap, param_grid_umap, cv=5)
grid_search_umap.fit(features_scaled, y)

# Dapatkan parameter terbaik
best_params_umap = grid_search_umap.best_params_
print(f'Best UMAP parameters: {best_params_umap}')

Best UMAP parameters: {'umap__metric': 'manhattan', 'umap__min_dist': 0.0, 'umap__n_components': 5, 'umap__n_neighbors': 20}


In [174]:
import umap.umap_ as umap

# Inisialisasi dan transformasi dengan UMAP
umap_model= umap.UMAP(
    n_neighbors=20,
    n_components=5,
    min_dist=0.0,
    metric='manhattan'
)
features_reduced_umap = umap_model.fit_transform(features_scaled)

# Memeriksa bentuk hasil transformasi
print(f'Original shape: {features.shape}')
print(f'UMAP reduced shape: {features_reduced_umap.shape}')

Original shape: (263, 14)
UMAP reduced shape: (263, 5)


## Splitting The Dataset (After Feature Reduced With UMAP)

In [175]:
X_umap_train, X_umap_test, y_umap_train, y_umap_test = train_test_split(features_reduced_umap, y, test_size=0.2, random_state=42)

In [176]:
print("X_umap_train shape:", X_umap_train.shape)
print("X_umap_test shape:", X_umap_test.shape)
print("y_umap_train shape:", y_umap_train.shape)
print("y_umap_test shape:", y_umap_test.shape)

X_umap_train shape: (210, 5)
X_umap_test shape: (53, 5)
y_umap_train shape: (210,)
y_umap_test shape: (53,)


In [177]:
print("\nX_umap_train sample:")
print(X_umap_train[:1])
print("\nX_umap_test sample:")
print(X_umap_test[:1])
print("\ny_umap_train sample:")
print(y_umap_train[:1])
print("\ny_umap_test sample:")
print(y_umap_test[:1])


X_umap_train sample:
[[6.729919  2.9190426 3.4825985 7.7910295 3.3535068]]

X_umap_test sample:
[[9.086993  3.352323  9.2975855 3.750746  6.5453606]]

y_umap_train sample:
[0]

y_umap_test sample:
[0]


## Train The Model (After Feature Reduced With UMAP) 

In [178]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

xgb_model_umap = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=3,
    learning_rate=0.1,
    subsample=0.6,
    eval_metric='logloss'
)

xgb_model_umap.fit(X_umap_train, y_umap_train)

In [179]:
y_umap_pred = xgb_model_umap.predict(X_umap_test) 
accuracy = accuracy_score(y_umap_test, y_umap_pred)
precision = precision_score(y_umap_test, y_umap_pred)
recall = recall_score(y_umap_test, y_umap_pred)
f1 = f1_score(y_umap_test, y_umap_pred)

print('XGBoost Model Performance On UMAP Reduced Dataset')
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-score: {f1:.2f}')

XGBoost Model Performance On UMAP Reduced Dataset
Accuracy: 0.85
Precision: 0.62
Recall: 0.83
F1-score: 0.71
