# Import Package

In [123]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

# Data prepataion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

#classification model
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# evalutaion
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

# hyper params tuning
from sklearn.model_selection import GridSearchCV

# Data Understanding

In [88]:
# load dataset
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer/breast-cancer.data", header=None,
                 names=["Class","age","menopause","tumor-size","inv-nodes","node-caps","deg-malig","breast","breast-quad","irradiat"])
df.head()

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [89]:
df.sample(5)

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
200,no-recurrence-events,60-69,ge40,25-29,0-2,no,3,right,left_low,no
37,no-recurrence-events,50-59,ge40,15-19,0-2,no,1,right,central,no
223,recurrence-events,60-69,ge40,20-24,0-2,no,3,right,left_low,no
210,recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
222,recurrence-events,60-69,ge40,25-29,0-2,no,3,left,right_low,yes


In [90]:
# Dataset information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Class        286 non-null    object
 1   age          286 non-null    object
 2   menopause    286 non-null    object
 3   tumor-size   286 non-null    object
 4   inv-nodes    286 non-null    object
 5   node-caps    286 non-null    object
 6   deg-malig    286 non-null    int64 
 7   breast       286 non-null    object
 8   breast-quad  286 non-null    object
 9   irradiat     286 non-null    object
dtypes: int64(1), object(9)
memory usage: 22.5+ KB


In [91]:
df.isnull().sum()

Class          0
age            0
menopause      0
tumor-size     0
inv-nodes      0
node-caps      0
deg-malig      0
breast         0
breast-quad    0
irradiat       0
dtype: int64

In [92]:
# Statistik deskriptif
df.describe()

Unnamed: 0,deg-malig
count,286.0
mean,2.048951
std,0.738217
min,1.0
25%,2.0
50%,2.0
75%,3.0
max,3.0


In [93]:
df.describe(include=["O"])

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,breast,breast-quad,irradiat
count,286,286,286,286,286,286,286,286,286
unique,2,6,3,11,7,3,2,6,2
top,no-recurrence-events,50-59,premeno,30-34,0-2,no,left,left_low,no
freq,201,96,150,60,213,222,152,110,218


In [94]:
X = df.iloc[:, 1:]
y = df.iloc[:, 0]

In [95]:
X.head()

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [96]:
y.head()

0    no-recurrence-events
1    no-recurrence-events
2    no-recurrence-events
3    no-recurrence-events
4    no-recurrence-events
Name: Class, dtype: object

# Train test split

In [97]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

# Data preparation

In [98]:
# Missing values
imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
imputer.fit(X_train)

SimpleImputer(strategy='most_frequent')

In [99]:
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

In [100]:
# Encoding  = categorycal -> numerical
# OneHotEncoding methode

encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(X_train)

X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)

Models

In [101]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression()

In [102]:
y_pred_logreg = logreg.predict(X_test)

# Model Evaluation

In [103]:
print(classification_report(y_pred_logreg, y_test))

                      precision    recall  f1-score   support

no-recurrence-events       0.88      0.78      0.83        58
   recurrence-events       0.38      0.57      0.46        14

            accuracy                           0.74        72
           macro avg       0.63      0.67      0.64        72
        weighted avg       0.78      0.74      0.75        72



In [105]:
# Cross validation
cross_val_score(logreg, X_train, y_train, cv=5)

array([0.74418605, 0.74418605, 0.60465116, 0.69767442, 0.78571429])

# Model Selection

In [107]:
svc= SVC()
tree = DecisionTreeClassifier()
forest = RandomForestClassifier()

svc.fit(X_train, y_train)
tree.fit(X_train, y_train)
forest.fit(X_train, y_train)

y_pred_svc = svc.predict(X_test)
y_pred_tree = tree.predict(X_test)
y_pred_forest = forest.predict(X_test)

In [108]:
print(classification_report(y_pred_svc, y_test))

                      precision    recall  f1-score   support

no-recurrence-events       0.98      0.76      0.85        66
   recurrence-events       0.24      0.83      0.37         6

            accuracy                           0.76        72
           macro avg       0.61      0.80      0.61        72
        weighted avg       0.92      0.76      0.81        72



In [109]:
print(classification_report(y_pred_tree, y_test))


                      precision    recall  f1-score   support

no-recurrence-events       0.80      0.79      0.80        52
   recurrence-events       0.48      0.50      0.49        20

            accuracy                           0.71        72
           macro avg       0.64      0.64      0.64        72
        weighted avg       0.71      0.71      0.71        72



In [110]:
print(classification_report(y_pred_forest, y_test))

                      precision    recall  f1-score   support

no-recurrence-events       0.90      0.78      0.84        59
   recurrence-events       0.38      0.62      0.47        13

            accuracy                           0.75        72
           macro avg       0.64      0.70      0.65        72
        weighted avg       0.81      0.75      0.77        72



# Hyper Param Tuning

In [119]:
svc_linear = SVC(kernel='poly')

In [120]:
svc_linear.fit(X_train, y_train)

SVC(kernel='poly')

In [121]:
y_pred_svc_linear = svc_linear.predict(X_test)

In [122]:
print(classification_report(y_pred_svc_linear, y_test))

                      precision    recall  f1-score   support

no-recurrence-events       0.90      0.78      0.84        59
   recurrence-events       0.38      0.62      0.47        13

            accuracy                           0.75        72
           macro avg       0.64      0.70      0.65        72
        weighted avg       0.81      0.75      0.77        72



In [124]:
params = {
    'C': (0.1, 0.5, 1.0, 10.0),
    'kernel': ('linear','poly','sigmoid','rbf')
}

In [125]:
svc_grid = GridSearchCV(svc, params, cv=5)

In [128]:
svc_grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (0.1, 0.5, 1.0, 10.0),
                         'kernel': ('linear', 'poly', 'sigmoid', 'rbf')})

In [130]:
svc_grid.best_score_

0.7619047619047619

In [131]:
svc_grid.best_params_

{'C': 0.5, 'kernel': 'poly'}

In [132]:
df.head()

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [133]:
from pycaret.classification import *

ModuleNotFoundError: No module named 'pycaret'