In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [23]:
header_list = ['Class', 'age', 'menopause', 'tumor-size', 'inv-nodes', 
               'node-caps', 'deg-malig', 'breast', 'breast-quad', 'irradiat']

df = pd.read_csv('breast-cancer.csv',  names=header_list)
df = df.sample(frac=1)

In [59]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier

from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score

In [34]:
from sklearn.metrics import accuracy_score, roc_auc_score

In [27]:
categorical_features = ['age', 'menopause', 'tumor-size', 'inv-nodes', 
               'node-caps', 'breast', 'breast-quad', 'irradiat']
numeric_features = ['deg-malig']

In [30]:
X, y = df[['age', 'menopause', 'tumor-size', 'inv-nodes', 
         'node-caps', 'deg-malig', 'breast', 'breast-quad', 'irradiat']], df['Class']

y = LabelEncoder().fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [31]:
X_train.shape

(214, 9)

In [32]:
X_test.shape

(72, 9)

In [84]:
dt = Pipeline(steps=[ ('prepocessor', ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)), ('classification', DecisionTreeClassifier())])

# dt.fit(X_train, y_train)

# print("Accuracy",accuracy_score(y_test, dt.predict(X_test)))
# print("AUC", roc_auc_score(y_test, dt.predict(X_test)))

np.mean(cross_val_score(dt, X, y, cv=5, scoring='accuracy'))

0.653901996370236

In [83]:
np.mean(cross_val_score(dt, X, y, cv=5, scoring='roc_auc'))

0.5724103299856529

In [57]:
rf = Pipeline(steps=[ ('prepocessor', ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)), ('classification', RandomForestClassifier())])

rf.fit(X_train, y_train)

print("Accuracy",accuracy_score(y_test, rf.predict(X_test)))
print("AUC", roc_auc_score(y_test, rf.predict(X_test)))

Accuracy 0.75
AUC 0.6134453781512604


In [87]:
print("Accuracy", np.mean(cross_val_score(rf, X, y, cv=5, scoring='accuracy')))
print("AUC", np.mean(cross_val_score(rf, X, y, cv=5, scoring='roc_auc')))

Accuracy 0.7450090744101634
AUC 0.6521090387374462


In [52]:
knn = Pipeline(steps=[ ('prepocessor', ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)), ('classification', KNeighborsClassifier())])

knn.fit(X_train, y_train)

print("Accuracy",accuracy_score(y_test, knn.predict(X_test)))
print("AUC", roc_auc_score(y_test, knn.predict(X_test)))

Accuracy 0.7083333333333334
AUC 0.5560224089635855


In [88]:
print("Accuracy", np.mean(cross_val_score(knn, X, y, cv=5, scoring='accuracy')))
print("AUC", np.mean(cross_val_score(knn, X, y, cv=5, scoring='roc_auc')))

Accuracy 0.7416212946158499
AUC 0.6341319942611191


In [53]:
svc = Pipeline(steps=[ ('prepocessor', ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)), ('classification', SVC())])

svc.fit(X_train, y_train)

print("Accuracy",accuracy_score(y_test, svc.predict(X_test)))
print("AUC", roc_auc_score(y_test, svc.predict(X_test)))

Accuracy 0.7083333333333334
AUC 0.5280112044817927


In [90]:
print("Accuracy", np.mean(cross_val_score(svc, X, y, cv=3, scoring='accuracy')))
print("AUC", np.mean(cross_val_score(svc, X, y, cv=3, scoring='roc_auc')))

Accuracy 0.7483552631578947
AUC 0.6752996103227704
