In [19]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score, RepeatedStratifiedKFold

In [6]:
#load_dataset
def load_dataset(filename):
    df_data = pd.read_csv(filename, header=None)
    dataset = df_data.values
    X = dataset[:, :-1]
    y = dataset[:, -1]
    X = X.astype(str)
    return X, y

In [7]:
#prepare input
def prepare_inputs(X_train, X_test):
    oe = OrdinalEncoder()
    oe.fit(X_train)
    X_train_enc = oe.transform(X_train)
    X_test_enc = oe.transform(X_test)
    return X_train_enc, X_test_enc

In [8]:
#prepare output
def prepare_outputs(y_train, y_test):
    le = LabelEncoder()
    le.fit(y_train)
    y_train_enc = le.transform(y_train)
    y_test_enc = le.transform(y_test)
    return y_train_enc, y_test_enc

In [10]:
X, y = load_dataset('C:/Users/thanh/Documents/Data/breast-cancer.csv')
print(X.shape)

(286, 9)


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [12]:
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)

In [13]:
y_train_enc, y_test_enc = prepare_outputs(y_train, y_test)

In [54]:
svm_ml = SVC(kernel='linear')
svm_ml.fit(X_train_enc, y_train_enc)

SVC(kernel='linear')

In [67]:
y_pred = svm_ml.predict(X_test_enc)

acc = metrics.accuracy_score(y_test_enc, y_pred)
print(f'Model acurracy: {acc}')

pres = metrics.precision_score(y_test_enc, y_pred)
print(f'Model precision: {pres}')

rec = metrics.recall_score(y_test_enc, y_pred)
print(f'Model recall score: {rec}')

Model acurracy: 0.627906976744186
Model precision: 0.0
Modelrecall score: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [75]:
svm_ml = SVC(kernel='linear')

cv = KFold(n_splits=3, shuffle=True, random_state=1)

oe = OrdinalEncoder()
X_enc = oe.fit_transform(X)
le = LabelEncoder()
y_enc = le.fit_transform(y)

result = cross_val_score(svm_ml, X_enc, y_enc, cv=cv, scoring='accuracy')

print(f'Accuracy: {result.mean()}')

Accuracy: 0.6819078947368421


In [81]:
svm_ml = SVC(kernel='linear')

cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=1)

oe = OrdinalEncoder()
X_enc = oe.fit_transform(X)
le = LabelEncoder()
y_enc = le.fit_transform(y)

scores = cross_val_score(svm_ml, X_enc, y_enc, scoring='accuracy', cv=cv, n_jobs=-1)

print(f'Cross validation accuracy score: {scores.mean()}')

Cross validation accuracy score: 0.6969541910331384


In [69]:
lgr_ml = LogisticRegression(solver='lbfgs')

lgr_ml.fit(X_train_enc, y_train_enc)

LogisticRegression()

In [71]:
y_pred = lgr_ml.predict(X_test_enc)

acc = metrics.accuracy_score(y_test_enc, y_pred)
print(f'Model acurracy: {acc}')

pres = metrics.precision_score(y_test_enc, y_pred)
print(f'Model precision: {pres}')

rec = metrics.recall_score(y_test_enc, y_pred)
print(f'Model recall score: {rec}')

Model acurracy: 0.7209302325581395
Model precision: 0.9
Model recall score: 0.28125


In [79]:
lgr_ml = LogisticRegression(solver='lbfgs')

cv = KFold(n_splits=3, shuffle=True, random_state=1)

oe = OrdinalEncoder()
X_enc = oe.fit_transform(X)
le = LabelEncoder()
y_enc = le.fit_transform(y)

result = cross_val_score(lgr_ml, X_enc, y_enc, cv=cv, scoring='accuracy')
print(f'Model cross validation accuracy score: {result.mean()}')

Model cross validation accuracy score: 0.744736842105263


In [82]:
lgr_ml = LogisticRegression(solver='lbfgs')

cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=10, random_state=1)

oe = OrdinalEncoder()
X_enc = oe.fit_transform(X)
le = LabelEncoder()
y_enc = le.fit_transform(y)

result = cross_val_score(lgr_ml, X_enc, y_enc, cv=cv, scoring='accuracy', n_jobs=-1)
print(f'Cross validation accuracy score: {result.mean()}')

Model cross validation accuracy score: 0.7301349765258216


In [14]:
#DecisionTreeClassifier
dtc_ml = DecisionTreeClassifier()
dtc_ml.fit(X_train_enc, y_train_enc)

DecisionTreeClassifier()

In [15]:
y_pred = dtc_ml.predict(X_test_enc)

acc = metrics.accuracy_score(y_test_enc, y_pred)
print(f'Model acurracy: {acc}')

pres = metrics.precision_score(y_test_enc, y_pred)
print(f'Model precision: {pres}')

rec = metrics.recall_score(y_test_enc, y_pred)
print(f'Model recall score: {rec}')

Model acurracy: 0.686046511627907
Model precision: 0.6470588235294118
Model recall score: 0.34375


In [17]:
dtc_ml = DecisionTreeClassifier()

cv = KFold(n_splits=3, shuffle=True, random_state=1)

oe = OrdinalEncoder()
X_enc = oe.fit_transform(X)
le = LabelEncoder()
y_enc = le.fit_transform(y)

result = cross_val_score(dtc_ml, X_enc, y_enc, cv=cv, scoring='accuracy')

print(f'Accuracy: {result.mean()}')

Accuracy: 0.6641447368421053


In [22]:
dtc_ml = DecisionTreeClassifier()

cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=10, random_state=1)

oe = OrdinalEncoder()
X_enc = oe.fit_transform(X)
le = LabelEncoder()
y_enc = le.fit_transform(y)

result = cross_val_score(dtc_ml, X_enc, y_enc, cv=cv, scoring='accuracy', n_jobs=-1)
print(f'Cross validation accuracy score: {result.mean()}')

Cross validation accuracy score: 0.6462490219092331


In [20]:
#KNeighborsClassifier
knc_ml = KNeighborsClassifier()
knc_ml.fit(X_train_enc, y_train_enc)

KNeighborsClassifier()

In [21]:
y_pred = knc_ml.predict(X_test_enc)

acc = metrics.accuracy_score(y_test_enc, y_pred)
print(f'Model acurracy: {acc}')

pres = metrics.precision_score(y_test_enc, y_pred)
print(f'Model precision: {pres}')

rec = metrics.recall_score(y_test_enc, y_pred)
print(f'Model recall score: {rec}')

Model acurracy: 0.6511627906976745
Model precision: 0.5833333333333334
Model recall score: 0.21875


In [23]:
knc_ml = KNeighborsClassifier()

cv = KFold(n_splits=3, shuffle=True, random_state=1)

oe = OrdinalEncoder()
X_enc = oe.fit_transform(X)
le = LabelEncoder()
y_enc = le.fit_transform(y)

result = cross_val_score(knc_ml, X_enc, y_enc, cv=cv, scoring='accuracy')

print(f'Accuracy: {result.mean()}')

Accuracy: 0.6958698830409357


In [24]:
knc_ml = DecisionTreeClassifier()

cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=10, random_state=1)

oe = OrdinalEncoder()
X_enc = oe.fit_transform(X)
le = LabelEncoder()
y_enc = le.fit_transform(y)

result = cross_val_score(knc_ml, X_enc, y_enc, cv=cv, scoring='accuracy', n_jobs=-1)
print(f'Cross validation accuracy score: {result.mean()}')

Cross validation accuracy score: 0.6399647887323944
