In [1]:
# the dataset
from sklearn.datasets import load_breast_cancer

# classification algorithms to work with
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# some useful libraries
import pandas as pd
import numpy as np

In [2]:
# Load the breast cancer dataset that ships with scikit-learn into `cancer`.
cancer = load_breast_cancer()

In [3]:
# Inspect the type of the container returned by load_breast_cancer().
type(cancer)

sklearn.utils.Bunch

In [4]:
# List the keys (fields) exposed by the dataset container.
cancer.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [5]:
# Pull the feature matrix (X) and the label vector (y) out of the container.
X, y = cancer.data, cancer.target

In [6]:
# Human-readable names of the class labels.
# NOTE(review): presumably the name at index i belongs to integer label i in `y`
# (so 0 -> first name, 1 -> second name) — confirm against cancer.DESCR.
cancer.target_names

array(['malignant', 'benign'], dtype='<U9')

In [7]:
# Shape of the feature matrix X (rows are samples, columns are features).
X.shape

(569, 30)

In [8]:
# Shape of the label vector y; its length should match the number of rows in X.
y.shape

(569,)

In [9]:
# Count how many samples carry each integer label
# (position k of the result is the number of occurrences of label k in y).
np.bincount(y)

array([212, 357], dtype=int64)

Class balance: 212 'malignant' samples (label 0) and 357 'benign' samples (label 1).

In [10]:
# Any missing values? pd.isnull flags each entry of X that is missing,
# and .sum() totals those flags over the whole array (0 means nothing is missing).
pd.isnull(X).sum()

0

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out a test set; random_state pins the shuffle so the split is repeatable.
# No test_size is passed, so scikit-learn's default split proportions apply.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [13]:
# Instantiate every classifier with its default hyperparameters (no tuning).
# A fixed random_state is passed to the three estimators constructed with one
# so that repeated runs give the same models.
knn = KNeighborsClassifier()
dtc = DecisionTreeClassifier(random_state=0)
rfc = RandomForestClassifier(random_state=0)
svc = SVC(random_state=0)

In [14]:
# Train each classifier on the training split.
# fit() is called for its side effect on the estimator; only the value of the
# final expression (the fitted RandomForestClassifier) is echoed as cell output.
dtc.fit(X_train, y_train)
svc.fit(X_train, y_train)
knn.fit(X_train, y_train)
rfc.fit(X_train, y_train)

RandomForestClassifier(random_state=0)

In [15]:
# Run every fitted model on the held-out test features and keep the predicted
# labels, one array per model, for the accuracy comparison that follows.
dtc_preds, svc_preds, knn_preds, rfc_preds = (
    model.predict(X_test) for model in (dtc, svc, knn, rfc)
)

In [16]:
# Report test-set accuracy for every model in one consistent way: the fraction
# of stored predictions that match y_test. Reusing the prediction arrays avoids
# running predict() a second time inside .score() and keeps all four numbers
# directly comparable. The printed text is unchanged.
for model_label, model_preds in (
    ('DTC', dtc_preds),
    ('SVC', svc_preds),
    ('KNN', knn_preds),
    ('RFC', rfc_preds),
):
    print(f'{model_label} Accuracy: {np.mean(model_preds == y_test)}')

DTC Accuracy: 0.8811188811188811
SVC Accuracy: 0.9370629370629371
KNN Accuracy: 0.9370629370629371
RFC Accuracy: 0.972027972027972


## The Winner? Random Forest Classifier (highest test accuracy on this single split, with untuned models)

In [1]:
from sklearn.linear_model import SGDClassifier

In [2]:
SGDClassifier??

[1;31mInit signature:[0m
[0mSGDClassifier[0m[1;33m([0m[1;33m
[0m    [0mloss[0m[1;33m=[0m[1;34m'hinge'[0m[1;33m,[0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0mpenalty[0m[1;33m=[0m[1;34m'l2'[0m[1;33m,[0m[1;33m
[0m    [0malpha[0m[1;33m=[0m[1;36m0.0001[0m[1;33m,[0m[1;33m
[0m    [0ml1_ratio[0m[1;33m=[0m[1;36m0.15[0m[1;33m,[0m[1;33m
[0m    [0mfit_intercept[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mmax_iter[0m[1;33m=[0m[1;36m1000[0m[1;33m,[0m[1;33m
[0m    [0mtol[0m[1;33m=[0m[1;36m0.001[0m[1;33m,[0m[1;33m
[0m    [0mshuffle[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mverbose[0m[1;33m=[0m[1;36m0[0m[1;33m,[0m[1;33m
[0m    [0mepsilon[0m[1;33m=[0m[1;36m0.1[0m[1;33m,[0m[1;33m
[0m    [0mn_jobs[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mrandom_state[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mlearning_rate[0m[1;33m=[0m[1;34