# Build a Machine Learning Classifier in Python with Scikit-learn


# Step 1 — Importing libraries

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import load_breast_cancer

# # Step 2 — Load Data

The dataset we will be working with in this tutorial is the Breast Cancer Wisconsin Diagnostic Database. The dataset includes various information about breast cancer tumors, as well as classification labels of malignant or benign. The dataset has 569 instances, or data, on 569 tumors and includes information on 30 attributes, or features, such as the radius of the tumor, texture, smoothness, and area.

In [10]:
data=load_breast_cancer()

In [12]:
data

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0

Understand the data

In [14]:
labels_name=data['target_names']
labels=data['target']

In [18]:
labels
labels_name

array(['malignant', 'benign'], dtype='<U9')

In [16]:
feature_name=data['feature_names']
feature=data['data']

In [20]:
feature_name
feature

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

# Step 3 — Organizing Data into Sets

In [32]:
train, test, train_labels, test_label=train_test_split(feature,labels,test_size=.20,random_state=50)

# Step 4 — Building and Evaluating the Model
1-using DecisionTreeClassifier model

In [33]:
tree=DecisionTreeClassifier()
tree.fit(train,train_labels)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [34]:
prediction=tree.predict(test)

In [35]:
print(test)

[[1.305e+01 1.859e+01 8.509e+01 ... 1.258e-01 3.113e-01 8.317e-02]
 [1.016e+01 1.959e+01 6.473e+01 ... 2.232e-02 2.262e-01 6.742e-02]
 [1.624e+01 1.877e+01 1.088e+02 ... 1.732e-01 2.770e-01 1.063e-01]
 ...
 [1.665e+01 2.138e+01 1.100e+02 ... 2.095e-01 3.613e-01 9.564e-02]
 [1.719e+01 2.207e+01 1.116e+02 ... 1.984e-01 3.216e-01 7.570e-02]
 [1.240e+01 1.768e+01 8.147e+01 ... 7.370e-02 2.556e-01 9.359e-02]]


# Step 5 — Evaluating the Model's Accuracy

In [36]:
accuracy_score(test_label,prediction)

0.9385964912280702

2-using GaussianNB model

In [37]:
nv=GaussianNB()
nv.fit(train,train_labels)

GaussianNB(priors=None)

In [38]:
nv_predict=nv.predict(test)

In [40]:
accuracy_score(test_label,nv_predict)

0.9473684210526315

3-using KNeighborsClassifier model

In [41]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(train, train_labels)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [42]:
knn_predict=knn.predict(test)
accuracy_score(test_label,knn_predict)

0.9122807017543859

4-using SVC model

In [45]:
from sklearn.svm import SVC
svm = SVC()
svm.fit(train, train_labels)

svm_predict=knn.predict(test)
accuracy_score(test_label,svm_predict)


0.9122807017543859

# Conclusion
In this tutorial, you learned how to build a machine learning classifier in Python with different models. Now you can load data, organize data, train, predict, and evaluate machine learning classifiers in Python using Scikit-learn.