In [1]:
#Imposting the Required Libraries
import pandas as pd #For data Manipulation/Analysis
import numpy as np # For Linear Algebra
from sklearn import preprocessing, model_selection, neighbors #For modeling

In [2]:
#Reading/Importing the Dataset
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data')
data.head()

Unnamed: 0,1000025,5,1,1.1,1.2,2,1.3,3,1.4,1.5,2.1
0,1002945,5,4,4,5,7,10,3,2,1,2
1,1015425,3,1,1,1,2,2,3,1,1,2
2,1016277,6,8,8,1,3,4,3,7,1,2
3,1017023,4,1,1,3,2,1,3,1,1,2
4,1017122,8,10,10,8,7,10,9,7,1,4


In [3]:
#Changing the name of the columns
cols = ['Id number', 'Clump Thickness','Uniformity of Cell Size','Uniformity of Cell Shape','Marginal Adhesion',
           'Single Epithelial Cell Size','Bare Nuclei','Bland Chromatin','Normal Nucleoli','Mitoses','Class']
data.columns = cols
data.head()

Unnamed: 0,Id number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1002945,5,4,4,5,7,10,3,2,1,2
1,1015425,3,1,1,1,2,2,3,1,1,2
2,1016277,6,8,8,1,3,4,3,7,1,2
3,1017023,4,1,1,3,2,1,3,1,1,2
4,1017122,8,10,10,8,7,10,9,7,1,4


In [4]:
#Checking for the size of the data
data.shape

(698, 11)

In [5]:
#Checking for missing entries
data.isnull().sum()

Id number                      0
Clump Thickness                0
Uniformity of Cell Size        0
Uniformity of Cell Shape       0
Marginal Adhesion              0
Single Epithelial Cell Size    0
Bare Nuclei                    0
Bland Chromatin                0
Normal Nucleoli                0
Mitoses                        0
Class                          0
dtype: int64

In [6]:
#Dropping the Id column
data.drop('Id number', inplace=True, axis=1)
data.head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,4,4,5,7,10,3,2,1,2
1,3,1,1,1,2,2,3,1,1,2
2,6,8,8,1,3,4,3,7,1,2
3,4,1,1,3,2,1,3,1,1,2
4,8,10,10,8,7,10,9,7,1,4


In [7]:
#Checking for unique column entries
data['Clump Thickness'].unique()

array([ 5,  3,  6,  4,  8,  1,  2,  7, 10,  9], dtype=int64)

In [8]:
#Using a loop to iterate through columns
for col in data:
    print(col)
    print(data[col].unique())

Clump Thickness
[ 5  3  6  4  8  1  2  7 10  9]
Uniformity of Cell Size
[ 4  1  8 10  2  3  7  5  6  9]
Uniformity of Cell Shape
[ 4  1  8 10  2  3  5  6  7  9]
Marginal Adhesion
[ 5  1  3  8 10  4  6  2  9  7]
Single Epithelial Cell Size
[ 7  2  3  1  6  4  5  8 10  9]
Bare Nuclei
['10' '2' '4' '1' '3' '9' '7' '?' '5' '8' '6']
Bland Chromatin
[ 3  9  1  2  4  5  7  8  6 10]
Normal Nucleoli
[ 2  1  7  4  5  3 10  6  9  8]
Mitoses
[ 1  5  4  2  3  7 10  8  6]
Class
[2 4]


In [9]:
#checking for data types
data.dtypes

Clump Thickness                 int64
Uniformity of Cell Size         int64
Uniformity of Cell Shape        int64
Marginal Adhesion               int64
Single Epithelial Cell Size     int64
Bare Nuclei                    object
Bland Chromatin                 int64
Normal Nucleoli                 int64
Mitoses                         int64
Class                           int64
dtype: object

In [10]:
#fixing wrong data entry
data['Bare Nuclei'].replace({'?':-999}, inplace=True)
data['Bare Nuclei'].unique()

array(['10', '2', '4', '1', '3', '9', '7', -999, '5', '8', '6'],
      dtype=object)

In [11]:
#Changing data type
data['Bare Nuclei'] = data['Bare Nuclei'].astype(int)
data.dtypes

Clump Thickness                int64
Uniformity of Cell Size        int64
Uniformity of Cell Shape       int64
Marginal Adhesion              int64
Single Epithelial Cell Size    int64
Bare Nuclei                    int32
Bland Chromatin                int64
Normal Nucleoli                int64
Mitoses                        int64
Class                          int64
dtype: object

In [12]:
#Defining X and Y using Pandas
X = data.drop('Class', axis=1)
y = data.Class

In [13]:
#Defining X and Y using Numpy
X = np.array(data.drop(['Class'],1))
y = np.array(data['Class'])

In [14]:
#Splitting the dataset
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y, test_size=0.2)
print(X.shape)
print(y.shape)

(698, 9)
(698,)


In [15]:
#Training the classifier
clf = neighbors.KNeighborsClassifier()
clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [16]:
#prediction
prediction = clf.predict(X_test)
accuracy = clf.score(X_test,y_test)
print(accuracy)

0.9357142857142857


In [17]:
#Hyperparameter tuning
clf = neighbors.KNeighborsClassifier(n_neighbors=3, metric='manhattan')
clf.fit(X_train, y_train)
prediction = clf.predict(X_test)
accuracy = clf.score(X_test,y_test)
print(accuracy)

0.9357142857142857


In [18]:
#New prediction 
example_measure = np.array([[1,2,2,5,3,6,4,4,8]])
example_measure = example_measure.reshape(len(example_measure),-1)
prediction = clf.predict(example_measure)
print(prediction)

[2]


# '4' above indicates the prediction to be a 'Malignant Cancer type'.