## SIMPLE CLASSIFICATION PROJECT WITH KNN ALGORITHM 

#### Problem 

The objective of this project is to predict wheather a person has benign or malignant tumor. 



### 1.Import Modules  

In [2]:
#import module
from statistics import mean
import numpy as np
import matplotlib.pyplot as plt 
from matplotlib import style
import pickle
import random 
from sklearn import preprocessing, neighbors
import pandas as pd
from sklearn.model_selection import train_test_split
 

### 2.Load Dataset 

In [3]:
#load the dataset
df = pd.read_csv('../data/breast-cancer-wisconsin.data')

#### Data Attribute Information:
1. Sample code number: id number 
2. Clump Thickness: 1 - 10 
3. Uniformity of Cell Size: 1 - 10 
4. Uniformity of Cell Shape: 1 - 10 
5. Marginal Adhesion: 1 - 10 
6. Single Epithelial Cell Size: 1 - 10 
7. Bare Nuclei: 1 - 10 
8. Bland Chromatin: 1 - 10 
9. Normal Nucleoli: 1 - 10 
10. Mitoses: 1 - 10 
11. Class: (2 for benign, 4 for malignant)


More information about the dataset can be found here:  [Breast Cancer Dataset,](https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Original%29)

### 3.Data Exploratory Analysis 

In [4]:
# Show the list of columns
df.columns

Index(['id', 'clump_thickness', ' unif_cell_size', ' unif_cell_shape',
       'marg_adhesion', 'single_epith_cell_size', 'bare_nuclei', 'bland_chrom',
       'norm_nucleoli', 'mitoses', 'class'],
      dtype='object')

In [5]:
#show the 5 first rows
df.head()

Unnamed: 0,id,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhesion,single_epith_cell_size,bare_nuclei,bland_chrom,norm_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [6]:
#show the 5 last rows
df.tail() 

Unnamed: 0,id,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhesion,single_epith_cell_size,bare_nuclei,bland_chrom,norm_nucleoli,mitoses,class
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4
698,897471,4,8,8,5,4,5,10,4,1,4


In [7]:
#summary of a DataFrame.
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id                      699 non-null    int64 
 1   clump_thickness         699 non-null    int64 
 2    unif_cell_size         699 non-null    int64 
 3    unif_cell_shape        699 non-null    int64 
 4   marg_adhesion           699 non-null    int64 
 5   single_epith_cell_size  699 non-null    int64 
 6   bare_nuclei             699 non-null    object
 7   bland_chrom             699 non-null    int64 
 8   norm_nucleoli           699 non-null    int64 
 9   mitoses                 699 non-null    int64 
 10  class                   699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB


In [8]:
# count missing values 
df.isnull().sum() 

id                        0
clump_thickness           0
 unif_cell_size           0
 unif_cell_shape          0
marg_adhesion             0
single_epith_cell_size    0
bare_nuclei               0
bland_chrom               0
norm_nucleoli             0
mitoses                   0
class                     0
dtype: int64

In [9]:
df['class'].value_counts()     

2    458
4    241
Name: class, dtype: int64

In [10]:
#replace "?" with -99999
df.replace('?', -99999, inplace=True)

In [11]:
#drop id column
df.drop(['id'], axis=1)


### 4. Splitting your Data

In [12]:
#Define x and y 
X = np.array(df.drop(['class'],1))
y=np.array(df['class'])

In [13]:
#do cross validation 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### 5.Training Models

In [14]:
#call our classifer and fit to our data
clf = neighbors.KNeighborsClassifier()
clf.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [15]:
#test the accuracy
accuracy = clf.score(X_test, y_test)
accuracy

0.9714285714285714

In [16]:
for_testing = X_test.reshape(len(X_test),-1)
prediction = clf.predict(for_testing)
prediction

array([2, 2, 4, 4, 2, 2, 2, 4, 2, 2, 2, 2, 2, 4, 2, 2, 2, 4, 4, 2, 2, 2,
       2, 4, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 4, 4, 4, 2, 4, 4, 4, 2, 4, 2,
       4, 2, 2, 2, 4, 4, 2, 2, 2, 2, 4, 2, 2, 2, 4, 4, 2, 2, 4, 2, 4, 2,
       2, 2, 2, 4, 4, 2, 4, 4, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 4,
       2, 2, 2, 4, 2, 4, 2, 4, 2, 2, 2, 2, 4, 2, 4, 2, 2, 2, 2, 4, 2, 2,
       2, 2, 2, 2, 4, 2, 2, 2, 4, 2, 2, 4, 2, 2, 4, 2, 2, 2, 2, 2, 2, 4,
       2, 2, 4, 2, 4, 4, 2, 2], dtype=int64)