In [93]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing, neighbors, svm
from sklearn.model_selection import cross_validate, train_test_split

In [2]:
# Load the breast-cancer dataset from the CSV file that sits next to this notebook
df = pd.read_csv('breast-cancer-wisconsin.csv')

In [3]:
# Peek at the first five rows to sanity-check what was loaded
df.head(5)

Unnamed: 0,id,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhension,single_epith_cell_size,bare_nuclei,bland_chromatin,norm_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [4]:
# Replacing missing data: the '?' placeholders are swapped for -99999.
# Most algorithms recognize -99999 as an outlier and would treat it as one.
df.replace(to_replace='?', value=-99999, inplace=True)

In [5]:
# Dropping unnecessary columns: 'id' is not used as a feature.
# The column is named with `columns=` because the positional axis argument
# (`df.drop(['id'], 1)`) was deprecated in pandas 1.3 and removed in pandas 2.0,
# where it raises TypeError.
df.drop(columns=['id'], inplace=True)

In [6]:
# Defining X and y
# X = features (every column except 'class')
# y = labels (the 'class' column)
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
X = np.array(df.drop(columns=['class']))
y = np.array(df['class'])

In [7]:
# Splitting our data into train and test: 20% of the rows are held out for testing.
# (This is a single hold-out split, not k-fold cross validation.)
# random_state pins the shuffle so the split, and the accuracy computed from it,
# is the same on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Build a k-nearest-neighbours classifier with scikit-learn's default settings
clf = neighbors.KNeighborsClassifier()

# Train it on the training split
clf.fit(X=X_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [9]:
# Score the fitted classifier on the held-out test split and show the result
accuracy = clf.score(X=X_test, y=y_test)
print(accuracy)

0.9714285714285714


In [10]:
# Making a prediction: two hand-written samples with nine feature values each
example_measures = np.array([
    [4, 2, 1, 1, 1, 2, 3, 2, 1],
    [4, 2, 1, 2, 2, 2, 3, 2, 1],
])
# One row per sample; -1 lets NumPy work out the number of feature columns
example_measures = example_measures.reshape(example_measures.shape[0], -1)

In [11]:
# Ask the trained classifier for the class of each example row, then print the labels
prediction = clf.predict(X=example_measures)
print(prediction)

[2 2]


#### Day 15

Euclidean Distance

Euclidean distance is the distance between two points, defined as the square root of the sum of the squares of the differences between the corresponding coordinates of the points:

$$d(p, q) = \sqrt{\sum_{i=1}^{n} (q_i - p_i)^2}$$

Where n is the number of dimensions in the dataset

i indexes the dimensions, p is one point and q is another point

In [12]:
from math import sqrt

In [13]:
# Two points in the plane
plot1 = [1, 3]
plot2 = [2, 5]

# Square root of the summed squared differences between matching coordinates
euclidean_distance = sqrt(sum((a - b) ** 2 for a, b in zip(plot1, plot2)))

print(euclidean_distance)

2.23606797749979


#### Day 16

Writing our K nearest neighbours algorithm

In [14]:
from matplotlib import style
import warnings
from collections import Counter
# Apply matplotlib's 'fivethirtyeight' style sheet
style.use('fivethirtyeight')

In [15]:
# Toy training data: two groups, each holding three 2-D points.
# The keys double as matplotlib colour codes in the (commented-out) scatter plot.
dataset = {
    'k': [[1, 2], [2, 3], [3, 1]],
    'r': [[6, 5], [7, 7], [8, 6]],
}

In [16]:
# The point we want to classify
new_features = [5, 7]

In [17]:
# for i in dataset:
#     for ii in dataset [i]:
#         plt.scatter(ii[0], ii[1], s = 100, color = i)
        
#you can also write the loop above as a one-liner: [[plt.scatter(ii[0], ii[1], s = 100, color = i) for ii in dataset[i]] for i in dataset]

# plt.scatter(new_features[0], new_features[1])

# plt.show()

#### Day 17

euclidean_distance = sqrt((feature[0]-prediction[0]) ** 2 + (feature[1]-prediction[1]) ** 2)

The above equation for Euclidean distance works for a dataset with 2 feature dimensions, but it would not work for a dataset with more than 2 feature dimensions.

So we use this instead

euclidean_distance = np.sqrt(np.sum((np.array(features) - np.array(predict)) ** 2))

The simpler version of this is:

euclidean_distance = np.linalg.norm(np.array(features) - np.array(predict))

In [18]:
#defining the k nearest neighbour algorithm

def k_nearest_neighbor(data, predict, k = 3):
    """Classify ``predict`` by majority vote among its ``k`` nearest training points.

    Parameters
    ----------
    data : dict
        Maps each class label (group) to a list of feature vectors.
    predict : sequence of numbers
        Feature vector to classify; it must have the same length as the
        training vectors.
    k : int, default 3
        Number of nearest neighbours that take part in the vote.

    Returns
    -------
    The class label that received the most votes.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``data`` contains no training points.
    """
    if k < 1:
        # k == 0 leaves no voters and a negative k would silently slice from the end
        raise ValueError('k must be at least 1')
    if len(data) >= k:
        # With k no larger than the number of groups the vote can easily tie
        warnings.warn('K is set to a value less than or equal to total voting groups!')

    target = np.array(predict)
    # One (distance, group) pair per training point
    distances = [
        (np.linalg.norm(np.array(features) - target), group)
        for group in data
        for features in data[group]
    ]
    if not distances:
        raise ValueError('data must contain at least one training point')

    # Sort ascending by distance and keep only the group labels of the k closest points
    votes = [group for _, group in sorted(distances)[:k]]
    # most_common(1) -> [(label, count)]; [0][0] picks the label
    return Counter(votes).most_common(1)[0][0]

# Classify the new point with the hand-written k-NN (three neighbours) and show the result
result = k_nearest_neighbor(data=dataset, predict=new_features, k=3)
print(result)

[('r', 3)]
r


The Objective is to manually create our KNN algorithm and to  do this we would require three arguments ( the data we want to train,  the prediction we want to make and the number of neighbors we want to use) .We need to first understand some basics, such as

1. KNN classifies a new point by looking at the data points closest to it, using Euclidean distance. The Euclidean distance measures the straight-line distance between the point we are predicting and every data point (feature set).
2. The number of neighbors (k) should be greater than the number of classes, and preferably odd. For instance, if I have 2 classes and I set my k to 2, the two nearest neighbors can come from different classes, so the vote is tied and we won't be able to pick a class; if I have k = 3, at least two of the three nearest neighbors must belong to the same class, so that class is chosen.

Going to the code, a function is defined with the basics we will need (data, prediction, and neighbors) => def k_nearest_neighbor.

If len(data)>=K it will flag a warning i.e the number of class should be less than K.
For the" for loop"  our data here is in form of a dictionary, hence to access the features (dictionary values)  we need a double for loop the first to access the class/group/dictionary keys (for group in data)  second to access the features ( for features in data[group]) .

For each feature set there is a distance (Euclidean distance) that measures the difference between that feature set and the new prediction, which is then appended to a list alongside the class/group of the feature set ( distances.append([E.D, group]) ).

Since we are only interested in the 3 nearest neighbors, we first sort the distances in ascending order and keep only the first 3 entries ( sorted(distances)[:3] ).
The resulting list is then iterated over to collect the class of each of those neighbors ( votes = [i[1] for i in sorted(distances)[:k]] ), so that we can check which class has the highest number of votes.

*note that the list is a list of lists with two elements in the each sublist that contains the distance in the 0th position and class in the 1st position * that's why we are iterating over i[1] not just i or i[0] because we are really interested in the class not the distance
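Counter(votes).most_common(1) => Counter counts how many votes each class received, and most_common(1) returns only the single most common class together with its vote count (the 1 means "the top 1 entry", not the number of classes).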
Vote_result = counter(votes).most_common(1)[0][0]=> since the result gives a list of tuple , where the first element in the tuple is the class and the second element represents the number of votes of the class to access the first tuple and first element in the tuple which represent the class we have [0][0]

#### Day 18

Using our breast cancer dataset to have a broader understanding of k nearest neighbors

In [19]:
import random

In [78]:
def k_nearest_neighbor(data, predict, k = 3):
    """Classify ``predict`` by majority vote and report how decisive the vote was.

    Parameters
    ----------
    data : dict
        Maps each class label (group) to a list of feature vectors.
    predict : sequence of numbers
        Feature vector to classify; it must have the same length as the
        training vectors.
    k : int, default 3
        Number of nearest neighbours that take part in the vote.

    Returns
    -------
    tuple
        ``(vote_result, confidence)``: the winning class label and the winning
        vote count divided by ``k``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``data`` contains no training points.
    """
    if k < 1:
        # k == 0 leaves no voters and a negative k would silently slice from the end
        raise ValueError('k must be at least 1')
    if len(data) >= k:
        # With k no larger than the number of groups the vote can easily tie
        warnings.warn('K is set to a value less than or equal to total voting groups!')

    target = np.array(predict)
    # One (distance, group) pair per training point
    distances = [
        (np.linalg.norm(np.array(features) - target), group)
        for group in data
        for features in data[group]
    ]
    if not distances:
        raise ValueError('data must contain at least one training point')

    # Sort ascending by distance and keep only the group labels of the k closest points
    votes = [group for _, group in sorted(distances)[:k]]
    # Count once; most_common(1) -> [(label, count)]
    vote_result, winning_votes = Counter(votes).most_common(1)[0]
    confidence = winning_votes / k
    return vote_result, confidence

In [79]:
# Reload the raw dataset so this section starts from an untouched frame
df = pd.read_csv('breast-cancer-wisconsin.csv')

In [80]:
# Swap the '?' placeholders for -99999
df.replace(to_replace='?', value=-99999, inplace=True)

In [81]:
    #Dropping unnecessary columns

# `columns=` is used because the positional axis argument (`df.drop(['id'], 1)`)
# was deprecated in pandas 1.3 and removed in pandas 2.0, where it raises TypeError.
df.drop(columns=['id'], inplace=True)

In [82]:
    #converting the contents of the dataset to float

# Cast every column to float and turn the frame into a plain list of row lists
full_data = df.astype('float64').values.tolist()

In [83]:
    #shuffling the rows so that the train/test slices taken below are random samples of the data (note: shuffling is not scaling)

# Shuffle the rows in place. A dedicated, seeded generator makes the shuffle, and
# therefore the accuracy computed from it, reproducible on Restart & Run All
# without touching the global `random` state.
random.Random(42).shuffle(full_data)

In [84]:
#a version of train_test_split

test_size = 0.4  # fraction of the rows held out for testing
train_set = {2:[], 4:[]}
test_set = {2:[], 4:[]}

#slicing the data
# Split at an explicit index instead of slicing with a negative count: when
# int(test_size*len(full_data)) is 0, `full_data[:-0]` is empty and
# `full_data[-0:]` is the whole list, so train and test would be swapped.
split_index = len(full_data) - int(test_size*len(full_data))
train_data = full_data[:split_index]
test_data = full_data[split_index:]

In [85]:
# Populating the empty dictionaries: the last value of every row is its class label
# (the dictionary key), and the values before it are the features stored under that key.
for rows, buckets in ((train_data, train_set), (test_data, test_set)):
    for row in rows:
        buckets[row[-1]].append(row[:-1])

In [86]:
#passing the information through to k nearest neighbors

correct = 0
total = 0

for group in test_set:
    for data in test_set[group]:
        vote, confidence = k_nearest_neighbor(train_set, data, k=5)
        if group == vote:
            correct += 1
#             else:
#                 print(confidence)
        total += 1

print ('Accuracy:', correct/total)
accuracies.append(correct/total)

# print(sum(accuracies)/len(accuracies))

Accuracy: 0.9605734767025089


### Day 19

K Accuracy and prediction

Increasing K does not necessarily do you a favour

**Confidence vs Accuracy**

Accuracy - Did we get the classification right?

Confidence can come from the classifier

The ratio of the number of votes for the winning class to the value of K is known as the confidence of the prediction (this is a vote share, not a statistical confidence interval).

When the test size is increased the confidence decreases.

**Some facts about k nearest neighbors**
- k nearest neighbors can be threaded, so you don't have to test each prediction point linearly, you can test each one on their own
- KNN can work on both linear and non linear data.

For linear data, you can use a linear model (for example regression) to do the classification.

For non-linear data, a linear model can't do the classification, but K nearest neighbors still can.

### Day 20

## Support Vector Machine (SVM)

SVM is another supervised machine learning classifier. It is one of the most popular machine learning algorithms.

SVM is a binary classifier so it separates only into 2 groups at a time. The 2 groups are denoted as positive and negative.

The objective of SVM is to find the best separating hyper plane or decision boundary that will separate data.

When you get the best separating hyper plane, you can now take in unknown data, if the unknown data rests on the positive side of the hyperplane, it becomes positive sample, and if it rests on the negative side, it becomes negative.

So the intuition of SVM is to find the best separating hyperplane and then we can classify new datapoints.

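The goal of the SVM algorithm is to find the hyperplane with the widest margin, i.e. the hyperplane whose distance to the nearest data points of each class (the support vectors) is as large as possible.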

In [107]:
# Reload the raw dataset for the SVM section
df = pd.read_csv('breast-cancer-wisconsin.csv')

In [108]:
# Replacing missing data: the '?' placeholders are swapped for -99999.
# Most algorithms recognize -99999 as an outlier and would treat it as one.
df.replace(to_replace='?', value=-99999, inplace=True)

In [109]:
# Dropping unnecessary columns: 'id' is not used as a feature.
# The column is named with `columns=` because the positional axis argument
# (`df.drop(['id'], 1)`) was deprecated in pandas 1.3 and removed in pandas 2.0,
# where it raises TypeError.
df.drop(columns=['id'], inplace=True)

In [110]:
# Defining X and y
# X = features (every column except 'class')
# y = labels (the 'class' column)
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
X = np.array(df.drop(columns=['class']))
y = np.array(df['class'])

In [111]:
# Splitting our data into train and test: 20% of the rows are held out for testing.
# (This is a single hold-out split, not k-fold cross validation.)
# random_state pins the shuffle so the split, and the accuracy computed from it,
# is the same on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [112]:
# Build a support vector classifier with scikit-learn's default settings
clf = svm.SVC()

# Train it on the training split
clf.fit(X=X_train, y=y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [113]:
# Score the fitted classifier on the held-out test split and show the result
accuracy = clf.score(X=X_test, y=y_test)
print(accuracy)

0.9714285714285714


In [114]:
# Making a prediction: two hand-written samples with nine feature values each
example_measures = np.array([
    [4, 2, 1, 1, 1, 2, 3, 2, 1],
    [4, 2, 1, 2, 2, 2, 3, 2, 1],
])
# One row per sample; -1 lets NumPy work out the number of feature columns
example_measures = example_measures.reshape(example_measures.shape[0], -1)

In [115]:
# Ask the trained classifier for the class of each example row, then print the labels
prediction = clf.predict(X=example_measures)
print(prediction)

[2 2]


### Day 21

**Understanding Vectors**

A vector has both magnitude and direction.

### Day 22

**Support Vector Assertion**

SVM creates a decision boundary. The way an SVM classifies new points once it has found the decision boundary is by first taking the vector that points perpendicularly to the separating hyperplane (vector w); you then project vector u (the unknown data point) onto vector w, and from that you find out which side of the hyperplane vector u is on.

What is the calculation once we've trained a machine learning classifier?

vector u * vector w + b(bias)

If the equation above is >= 0, then it is a positive sample.

If the equation Vector u * vector w + b <= 0, then it is a negative sample.

If vector u * vector w + b = 0, then it means that it is on the decision boundary.

The unknown, vector u is a feature set comprised of x1 and x2

**How can we make an equation to go through our data and locate support vectors?**

We introduce Y(subscript i) - this is the class of the features that we are passing through.

If the class is a + class, then Y (sub i) = +1 or 1

If the class is a - class, then Y (sub i) = -1 

We now multiply Y(sub i) by the equations we were using to identify the positive and negative support vectors.

+class --> Xi * vector w + b = 1

-class --> Xi * vector w + b = -1

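So now we multiply the equations above by Y(sub i) and move the 1 to the left-hand side so that both are set = 0. Both classes then give the same equation: Y(sub i) * (Xi * vector w + b) - 1 = 0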