# SVM and Random Forest on Breast Cancer Wisconsin Dataset

### 1. Imports:

In [1]:
import pandas as pd
import numpy as np

### 2. Data:

In [2]:
df = pd.read_csv("breast-cancer-wisconsin.data.txt")
df.head()

Unnamed: 0,id,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhesion,single_epith_cell_size,bare_nuclei,bland_chrom,norm_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [3]:
df.replace('?', -99999, inplace = True) #making them outliers
df.drop(['id'], axis = 1, inplace = True) #dropping unnecessary columns

### 3. Separate variable and class:

In [4]:
X = np.array(df.drop(['class'],1))
y = np.array(df['class'])

### 4. Split into train and test:

In [5]:
from sklearn import cross_validation
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.2)



### 5. Apply the svm algo/classifier:

In [6]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
clf1 = SVC()
clf1.fit(X_train, y_train)
train_predict1 = clf1.predict(X_train)
test_predict1 = clf1.predict(X_test)
print("Train Accuracy: ", accuracy_score(train_predict1, y_train))
print("Test Accuracy: ", accuracy_score(test_predict1, y_test))
print("Confusion matrix:\n", confusion_matrix(test_predict1, y_test))

Train Accuracy:  0.998211091234347
Test Accuracy:  0.9714285714285714
Confusion matrix:
 [[84  1]
 [ 3 52]]


### 6. Apply random forest algo/classifier:

In [7]:
from sklearn.ensemble import RandomForestClassifier
clf2 = RandomForestClassifier()
clf2.fit(X_train, y_train)
train_predict2 = clf2.predict(X_train)
test_predict2 = clf2.predict(X_test)
print("Train Accuracy: ", accuracy_score(train_predict2, y_train))
print("Test Accuracy: ", accuracy_score(test_predict2, y_test))
print("Confusion matrix:\n", confusion_matrix(test_predict2, y_test))

Train Accuracy:  0.9964221824686941
Test Accuracy:  0.9642857142857143
Confusion matrix:
 [[86  4]
 [ 1 49]]
