In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn.metrics import accuracy_score

Below, we read the .csv file using the standard one line of python code, and we print out df to see if it was properly created.

In [2]:
df = pd.read_csv("https://github.com/mpourhoma/CS4661/raw/master/Cancer.csv")

In [3]:
df[0::10]

Unnamed: 0,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Malignant_Cancer
0,5,1,1,1,2,1,3,1,1,0
10,5,3,3,3,2,3,4,4,1,1
20,5,4,4,9,2,10,5,6,1,1
30,9,5,8,1,2,3,2,1,5,1
40,5,3,5,5,3,3,4,10,1,1
50,5,1,3,1,2,1,2,1,1,0
60,2,2,2,1,1,1,7,1,1,0
70,1,1,1,1,2,1,3,1,1,0
80,10,3,5,1,10,5,3,10,2,1
90,1,3,1,2,2,2,5,3,2,0


Next, we create our x and y tables. We then print the new data sets to make sure they were properly constructed.

In [4]:
feat_cols = ['Clump_Thickness','Uniformity_of_Cell_Size','Uniformity_of_Cell_Shape','Marginal_Adhesion','Single_Epithelial_Cell_Size','Bare_Nuclei','Bland_Chromatin','Normal_Nucleoli','Mitoses']
x = df[feat_cols]
y = df['Malignant_Cancer']

In [5]:
print(x.shape)
x

(150, 9)


Unnamed: 0,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses
0,5,1,1,1,2,1,3,1,1
1,5,4,4,5,7,10,3,2,1
2,3,1,1,1,2,2,3,1,1
3,6,8,8,1,3,4,3,7,1
4,4,1,1,3,2,1,3,1,1
...,...,...,...,...,...,...,...,...,...
145,3,1,1,1,2,1,2,1,1
146,9,7,7,5,5,10,7,8,3
147,10,8,8,4,10,10,8,1,1
148,1,1,1,1,2,1,3,1,1


In [6]:
print(y.shape)
y

(150,)


0      0
1      0
2      0
3      0
4      0
      ..
145    0
146    1
147    1
148    0
149    0
Name: Malignant_Cancer, Length: 150, dtype: int64

We now split our dataset into training and testing sets

In [7]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.35, random_state = 3)

Next, we use the the training and testing sets with the decision tree classifier

In [8]:
myDT = DecisionTreeClassifier(random_state=5)
myDT.fit(x_train, y_train)
y_predict = myDT.predict(x_test)
accuracy = accuracy_score(y_test,y_predict)
print(accuracy)

0.8301886792452831


Next we perform bagging on a decision tree, using the orignal testing set.

In [12]:
baggerPredictions = []

#def predicter():

for i in range(0,19):
    bootstrap_size = int(0.8*len(x_train))
    xbag, ybag = resample(x_train, y_train, n_samples = bootstrap_size , random_state=i , replace = True) 
    Dtree = DecisionTreeClassifier(random_state=3)
    Dtree.fit(xbag, ybag)
    y_predict = Dtree.predict(x_test)
    #accuracy = accuracy_score(y_test, y_predict)
    #row = [y_predict]
    baggerPredictions.append(y_predict)
    #print(y_predict)
    #print(accuracy)

#print(baggerPredictions)

new_df = pd.DataFrame(baggerPredictions)
new_df = new_df.transpose()
new_df['Final_Vote'] = new_df.mode(numeric_only = True, axis = 1)
#final_predict = new_df['Final_Vote'].values
#Need to figure out accuracy of bagging
new_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,Final_Vote
0,1,1,1,0,1,1,1,0,1,1,1,0,0,0,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
2,1,1,0,0,1,1,1,1,0,1,1,1,0,1,1,1,0,1,1,1
3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
4,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
7,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
8,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1
9,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


Lastly, we use Random forest to predict on our testing models and print out the accuracy.

In [30]:
myRF = RandomForestClassifier(n_estimators = 19, bootstrap = True, random_state = 3)
myRF.fit(x_train, y_train)
y_predict = myRF.predict(x_test)
accuracy = accuracy_score(y_test, y_predict)

print(accuracy)

0.9245283018867925
