### Lab 7 Classification

<section> Cheng FEI (cf482) </section>

In [210]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

### Load data

In [179]:
# Load scikit-learn's bundled breast cancer dataset; later cells read its
# 'feature_names', 'data' and 'target' entries.
data = load_breast_cancer()

In [180]:
# Wrap the raw feature matrix in a DataFrame whose columns carry the feature names.
cancer_df = pd.DataFrame(data.data, columns=data.feature_names)

In [181]:
# Append the class labels as an extra 'target' column next to the features.
cancer_df['target'] = data.target

In [182]:
# Dependent variable: the class label column as a NumPy array.
y = cancer_df['target'].to_numpy()

In [183]:
# Independent variables: the three features used throughout this lab
# (column positions 0, 1 and 4 of the dataset), as a NumPy array.
x = cancer_df[['mean radius', 'mean texture', 'mean smoothness']].to_numpy()

### Preprocess data

In [184]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=0,
    test_size=0.3,
)

In [185]:
# Fit the scaler on the training split only, then apply the same scaling to
# both splits so no information from the test rows reaches the scaler.
# NOTE: x_train / x_test are overwritten here, so re-running this cell on its
# own would refit the scaler on already-scaled data; re-run from the split cell.
scx = StandardScaler().fit(x_train)
x_train = scx.transform(x_train)
x_test = scx.transform(x_test)

### Initialize results DataFrame

In [186]:
# Empty summary table: one row per classifier, one column per reported item.
# Cells start out as NaN and are filled in by the sections below.
reg_results = pd.DataFrame(
    columns=[
        'X',
        'Y',
        'Accuracy score (training set)',
        'Accuracy score (test set)',
        'Input variables',
        'Actual value',
        'Predicted value',
    ],
    index=[
        'Logistic Regression',
        'K-nn',
        'SVM_linear',
        'SVM_rbf',
        'Naive Bayes',
        'Decision Tree',
        'Random Forest',
    ],
)

In [187]:
# Fill in the experiment description shared by every classifier row.
# Columns are addressed by label instead of by position, so these assignments
# keep targeting the right column even if the table layout changes.
reg_results['X'] = 'mean radius, mean texture, mean smoothness'
reg_results['Y'] = 'target'
reg_results['Input variables'] = '17.99, 10.38, 0.11840'
# Stored as an integer (not the string '0') so it can be compared directly
# with the class labels that the sections below write to 'Predicted value'.
reg_results['Actual value'] = 0

### 1. Logistic Regression

In [188]:
# Logistic regression with default settings, trained on the scaled training split.
classifier1 = LogisticRegression().fit(x_train, y_train)
classifier1

LogisticRegression()

In [189]:
# Confusion matrix of the logistic regression on the training split.
cm1_train = confusion_matrix(y_true=y_train, y_pred=classifier1.predict(x_train))
print(cm1_train)

[[134  15]
 [ 11 238]]


In [190]:
# Confusion matrix of the logistic regression on the held-out test split.
cm1_test = confusion_matrix(y_true=y_test, y_pred=classifier1.predict(x_test))
cm1_test

array([[ 55,   8],
       [  6, 102]])

In [191]:
# Record the logistic regression accuracy on both splits in its summary row.
reg_results.loc['Logistic Regression', 'Accuracy score (training set)'] = accuracy_score(
    y_true=y_train, y_pred=classifier1.predict(x_train)
)
reg_results.loc['Logistic Regression', 'Accuracy score (test set)'] = accuracy_score(
    y_true=y_test, y_pred=classifier1.predict(x_test)
)

In [192]:
# Classify the single example observation; it is scaled with the fitted
# scaler first because the model was trained on scaled features.
reg_results.loc['Logistic Regression', 'Predicted value'] = classifier1.predict(
    scx.transform([[17.99, 10.38, 0.11840]])
)[0]

### 2. K-NN

In [193]:
# k-nearest neighbours with k = 5, trained on the scaled training split.
classifier2 = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
classifier2

KNeighborsClassifier()

In [194]:
# Record K-NN accuracy on both splits and its prediction for the example observation.
reg_results.loc['K-nn', 'Accuracy score (training set)'] = accuracy_score(
    y_true=y_train, y_pred=classifier2.predict(x_train)
)
reg_results.loc['K-nn', 'Accuracy score (test set)'] = accuracy_score(
    y_true=y_test, y_pred=classifier2.predict(x_test)
)
reg_results.loc['K-nn', 'Predicted value'] = classifier2.predict(
    scx.transform([[17.99, 10.38, 0.11840]])
)[0]

### 3. Linear SVM

In [195]:
# Support vector classifier with a linear kernel, trained on the scaled training split.
classifier3 = SVC(kernel='linear').fit(x_train, y_train)
classifier3

SVC(kernel='linear')

In [196]:
# Record linear-SVM accuracy on both splits and its prediction for the example observation.
reg_results.loc['SVM_linear', 'Accuracy score (training set)'] = accuracy_score(
    y_true=y_train, y_pred=classifier3.predict(x_train)
)
reg_results.loc['SVM_linear', 'Accuracy score (test set)'] = accuracy_score(
    y_true=y_test, y_pred=classifier3.predict(x_test)
)
reg_results.loc['SVM_linear', 'Predicted value'] = classifier3.predict(
    scx.transform([[17.99, 10.38, 0.11840]])
)[0]

### 4. Gaussian (RBF kernel) SVM

In [198]:
# Support vector classifier with an RBF kernel, trained on the scaled training split.
classifier4 = SVC(kernel='rbf').fit(x_train, y_train)
classifier4

SVC()

In [199]:
# Record RBF-SVM accuracy on both splits and its prediction for the example observation.
reg_results.loc['SVM_rbf', 'Accuracy score (training set)'] = accuracy_score(
    y_true=y_train, y_pred=classifier4.predict(x_train)
)
reg_results.loc['SVM_rbf', 'Accuracy score (test set)'] = accuracy_score(
    y_true=y_test, y_pred=classifier4.predict(x_test)
)
reg_results.loc['SVM_rbf', 'Predicted value'] = classifier4.predict(
    scx.transform([[17.99, 10.38, 0.11840]])
)[0]

### 5. Naive Bayes

In [202]:
# Gaussian Naive Bayes: train, then record accuracy on both splits and the
# prediction for the example observation in the 'Naive Bayes' row.
classifier5 = GaussianNB().fit(x_train, y_train)
reg_results.loc['Naive Bayes', 'Accuracy score (training set)'] = accuracy_score(
    y_true=y_train, y_pred=classifier5.predict(x_train)
)
reg_results.loc['Naive Bayes', 'Accuracy score (test set)'] = accuracy_score(
    y_true=y_test, y_pred=classifier5.predict(x_test)
)
reg_results.loc['Naive Bayes', 'Predicted value'] = classifier5.predict(
    scx.transform([[17.99, 10.38, 0.11840]])
)[0]

### 6. Decision Tree

In [204]:
# Decision tree (seeded for reproducibility): train, then record accuracy on
# both splits and the prediction for the example observation.
classifier6 = DecisionTreeClassifier(random_state=0).fit(x_train, y_train)
reg_results.loc['Decision Tree', 'Accuracy score (training set)'] = accuracy_score(
    y_true=y_train, y_pred=classifier6.predict(x_train)
)
reg_results.loc['Decision Tree', 'Accuracy score (test set)'] = accuracy_score(
    y_true=y_test, y_pred=classifier6.predict(x_test)
)
reg_results.loc['Decision Tree', 'Predicted value'] = classifier6.predict(
    scx.transform([[17.99, 10.38, 0.11840]])
)[0]

### 7. Random Forest

In [206]:
# Random forest (seeded for reproducibility): train, then record accuracy on
# both splits and the prediction for the example observation.
classifier7 = RandomForestClassifier(random_state=0).fit(x_train, y_train)
reg_results.loc['Random Forest', 'Accuracy score (training set)'] = accuracy_score(
    y_true=y_train, y_pred=classifier7.predict(x_train)
)
reg_results.loc['Random Forest', 'Accuracy score (test set)'] = accuracy_score(
    y_true=y_test, y_pred=classifier7.predict(x_test)
)
reg_results.loc['Random Forest', 'Predicted value'] = classifier7.predict(
    scx.transform([[17.99, 10.38, 0.11840]])
)[0]

In [211]:
# Export the comparison table as an Excel workbook.
# The destination is derived from the current user's home directory rather
# than a hard-coded '/Users/<name>/...' path, so the notebook also runs on
# machines other than the author's. Change OUTPUT_DIR to write elsewhere.
OUTPUT_DIR = Path.home() / 'downloads'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)  # to_excel does not create missing folders
reg_results.to_excel(OUTPUT_DIR / 'ClassifierPerformance.xlsx')

In [212]:
# Display the completed comparison table (one row per classifier).
reg_results

Unnamed: 0,X,Y,Accuracy score (training set),Accuracy score (test set),Input variables,Actual value,Predicted value
Logistic Regression,"mean radius, mean texture, mean smoothness",target,0.934673,0.918129,"17.99, 10.38, 0.11840",0,0
K-nn,"mean radius, mean texture, mean smoothness",target,0.937186,0.900585,"17.99, 10.38, 0.11840",0,1
SVM_linear,"mean radius, mean texture, mean smoothness",target,0.937186,0.912281,"17.99, 10.38, 0.11840",0,0
SVM_rbf,"mean radius, mean texture, mean smoothness",target,0.947236,0.929825,"17.99, 10.38, 0.11840",0,0
Naive Bayes,"mean radius, mean texture, mean smoothness",target,0.914573,0.929825,"17.99, 10.38, 0.11840",0,0
Decision Tree,"mean radius, mean texture, mean smoothness",target,1.0,0.906433,"17.99, 10.38, 0.11840",0,0
Random Forest,"mean radius, mean texture, mean smoothness",target,1.0,0.923977,"17.99, 10.38, 0.11840",0,0
