In [11]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [13]:
data = pd.read_csv('breast_cancer.csv')

In [15]:
data.head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


# Preprocessing

In [18]:
data['Class'] = data['Class'].map({2: 0, 4: 1}) # 0 for benign, 1 for malignant
data.head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,0
1,5,4,4,5,7,10,3,2,1,0
2,3,1,1,1,2,2,3,1,1,0
3,6,8,8,1,3,4,3,7,1,0
4,4,1,1,3,2,1,3,1,1,0


In [20]:
data.columns

Index(['Clump Thickness', 'Uniformity of Cell Size',
       'Uniformity of Cell Shape', 'Marginal Adhesion',
       'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
       'Normal Nucleoli', 'Mitoses', 'Class'],
      dtype='object')

In [21]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

vars = data[['Clump Thickness', 'Uniformity of Cell Size',
       'Uniformity of Cell Shape', 'Marginal Adhesion',
       'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
       'Normal Nucleoli', 'Mitoses']]

vif = pd.DataFrame()
vif['VIF'] = [variance_inflation_factor(vars.values, i) for i in range(vars.shape[1])]
vif['Features'] = vars.columns

In [23]:
vif

Unnamed: 0,VIF,Features
0,5.19438,Clump Thickness
1,13.991156,Uniformity of Cell Size
2,14.13173,Uniformity of Cell Shape
3,4.874432,Marginal Adhesion
4,6.879037,Single Epithelial Cell Size
5,4.98447,Bare Nuclei
6,7.526238,Bland Chromatin
7,4.534766,Normal Nucleoli
8,2.560638,Mitoses


# Regression

In [25]:
# Declare x and y
X = data.drop(['Class'], axis=1)
y = data['Class']

In [29]:
# Feature Scaling
scaler = StandardScaler()
scaler.fit(X)

In [31]:
X = scaler.transform(X)

In [33]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
model = LogisticRegression(penalty='l2', C=1.0)
model.fit(X_train, y_train)

In [69]:
# Testing
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_pred = model.predict(X_test)

In [71]:
acc = accuracy_score(y_test, y_pred)
print(f'Accuracy: {acc}')

Accuracy: 0.9562043795620438


In [73]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.99      0.96        79
           1       0.98      0.91      0.95        58

    accuracy                           0.96       137
   macro avg       0.96      0.95      0.95       137
weighted avg       0.96      0.96      0.96       137



In [74]:
print(confusion_matrix(y_test, y_pred))

[[78  1]
 [ 5 53]]


## KNN

In [78]:
from sklearn.neighbors import KNeighborsClassifier

error_rates = []

for i in range(1, 10):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    error_rates.append(1 - acc)

best_k = error_rates.index(min(error_rates)) + 1

print('Best k:', best_k)

Best k: 3


In [80]:
knn = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, y_train)

In [82]:
y_predk = knn.predict(X_test)

In [84]:
accuracy_score(y_test, y_predk)

0.9635036496350365

In [86]:
data.tail()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
678,3,1,1,1,3,2,1,1,1,0
679,2,1,1,1,2,1,1,1,1,0
680,5,10,10,3,7,3,8,10,2,1
681,4,8,6,4,3,4,10,6,1,1
682,4,8,8,5,4,5,10,4,1,1


In [88]:
new_df = pd.DataFrame({'Clump Thickness': [3,2,5,4,4], 'Uniformity of Cell Size': [1,1,10,8,8],
                       'Uniformity of Cell Shape': [1,1,10,6,8], 'Marginal Adhesion': [1,1,3,4,5], 
                       'Single Epithelial Cell Size': [3,2,7,3,4], 'Bare Nuclei': [2,1,3,4,5],
                       'Bland Chromatin': [1,1,8,10,10], 'Normal Nucleoli': [1,1,10,6,4], 'Mitoses': [1,1,2,1,1]})

new_df

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
0,3,1,1,1,3,2,1,1,1
1,2,1,1,1,2,1,1,1,1
2,5,10,10,3,7,3,8,10,2
3,4,8,6,4,3,4,10,6,1
4,4,8,8,5,4,5,10,4,1


In [90]:
scaled = scaler.transform(new_df)

new_df['KNN'] = knn.predict(scaled)
new_df['LogisticRegression'] = model.predict(scaled)

In [92]:
new_df

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,KNN,LogisticRegression
0,3,1,1,1,3,2,1,1,1,0,0
1,2,1,1,1,2,1,1,1,1,0,0
2,5,10,10,3,7,3,8,10,2,1,1
3,4,8,6,4,3,4,10,6,1,1,1
4,4,8,8,5,4,5,10,4,1,1,1


In [94]:
print("Unique values in y_train:", y_train.unique())
print("Unique values in y_test:", y_test.unique())

Unique values in y_train: [0 1]
Unique values in y_test: [1 0]


In [96]:
X.shape, X_train.shape, X_test.shape

((683, 9), (546, 9), (137, 9))

In [98]:
y.shape, y_train.shape, y_test.shape

((683,), (546,), (137,))