In [75]:
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import r2_score, confusion_matrix, accuracy_score,mean_squared_error
from statsmodels.stats.stattools import durbin_watson
from statsmodels.stats.diagnostic import het_breuschpagan
from scipy.special import boxcox,boxcox1p, inv_boxcox
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import SelectKBest, chi2, f_classif

In [2]:
# Load the breast-cancer table from the CSV located next to this notebook.
csv_path = 'breast_cancer.csv'
df = pd.read_csv(csv_path)

In [3]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [4]:
# Print column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [36]:
# Split the frame into the target (`diagnosis`) and the remaining columns.
y = df['diagnosis']
x = df.drop(columns='diagnosis')

In [37]:
# Remove the 'id' column from the predictors.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after 'id' is already gone no longer raises
# KeyError.
x = x.drop(columns='id', errors='ignore')

In [38]:
# Show only strong positive pairwise correlations (> 0.8). Every other entry,
# including values exactly equal to 1 (intended to hide the diagonal), is
# displayed as NaN.
# The correlation matrix is computed once rather than three times.
corr_matrix = x.corr()
corr_matrix[(corr_matrix > 0.8) & (corr_matrix != 1)]

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
radius_mean,,,0.997855,0.987357,,,,0.822529,,,...,0.969539,,0.965137,0.941082,,,,,,
texture_mean,,,,,,,,,,,...,,0.912045,,,,,,,,
perimeter_mean,0.997855,,,0.986507,,,,0.850977,,,...,0.969476,,0.970387,0.94155,,,,,,
area_mean,0.987357,,0.986507,,,,,0.823269,,,...,0.962746,,0.95912,0.959213,,,,,,
smoothness_mean,,,,,,,,,,,...,,,,,0.805324,,,,,
compactness_mean,,,,,,,0.883121,0.831135,,,...,,,,,,0.865809,0.816275,0.815573,,
concavity_mean,,,,,,0.883121,,0.921391,,,...,,,,,,,0.884103,0.861323,,
concave points_mean,0.822529,,0.850977,0.823269,,0.831135,0.921391,,,,...,0.830318,,0.855923,0.80963,,,,0.910155,,
symmetry_mean,,,,,,,,,,,...,,,,,,,,,,
fractal_dimension_mean,,,,,,,,,,,...,,,,,,,,,,


In [39]:
# Variance inflation factor of every predictor, one row per feature.
# NOTE(review): no intercept column is added to `x` before calling
# variance_inflation_factor — confirm that is intended.
vif_values = [variance_inflation_factor(x, col_idx) for col_idx in range(len(x.columns))]
df_vif = pd.DataFrame({'features': x.columns, 'vif': vif_values})

In [40]:
# Use the feature name as the row label of the VIF table.
df_vif = df_vif.set_index('features')

# Drop the 'id' row from the VIF table if it is present.
# 'id' is removed from `x` before the VIF table is built, so the label is
# normally absent; errors='ignore' prevents a KeyError in that case.
df_vif = df_vif.drop(index='id', errors='ignore')

In [42]:
# Order the table from the highest VIF to the lowest.
df_vif = df_vif.sort_values('vif', ascending=False)

In [43]:
# Display the VIF table.
df_vif

Unnamed: 0_level_0,vif
features,Unnamed: 1_level_1
radius_mean,63306.172036
perimeter_mean,58123.586079
radius_worst,9674.742602
perimeter_worst,4487.78127
area_mean,1287.262339
area_worst,1138.759252
fractal_dimension_mean,629.679874
fractal_dimension_worst,423.396723
smoothness_mean,393.398166
smoothness_worst,375.597155


#### Selecting features by chi-square

In [44]:
# Fit a univariate chi-square selector that keeps the two best-scoring features.
# NOTE(review): the selector is fitted on all of x/y, i.e. before the
# train/test split performed later, so test rows influence the feature choice.
chi_selector = SelectKBest(score_func=chi2, k=2)
model_chi = chi_selector.fit(x, y)

In [45]:
# Integer positions (within x's columns) of the features the chi-square selector kept.
cols = model_chi.get_support(indices=True)

In [46]:
# Display the selected column positions.
cols

array([ 3, 23])

In [47]:
# Keep only the selected columns (positional selection via `cols`).
x_new = x.iloc[:,cols]

In [48]:
# Preview the reduced feature set.
x_new.head()

Unnamed: 0,area_mean,area_worst
0,1001.0,2019.0
1,1326.0,1956.0
2,1203.0,1709.0
3,386.1,567.7
4,1297.0,1575.0


### Creating KNN Model

In [50]:
# Hold out 20% of the chi-square-selected data for testing (seeded split).
chi_split = train_test_split(x_new, y, train_size=0.8, random_state=100)
x_train, x_test, y_train, y_test = chi_split

##### Finding the best value of k

In [51]:
# Candidate neighbourhood sizes for the grid search: k = 1 … 50.
params = dict(n_neighbors=range(1, 51))

In [52]:
# Grid search over `params` using a default-configured KNN classifier.
model_gv = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=params)

In [53]:
# Run the search on the training split.
model_gv.fit(X=x_train, y=y_train)

GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': range(1, 51)})

In [54]:
# Parameter setting that scored best in the grid search.
model_gv.best_params_

{'n_neighbors': 5}

In [55]:
# Build the classifier from the grid-search result instead of a hand-copied
# constant, so the chosen k stays in sync if the data, split, or grid changes.
model_knn = KNeighborsClassifier(**model_gv.best_params_)

In [56]:
# Train the KNN classifier on the training split.
model_knn.fit(X=x_train, y=y_train)

KNeighborsClassifier()

In [57]:
# Predicted labels for the held-out split.
pred = model_knn.predict(X=x_test)

In [59]:
# Accuracy on the training split.
train_pred = model_knn.predict(x_train)
accuracy_score(y_train, train_pred)

0.9384615384615385

In [58]:
# Accuracy on the held-out split.
accuracy_score(y_true=y_test, y_pred=pred)

0.9473684210526315

#### Selecting features using ANOVA

In [61]:
# Fit an ANOVA F-test selector that keeps the two best-scoring features.
# NOTE(review): fitted on all of x/y, i.e. before the train/test split below.
anova_selector = SelectKBest(score_func=f_classif, k=2)
mod_aov = anova_selector.fit(x, y)

In [62]:
# Integer positions of the features the ANOVA selector kept (overwrites the earlier `cols`).
cols = mod_aov.get_support(indices=True)

In [63]:
# Display the selected column positions.
cols

array([22, 27])

In [64]:
# Keep only the ANOVA-selected columns (positional selection via `cols`).
x_newer = x.iloc[:,cols]

In [65]:
# Hold out 20% of the ANOVA-selected data for testing (seeded split).
# This rebinds x_train/x_test/y_train/y_test from the earlier split.
aov_split = train_test_split(x_newer, y, train_size=0.8, random_state=100)
x_train, x_test, y_train, y_test = aov_split

In [66]:
# Fresh grid search over `params` for the ANOVA-selected features.
mod_gv = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=params)

In [68]:
# Run the search on the training split.
mod_gv.fit(X=x_train, y=y_train)

GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': range(1, 51)})

In [69]:
# Parameter setting that scored best in the grid search.
mod_gv.best_params_

{'n_neighbors': 22}

In [70]:
# Train KNN with the k chosen by the grid search rather than a hand-copied
# constant, so the model follows the search result if data or grid changes.
mod_knn_new = KNeighborsClassifier(**mod_gv.best_params_).fit(x_train,y_train)

In [71]:
# Predicted labels for the held-out split.
pred_new = mod_knn_new.predict(X=x_test)

In [73]:
# Accuracy on the training split.
train_pred_new = mod_knn_new.predict(x_train)
accuracy_score(y_train, train_pred_new)

0.9142857142857143

In [72]:
# Accuracy on the held-out split.
accuracy_score(y_true=y_test, y_pred=pred_new)

0.9385964912280702

###### PCA dimensionality reduction

In [90]:
# Fit a PCA that keeps enough components to explain 95% of the variance.
# `x_scaled` is (re)computed here because this cell sits above the scaling
# cell; without it, Restart & Run All fails with NameError on `x_scaled`.
# PCA is unsupervised, so the target is not passed to fit().
x_scaled = MinMaxScaler().fit_transform(x)
mod_pca = PCA(n_components=0.95).fit(x_scaled)

##### Scaling

In [76]:
# Rescale every predictor with a min-max scaler.
# NOTE(review): the scaler is fitted on all rows, before the train/test split.
minmax_scaler = MinMaxScaler()
x_scaled = minmax_scaler.fit_transform(x)

In [80]:
# Display the scaled feature array.
x_scaled

array([[0.52103744, 0.0226581 , 0.54598853, ..., 0.91202749, 0.59846245,
        0.41886396],
       [0.64314449, 0.27257355, 0.61578329, ..., 0.63917526, 0.23358959,
        0.22287813],
       [0.60149557, 0.3902604 , 0.59574321, ..., 0.83505155, 0.40370589,
        0.21343303],
       ...,
       [0.45525108, 0.62123774, 0.44578813, ..., 0.48728522, 0.12872068,
        0.1519087 ],
       [0.64456434, 0.66351031, 0.66553797, ..., 0.91065292, 0.49714173,
        0.45231536],
       [0.03686876, 0.50152181, 0.02853984, ..., 0.        , 0.25744136,
        0.10068215]])

In [91]:
# Fraction of total variance explained by each retained component.
mod_pca.explained_variance_ratio_

array([0.53097689, 0.1728349 , 0.07114442, 0.06411259, 0.04086072,
       0.03071494, 0.01580837, 0.01191472, 0.00988429, 0.00945446])

In [83]:
# Total share of variance captured by the retained components.
variance_ratios = mod_pca.explained_variance_ratio_
np.sum(variance_ratios)

0.9577063058117199

In [84]:
# Project the scaled data with the PCA already fitted as `mod_pca` instead of
# fitting a second, identical PCA. The components inspected above are then
# exactly the ones used for modelling.
# NOTE(review): `pca` holds the transformed data array, not the PCA model.
pca = mod_pca.transform(x_scaled)

In [96]:
# Hold out 20% of the PCA-projected data for testing (seeded split).
# This rebinds x_train/x_test/y_train/y_test from the earlier split.
pca_split = train_test_split(pca, y, train_size=0.8, random_state=100)
x_train, x_test, y_train, y_test = pca_split

###### Best value of k

In [97]:
# Grid search over `params` on the PCA-projected training split (rebinds `mod_gv`).
mod_gv = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=params,
).fit(X=x_train, y=y_train)

In [98]:
# Parameter setting that scored best in the grid search.
mod_gv.best_params_

{'n_neighbors': 7}

In [99]:
# Train KNN with the k chosen by the grid search rather than a hand-copied
# constant, so the model follows the search result if data or grid changes.
model_knn_new = KNeighborsClassifier(**mod_gv.best_params_).fit(x_train,y_train)

In [101]:
# Predicted labels for the held-out split.
pred_newer = model_knn_new.predict(X=x_test)

In [104]:
# Predicted labels for the training split, used for the train-accuracy check.
pred_newer_but_train = model_knn_new.predict(X=x_train)

In [105]:
# Accuracy on the training split.
accuracy_score(y_true=y_train, y_pred=pred_newer_but_train)

0.9802197802197802

In [102]:
# Accuracy on the held-out split.
accuracy_score(y_true=y_test, y_pred=pred_newer)

0.9649122807017544

In [106]:
# Confusion matrix on the held-out split (rows: true labels, columns: predicted).
confusion_matrix(y_true=y_test, y_pred=pred_newer)

array([[65,  0],
       [ 4, 45]])