In [121]:
# Package imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from testCases_v2 import *
import sklearn
import sklearn.datasets
import sklearn.linear_model
from planar_utils import plot_decision_boundary, sigmoid, load_planar_dataset, load_extra_datasets
from sklearn.preprocessing import normalize
%matplotlib inline

np.random.seed(1) # set a seed so that the results are consistent

In [122]:
from sklearn.datasets import load_breast_cancer

In [123]:
data = load_breast_cancer()

In [124]:
# Read the DataFrame, first using the feature data
df = pd.DataFrame(data.data, columns=data.feature_names)
# Add a target column, and fill it with the target data
df['target'] = data.target

In [125]:
#percentage of missing values in each variable
df.isnull().sum()/len(data)*100

mean radius                0.0
mean texture               0.0
mean perimeter             0.0
mean area                  0.0
mean smoothness            0.0
mean compactness           0.0
mean concavity             0.0
mean concave points        0.0
mean symmetry              0.0
mean fractal dimension     0.0
radius error               0.0
texture error              0.0
perimeter error            0.0
area error                 0.0
smoothness error           0.0
compactness error          0.0
concavity error            0.0
concave points error       0.0
symmetry error             0.0
fractal dimension error    0.0
worst radius               0.0
worst texture              0.0
worst perimeter            0.0
worst area                 0.0
worst smoothness           0.0
worst compactness          0.0
worst concavity            0.0
worst concave points       0.0
worst symmetry             0.0
worst fractal dimension    0.0
target                     0.0
dtype: float64

In [126]:
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [127]:
df.shape

(569, 31)

In [128]:
#data type of variables
df.dtypes

mean radius                float64
mean texture               float64
mean perimeter             float64
mean area                  float64
mean smoothness            float64
mean compactness           float64
mean concavity             float64
mean concave points        float64
mean symmetry              float64
mean fractal dimension     float64
radius error               float64
texture error              float64
perimeter error            float64
area error                 float64
smoothness error           float64
compactness error          float64
concavity error            float64
concave points error       float64
symmetry error             float64
fractal dimension error    float64
worst radius               float64
worst texture              float64
worst perimeter            float64
worst area                 float64
worst smoothness           float64
worst compactness          float64
worst concavity            float64
worst concave points       float64
worst symmetry      

In [129]:
df['target'].value_counts()

1    357
0    212
Name: target, dtype: int64

In [130]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
mean radius,569.0,14.127292,3.524049,6.981,11.7,13.37,15.78,28.11
mean texture,569.0,19.289649,4.301036,9.71,16.17,18.84,21.8,39.28
mean perimeter,569.0,91.969033,24.298981,43.79,75.17,86.24,104.1,188.5
mean area,569.0,654.889104,351.914129,143.5,420.3,551.1,782.7,2501.0
mean smoothness,569.0,0.09636,0.014064,0.05263,0.08637,0.09587,0.1053,0.1634
mean compactness,569.0,0.104341,0.052813,0.01938,0.06492,0.09263,0.1304,0.3454
mean concavity,569.0,0.088799,0.07972,0.0,0.02956,0.06154,0.1307,0.4268
mean concave points,569.0,0.048919,0.038803,0.0,0.02031,0.0335,0.074,0.2012
mean symmetry,569.0,0.181162,0.027414,0.106,0.1619,0.1792,0.1957,0.304
mean fractal dimension,569.0,0.062798,0.00706,0.04996,0.0577,0.06154,0.06612,0.09744


In [131]:
normalize = normalize(df)

In [132]:
df_scaled = pd.DataFrame(normalize)
df_scaled.var()

0     1.608613e-05
1     1.226958e-04
2     6.206041e-04
3     2.231692e-03
4     3.424886e-09
5     4.264362e-09
6     5.375150e-09
7     5.408622e-10
8     1.238935e-08
9     1.834541e-09
10    4.321926e-08
11    1.482714e-06
12    2.022068e-06
13    2.363228e-04
14    6.148503e-11
15    8.177024e-10
16    3.081973e-09
17    9.497845e-11
18    3.665304e-10
19    2.831018e-11
20    1.606982e-05
21    2.088535e-04
22    6.313733e-04
23    1.423262e-03
24    6.839613e-09
25    3.071902e-08
26    5.571761e-08
27    2.790645e-09
28    2.880653e-08
29    3.103877e-09
30    6.688050e-07
dtype: float64

In [133]:
#storing the variance and name of variables
variance = df_scaled.var()
columns = df.columns

In [134]:
#saving the names of variables having variance more than a threshold value

variable = [ ]

for i in range(0,len(variance)):
    if variance[i]>=0.0000006: #setting the threshold 
        variable.append(columns[i])

In [135]:
variable

['mean radius',
 'mean texture',
 'mean perimeter',
 'mean area',
 'texture error',
 'perimeter error',
 'area error',
 'worst radius',
 'worst texture',
 'worst perimeter',
 'worst area',
 'target']

In [136]:
# creating a new dataframe using the above variables
new_df = df[variable]
# first five rows of the new data
new_df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,texture error,perimeter error,area error,worst radius,worst texture,worst perimeter,worst area,target
0,17.99,10.38,122.8,1001.0,0.9053,8.589,153.4,25.38,17.33,184.6,2019.0,0
1,20.57,17.77,132.9,1326.0,0.7339,3.398,74.08,24.99,23.41,158.8,1956.0,0
2,19.69,21.25,130.0,1203.0,0.7869,4.585,94.03,23.57,25.53,152.5,1709.0,0
3,11.42,20.38,77.58,386.1,1.156,3.445,27.23,14.91,26.5,98.87,567.7,0
4,20.29,14.34,135.1,1297.0,0.7813,5.438,94.44,22.54,16.67,152.2,1575.0,0


In [137]:
#variance of variables in new data
new_df.var()

mean radius            12.418920
mean texture           18.498909
mean perimeter        590.440480
mean area          123843.554318
texture error           0.304316
perimeter error         4.087896
area error           2069.431583
worst radius           23.360224
worst texture          37.776483
worst perimeter      1129.130847
worst area         324167.385102
target                  0.234177
dtype: float64

In [138]:
# shape of new and original data
new_df.shape, df.shape

((569, 12), (569, 31))

In [139]:
# Add a target column, and fill it with the target data
new_df['target'] = data.target

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['target'] = data.target


# Model with selected variable

In [140]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(new_df.iloc[:,:-1], new_df.iloc[:,-1], random_state=42)

In [141]:
#Train
model=sklearn.linear_model.LogisticRegression()
model.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [142]:
# Print test accuracy
LR_predictions = model.predict(X_test)
print ('Accuracy of logistic regression: %d ' % float((np.dot(y_test,LR_predictions) + np.dot(1-y_test,1-LR_predictions))/float(y_test.size)*100) +
       '% ' + "(percentage of correctly labelled datapoints)")

Accuracy of logistic regression: 96 % (percentage of correctly labelled datapoints)


In [143]:
# Print train accuracy
LR_predictions_train = model.predict(X_train)
print ('Accuracy of logistic regression: %d ' % float((np.dot(y_train,LR_predictions_train) + np.dot(1-y_train,1-LR_predictions_train))/float(y_train.size)*100) +
       '% ' + "(percentage of correctly labelled datapoints)")

Accuracy of logistic regression: 95 % (percentage of correctly labelled datapoints)


In [144]:
#TRAIN SET
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
print(accuracy_score(y_train,model.predict(X_train)))
print(confusion_matrix(y_train,model.predict(X_train)))

0.9577464788732394
[[148  10]
 [  8 260]]


In [145]:
#TEST SET
print(accuracy_score(y_test,model.predict(X_test)))
print(confusion_matrix(y_test,model.predict(X_test)))

0.965034965034965
[[51  3]
 [ 2 87]]


In [146]:
df.shape

(569, 31)

# Model with all variable

In [147]:
#split
xtr,xte,ytr,yte=train_test_split(df.iloc[:,:-1],df.iloc[:,-1], random_state=42)

In [148]:
#training with all variable
model_all=sklearn.linear_model.LogisticRegression()
model_all.fit(xtr,ytr)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [149]:
#Accuracy train
print(accuracy_score(ytr,model_all.predict(xtr)))
print(confusion_matrix(ytr,model_all.predict(xtr)))

0.9413145539906104
[[142  16]
 [  9 259]]


In [150]:
#Accuracy test
print(accuracy_score(yte,model_all.predict(xte)))
print(confusion_matrix(yte,model_all.predict(xte)))

0.965034965034965
[[51  3]
 [ 2 87]]


# Our model complexity is reduced by feature selection,while using all the features i.e. 30 we get test accuracy of 96.5 whereas after dropping 18 variable we have reduced the complexity of the model without affecting the accuracy of the model which remains 96.5