# Importing Modules

In [12]:
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import StandardScaler              #for feature scaling
import numpy as np
from matplotlib import pyplot as plt
import time                                                   #to get the time taken while executing something

# Getting Breast Cancer Dataset

In [13]:
# Load the bundled breast-cancer dataset and separate features from labels.
ds = datasets.load_breast_cancer()
x, y = ds.data, ds.target   # feature matrix and target vector
x.shape                     # displayed as the cell output

(569, 30)

# Feature Scaling

In [14]:
# Feature scaling using StandardScaler.
# fit_transform() already returns the scaled array, so its result is kept
# instead of being discarded and recomputed with a second transform() call.
# NOTE: the scaler is fit on the full dataset, before the train/test split
# further down, so test-set statistics influence the scaling. Fitting on the
# training split only would avoid that leakage.
scaler=StandardScaler()
x=scaler.fit_transform(x)

# Splitting into Train and Test

In [15]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = tts(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

# Applying Logistic Regression without PCA

In [16]:
# Baseline: logistic regression trained on the scaled features without PCA.
clf1=LogisticRegression()
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() is wall-clock time and can be too coarse (or be
# adjusted by the system) for a fit that takes only a few milliseconds.
start=time.perf_counter()
clf1.fit(x_train,y_train)
end=time.perf_counter()
print("time taken for execution=",end-start)
y_pred1=clf1.predict(x_test)
clf1.score(x_test,y_test)                   #accuracy on the held-out test split

time taken for execution= 0.0039980411529541016




0.9649122807017544

# Deciding Optimal Value of n_components

In [17]:
# Fit a PCA that keeps every component, using the training split only, to
# inspect how the variance is distributed across components.
pca=PCA()
pca.fit(x_train)                            #only the fitted attributes are needed, not the projected data
total_variance=pca.explained_variance_.sum()
total_variance

29.62561145548469

In [18]:
# Smallest number of leading components whose cumulative explained variance
# reaches 99% of the total variance.
cumulative_variance = np.cumsum(pca.explained_variance_)
reached = np.flatnonzero(cumulative_variance / total_variance >= 0.99)
k = int(reached[0]) + 1                     #first qualifying position -> component count
cur_variance = cumulative_variance[k - 1]   #variance captured by those k components
k                                           #optimal value of k

17

# Applying PCA

In [19]:
# Rebuild the PCA so that it keeps only k components.
pca=PCA(n_components=k)
# The projection is learned from the training split only and then applied
# unchanged to the test split, so no test data is used when fitting it.
x_train_new=pca.fit_transform(x_train)
x_test_new=pca.transform(x_test)

# Logistic Regression after PCA

In [20]:
# Logistic regression trained on the PCA-reduced features.
clf2=LogisticRegression()
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() is wall-clock time and can be too coarse (or be
# adjusted by the system) for a fit that takes only a few milliseconds.
start=time.perf_counter()
clf2.fit(x_train_new,y_train)
end=time.perf_counter()
print("time taken for execution=",end-start)
y_pred2=clf2.predict(x_test_new)
clf2.score(x_test_new,y_test)               #accuracy on the held-out test split

time taken for execution= 0.003998994827270508




0.9649122807017544

In [21]:
# Note: the difference in training time is negligible here because the dataset is small (569 samples).
# With 10^4 or 10^5 data points, reducing the number of features with PCA might show a much larger difference.