# SVD (Singular Value Decomposition)

- **Uses:**
    1. Image Comprehension.
    2. Use for Linear Decomposion
    
- **Formulae:**
    - **A = UxSxVT**
    1. U = A.AT
    2. S(Sigma) = Root(eigen Value)
    3. VT = (AT.A)T

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# SVD Using Numpy

In [2]:
A = np.array([[-1,2,0],[2,0,1],[0,-2,1]])
A

array([[-1,  2,  0],
       [ 2,  0,  1],
       [ 0, -2,  1]])

In [3]:
# U,S,VT
from numpy.linalg import svd

U,S,VT = svd(A)

In [4]:
print(U)
print('Sigma',S)
print(VT)

[[-0.67028426 -0.14157742  0.72847433]
 [ 0.39926415 -0.89625169  0.19318659]
 [ 0.62554549  0.42034361  0.65727018]]
Sigma [3.15029268 2.05306258 0.92767992]
[[ 0.46624638 -0.8226726   0.32530617]
 [-0.80412841 -0.54739786 -0.23180398]
 [-0.36877068  0.15351016  0.91675668]]


In [5]:
sigma = np.diag(S)
sigma

array([[3.15029268, 0.        , 0.        ],
       [0.        , 2.05306258, 0.        ],
       [0.        , 0.        , 0.92767992]])

In [6]:

t = U.dot(sigma)
t = A.dot(VT.T)
t

array([[-2.11159159, -0.2906673 ,  0.67579101],
       [ 1.25779892, -1.8400608 ,  0.17921532],
       [ 1.97065138,  0.86299174,  0.60973635]])

# SVD Using Scikit-learn

# Truncated SVD

In [7]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD()
svd.fit(A)
T = svd.transform(A)
T

array([[ 2.11159159,  0.2906673 ],
       [-1.25779892,  1.8400608 ],
       [-1.97065138, -0.86299174]])

# Randomized SVD

In [8]:
from sklearn.utils.extmath import randomized_svd

u,s,vt = randomized_svd(A,n_components=2)

In [9]:
print(u)
print(np.diag(s))
print(vt)

[[ 0.67028426  0.14157742]
 [-0.39926415  0.89625169]
 [-0.62554549 -0.42034361]]
[[3.15029268 0.        ]
 [0.         2.05306258]]
[[-0.46624638  0.8226726  -0.32530617]
 [ 0.80412841  0.54739786  0.23180398]]


In [10]:
T  = u.dot(np.diag(s))
T = A.dot(vt.T)
T

array([[ 2.11159159,  0.2906673 ],
       [-1.25779892,  1.8400608 ],
       [-1.97065138, -0.86299174]])

# SVD On Diabetes dataset

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [12]:
df = pd.read_csv('..\\Datasets\\diabetes.csv')

In [13]:

df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [14]:
x = df.drop(columns = ['Outcome'])
y = df['Outcome']

In [15]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.2)

In [16]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD()
xtrain = svd.fit_transform(xtrain)
xtest = svd.fit_transform(xtest)


In [21]:
model = LogisticRegression()
model.fit(xtrain,ytrain)
ypred = model.predict(xtest)
score_train = model.score(xtrain,ytrain)
print("Train Score: ",score_train)
score_test = model.score(xtest,ytest)
print("Test Score: ",score_test)

Train Score:  0.744299674267101
Test Score:  0.7012987012987013


In [22]:
from sklearn.metrics import accuracy_score
print('Accuracy: ',accuracy_score(ytest,ypred))

Accuracy:  0.7012987012987013


In [23]:
from sklearn.pipeline import Pipeline

In [35]:
comps = list(range(2,8))
# n_components : int, default=2
# Desired dimensionality of output data.
# Must be strictly less than the number of features.
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.2)
for i in comps:
    model = Pipeline([('svd',TruncatedSVD(n_components=i)),('lr',LogisticRegression())])
    model_fit = model.fit(xtrain,ytrain)
    ypred = model_fit.predict(xtest)
    print(f'with components {i}, Accuracy is: {accuracy_score(ytest,ypred)}')


with components 2, Accuracy is: 0.7207792207792207
with components 3, Accuracy is: 0.7142857142857143
with components 4, Accuracy is: 0.7207792207792207
with components 5, Accuracy is: 0.7467532467532467
with components 6, Accuracy is: 0.7597402597402597
with components 7, Accuracy is: 0.7662337662337663
