In [297]:
import numpy as np
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification


In [298]:
def get_dataset():
    x,y=make_classification(n_samples=1000,n_features=20,n_informative=15,n_redundant=2)
    return x,y

In [299]:
x,y=get_dataset()
x.shape

(1000, 20)

In [300]:
y.shape

(1000,)

In [301]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD

# get a list of  model to evaluate 
def get_models():
    models=dict()
    for i in range(20):
        steps=[('svd',TruncatedSVD(n_components=i)),('m',LogisticRegression())]
        models[str(i)]=Pipeline(steps=steps)
        return models

In [302]:
from sklearn.model_selection import RepeatedStratifiedKFold

#evaluate a given model using cross-validation

def evaluate_model(model,x,y):
    cv=RepeatedStratifiedKFold(n_splits=10,n_repeats=3,random_state=1)
    scores=cross_val_score(model,x,y,scoring='accuracy',cv=cv,n_jobs=-1,error_score=0)
    return scores

In [303]:
from sklearn.model_selection import cross_val_score
#define dataset
x,y=get_dataset()
#get the model to evaluate

models=get_models()
#evaluate the models and store result

results,names=list(),list()
for name, model in models.items():
    scores=evaluate_model(model,x,y)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name,mean(scores),std(scores)))

>0 0.000 (0.000)


In [304]:
model=get_models()

In [305]:
model

{'0': Pipeline(steps=[('svd', TruncatedSVD(n_components=0)),
                 ('m', LogisticRegression())])}

In [306]:
X,Y=make_classification(n_samples=100,n_features=20,n_informative=15,n_redundant=2)

In [307]:
X.shape

(100, 20)

In [308]:
Y.shape

(100,)

In [309]:
X

array([[ 8.81853579, -0.28572392, -0.42178516, ..., -0.14486222,
        -0.68505116,  3.68365917],
       [ 1.22932605, -1.84099719, -1.55298794, ..., -1.60437784,
        -0.98192084,  1.38139304],
       [-3.14090153,  1.24443666, -0.09382294, ..., -3.35078816,
         1.99088748, -2.10106765],
       ...,
       [-0.31460236, -2.13261527,  0.2114595 , ..., -2.26045933,
         0.51130068,  0.27716685],
       [ 1.71773494, -2.46610677, -2.17136418, ..., -1.67244181,
        -0.57457396,  7.1002087 ],
       [-2.36510027, -1.00273765, -1.32868303, ...,  3.9500971 ,
        -1.81980046, -5.16193311]])

In [310]:
Y

array([1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0])

In [311]:
svd=TruncatedSVD()
svd.fit(X)
transformed=svd.transform(X)

In [312]:
transformed

array([[  8.49939835,  -4.36663645],
       [  2.98260835,   7.39545779],
       [ -3.49724919,  -1.88901402],
       [ -5.95343807,   6.89069109],
       [  9.38989109,   2.46187766],
       [ -4.00611912,   8.45930015],
       [  5.21326729,   4.9812641 ],
       [ -0.29869647,  -2.74279199],
       [ -1.53482588,   3.30493154],
       [  3.57473653,   0.35942271],
       [ 21.07219241,   1.53281492],
       [ -1.10376075,   8.09922627],
       [ -3.57539675,   4.61946932],
       [ -1.79180215,  -7.25246003],
       [  3.65933776,   0.50239711],
       [-11.46268813,  -5.90667365],
       [  4.52470352,   9.12710628],
       [ -3.81880215,   2.55766456],
       [  5.10515075, -11.50368939],
       [  7.30747336,  -5.15802717],
       [  9.08771072,   2.56034485],
       [  8.00322318,  -7.08540365],
       [ -2.22290633,   1.27286894],
       [ -1.90181292,  -0.59503075],
       [  0.40841465,   1.75468028],
       [ -0.58339831,   0.06775726],
       [ -1.66150087,  -0.03690586],
 

In [313]:
len(transformed)

100

In [314]:
Y

array([1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0])

In [315]:
model=LogisticRegression()
model.fit(transformed,Y)

LogisticRegression()

In [316]:
Y_pred=model.predict(transformed)

In [317]:

model.score(transformed,Y)

0.63

In [318]:
from sklearn.metrics import mean_absolute_error
m_a_e=mean_absolute_error(Y_pred,Y)

In [319]:
m_a_e

0.37



# Performing SVD on wine Quality Dataset

In [320]:
import pandas as pd
df1=pd.read_csv('winequalityN.csv')

In [321]:
df1.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [322]:
df1.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,6487.0,6489.0,6494.0,6495.0,6495.0,6497.0,6497.0,6497.0,6488.0,6493.0,6497.0,6497.0
mean,7.216579,0.339691,0.318722,5.444326,0.056042,30.525319,115.744574,0.994697,3.218395,0.531215,10.491801,5.818378
std,1.29675,0.164649,0.145265,4.758125,0.035036,17.7494,56.521855,0.002999,0.160748,0.148814,1.192712,0.873255
min,3.8,0.08,0.0,0.6,0.009,1.0,6.0,0.98711,2.72,0.22,8.0,3.0
25%,6.4,0.23,0.25,1.8,0.038,17.0,77.0,0.99234,3.11,0.43,9.5,5.0
50%,7.0,0.29,0.31,3.0,0.047,29.0,118.0,0.99489,3.21,0.51,10.3,6.0
75%,7.7,0.4,0.39,8.1,0.065,41.0,156.0,0.99699,3.32,0.6,11.3,6.0
max,15.9,1.58,1.66,65.8,0.611,289.0,440.0,1.03898,4.01,2.0,14.9,9.0


In [323]:
df1['quality'].value_counts()

6    2836
5    2138
7    1079
4     216
8     193
3      30
9       5
Name: quality, dtype: int64

# Assingning quality as per the number given

In [324]:
# quality_key={3:'low',4:'low',5:'low',6:'medium',7:'medium',8:'high',9:'high'}

In [325]:
# df1['quality']=df1['quality'].map(quality_key)

In [326]:
df1.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [327]:
df1['type'].value_counts()


white    4898
red      1599
Name: type, dtype: int64

In [328]:
sal=df1['type']
sal=np.array(sal).reshape(-1,1)
sal

array([['white'],
       ['white'],
       ['white'],
       ...,
       ['red'],
       ['red'],
       ['red']], dtype=object)

# One hot encoding the type column

In [329]:
from sklearn.preprocessing import OneHotEncoder
encoding=OneHotEncoder()
OHE_data=encoding.fit_transform(df1.type.values.reshape(-1,1)).toarray()

In [330]:

#OHE_data=np.transpose(OHE_data)
#print("one hot encoded ",OHE_data)
OHE_data



array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [331]:
len(OHE_data)

6497

In [332]:
df1['type']=OHE_data

In [333]:
df1.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0.0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,0.0,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,0.0,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,0.0,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,0.0,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [334]:
df1.tail()



Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
6492,1.0,6.2,0.6,0.08,2.0,0.09,32.0,44.0,0.9949,3.45,0.58,10.5,5
6493,1.0,5.9,0.55,0.1,2.2,0.062,39.0,51.0,0.99512,3.52,,11.2,6
6494,1.0,6.3,0.51,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
6495,1.0,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5
6496,1.0,6.0,0.31,0.47,3.6,0.067,18.0,42.0,0.99549,3.39,0.66,11.0,6


In [335]:
df1['type'].value_counts()

0.0    4898
1.0    1599
Name: type, dtype: int64

# Missing values handling start¶

In [336]:
df1.type.isna().sum()

0

In [337]:
df1['fixed acidity'].isna().sum()

10

In [338]:
mean=df1['fixed acidity'].mean()
df1['fixed acidity'].fillna(mean,inplace=True)
df1['fixed acidity'].isna().sum()

0

In [339]:
df1['volatile acidity'].isna().sum()

8

In [340]:
mean=df1['volatile acidity'].mean()
df1['volatile acidity'].fillna(mean,inplace=True)
df1['volatile acidity'].isna().sum()

0

In [341]:
df1['citric acid'].isna().sum()

3

In [342]:
mean=df1['citric acid'].mean()
df1['citric acid'].fillna(mean,inplace=True)
df1['citric acid'].isna().sum()

0

In [343]:
df1['residual sugar'].isna().sum()

2

In [344]:
mean=df1['residual sugar'].mean()
df1['residual sugar'].fillna(mean,inplace=True)
df1['residual sugar'].isna().sum()

0

In [345]:
df1['chlorides'].isna().sum()

2

In [346]:
mean=df1['chlorides'].mean()
df1['chlorides'].fillna(mean,inplace=True)
df1['chlorides'].isna().sum()

0

In [347]:
df1['free sulfur dioxide'].isna().sum()

0

In [348]:
df1['total sulfur dioxide'].isna().sum()

0

In [349]:
df1['density'].isna().sum()

0

In [350]:
df1['pH'].isna().sum()

9

In [351]:
mean=df1['pH'].mean()
df1['pH'].fillna(mean,inplace=True)
df1['pH'].isna().sum()

0

In [352]:
df1['sulphates'].isna().sum()

4

In [353]:
mean=df1['sulphates'].mean()
df1['sulphates'].fillna(mean,inplace=True)
df1['sulphates'].isna().sum()

0

In [354]:
df1['alcohol'].isna().sum()

0

In [355]:
df1['quality'].isna().sum()

0

# Handling missing values done

In [356]:
df1.describe()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0
mean,0.246114,7.216579,0.339691,0.318722,5.444326,0.056042,30.525319,115.744574,0.994697,3.218395,0.531215,10.491801,5.818378
std,0.430779,1.295751,0.164548,0.145231,4.757392,0.035031,17.7494,56.521855,0.002999,0.160637,0.148768,1.192712,0.873255
min,0.0,3.8,0.08,0.0,0.6,0.009,1.0,6.0,0.98711,2.72,0.22,8.0,3.0
25%,0.0,6.4,0.23,0.25,1.8,0.038,17.0,77.0,0.99234,3.11,0.43,9.5,5.0
50%,0.0,7.0,0.29,0.31,3.0,0.047,29.0,118.0,0.99489,3.21,0.51,10.3,6.0
75%,0.0,7.7,0.4,0.39,8.1,0.065,41.0,156.0,0.99699,3.32,0.6,11.3,6.0
max,1.0,15.9,1.58,1.66,65.8,0.611,289.0,440.0,1.03898,4.01,2.0,14.9,9.0


In [357]:
df1.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0.0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,0.0,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,0.0,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,0.0,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,0.0,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [358]:
x=df1.iloc[:500,:-1]
x.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0.0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,0.0,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,0.0,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,0.0,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
4,0.0,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9


In [359]:
from sklearn.preprocessing import StandardScaler
scalar=StandardScaler()
transformed=scalar.fit_transform(x)
len(transformed)
transformed.shape
transformed

array([[ 0.        ,  0.25456989, -0.18339045, ..., -1.43789577,
        -0.35528494, -1.20716534],
       [ 0.        , -0.70848945,  0.10006595, ...,  0.61535743,
         0.03726875, -0.56622501],
       [ 0.        ,  1.76794884, -0.08890499, ...,  0.34159034,
        -0.45342337, -0.01684757],
       ...,
       [ 0.        , -0.02058992, -0.56133233, ...,  0.54691566,
         0.52796087,  2.18066215],
       [ 0.        , -1.53396888,  0.43076509, ...,  0.41003211,
         1.70562195, -0.10841048],
       [ 0.        ,  0.5297297 ,  1.04492063, ..., -0.89036158,
         0.03726875, -1.39029115]])

In [360]:
y=df1.iloc[:500,-1]
y.head()

0    6
1    6
2    6
3    6
4    6
Name: quality, dtype: int64

In [361]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(transformed,y,test_size=0.3)

In [362]:
x_train.shape

(350, 12)

In [363]:
x_train.shape

(350, 12)

In [364]:
y_train.shape

(350,)

In [365]:
x_test.shape

(150, 12)

In [366]:
from sklearn.linear_model import LogisticRegression

In [429]:
model1=LogisticRegression()

In [430]:
model1.fit(x_train,y_train)

LogisticRegression()

In [431]:
train_acc=model1.score(x_train,y_train)
train_acc

0.5857142857142857

In [432]:
test_acc=model1.score(x_test,y_test)
test_acc

0.48

In [434]:
y_pred=model1.predict(x_test)
y_pred

array([5, 5, 6, 7, 5, 5, 7, 6, 5, 5, 7, 6, 6, 6, 6, 7, 6, 5, 5, 5, 5, 6,
       5, 6, 6, 6, 6, 5, 7, 7, 5, 6, 5, 6, 6, 5, 6, 6, 6, 6, 6, 7, 7, 5,
       5, 5, 6, 7, 6, 6, 7, 6, 5, 6, 7, 6, 5, 7, 6, 5, 6, 5, 6, 5, 6, 6,
       5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 6, 5, 5, 5, 5, 5, 5, 7, 6, 6, 6, 6,
       5, 5, 6, 6, 5, 5, 5, 6, 6, 6, 4, 6, 6, 5, 7, 7, 5, 6, 5, 8, 6, 6,
       6, 6, 6, 6, 7, 6, 5, 6, 7, 5, 5, 5, 6, 6, 6, 5, 5, 6, 7, 6, 7, 5,
       6, 5, 6, 6, 5, 5, 6, 5, 6, 6, 6, 5, 5, 6, 6, 7, 5, 5], dtype=int64)

In [435]:
from sklearn.metrics import mean_absolute_error 
print("model's error is ", mean_absolute_error(y_pred,y_test))

model's error is  0.6133333333333333


# Logistic regression done on entire dataset without applying SVD

# Now apply SVD on entire dataset

In [436]:
svd=TruncatedSVD()
svd.fit(x)
transformed1=svd.transform(x)

In [437]:
transformed1

array([[ 1.76978555e+02,  2.08768992e+00],
       [ 1.31810295e+02, -1.94372074e+01],
       [ 1.02338257e+02,  4.90576999e+00],
       [ 1.92443591e+02, -5.68054499e-01],
       [ 1.92443591e+02, -5.68054499e-01],
       [ 1.02338257e+02,  4.90576999e+00],
       [ 1.39866383e+02, -4.68961216e+00],
       [ 1.76978555e+02,  2.08768992e+00],
       [ 1.31810295e+02, -1.94372074e+01],
       [ 1.32534201e+02, -5.23281746e+00],
       [ 6.46977819e+01, -5.37808983e+00],
       [ 1.10577751e+02, -1.07626750e+01],
       [ 7.74254959e+01, -3.48672075e+00],
       [ 1.51016295e+02,  1.06287826e+01],
       [ 1.77963354e+02, -2.39429199e+00],
       [ 1.16092767e+02, -1.01461027e+00],
       [ 1.03903283e+02,  4.17152264e+00],
       [ 8.07343340e+01,  9.03792058e+00],
       [ 1.70313450e+02, -2.62756429e+01],
       [ 1.37989802e+02, -5.72628226e-02],
       [ 8.06925234e+01,  9.05490203e+00],
       [ 1.04246885e+02, -7.16105664e+00],
       [ 1.28923402e+02,  9.11457771e+00],
       [ 1.

In [438]:
len(transformed1)

500

In [439]:
transformed1.shape

(500, 2)

In [440]:
from sklearn.model_selection import train_test_split
x_train1,x_test1,y_train1,y_test1=train_test_split(transformed1,y,test_size=0.3)

In [441]:
x_train1.shape

(350, 2)

In [442]:
x_test1.shape

(150, 2)

In [443]:
y_train1.shape

(350,)

In [444]:
y_test1.shape

(150,)

In [445]:
from sklearn.linear_model import LinearRegression
model1=LogisticRegression()

In [446]:
model1.fit(x_train1,y_train1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [447]:
y_pred1=model1.predict(x_test1)
y_pred1

array([6, 6, 6, 5, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 5, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6,
       6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       5, 6, 6, 5, 6, 6, 6, 6, 5, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6], dtype=int64)

In [448]:
y_test1

187    5
323    6
21     7
40     6
462    5
      ..
197    5
390    7
269    6
267    5
169    5
Name: quality, Length: 150, dtype: int64

In [449]:
print("error",mean_absolute_error(y_pred1,y_test1))

error 0.62


In [450]:
model1.score(x_test1,y_test1)

0.44666666666666666

In [451]:

model1.score(x_train1,y_train1)

0.44285714285714284

# The SVD's Output is not performing well the accuracy was better in applying without SVD¶

# Now apply SVD on different Trauncate values and then check the accuracy

In [452]:
def truncate():
    
    svd=TruncatedSVD(n_components=int(input('enter a number')))
    svd.fit(x)
    transformed2=svd.transform(x)
    return transformed2

In [453]:
transformed2=truncate()
transformed2.shape

enter a number


ValueError: invalid literal for int() with base 10: ''

In [454]:
transformed2

array([[ 0.54382444,  0.19917898,  2.68702115, ..., -0.22967618,
         0.30614704,  0.83849438],
       [-0.43746953, -1.59836903, -0.62861719, ..., -0.08175695,
        -0.5428926 ,  0.5264464 ],
       [-1.07775837,  0.43451735,  0.04663664, ..., -0.04793152,
        -0.07896823, -1.44571025],
       ...,
       [ 0.03709308,  0.42603361, -1.22209808, ..., -0.22409323,
         0.71728112, -0.92905278],
       [ 0.43718839, -2.54951448, -0.71214363, ...,  0.80042912,
         0.46501258,  1.03501511],
       [ 1.0667406 ,  1.78547082,  0.52499686, ...,  2.09912946,
        -1.44104277, -0.62640879]])

In [455]:
scalar3=StandardScaler()
transformed2=scalar3.fit_transform(transformed2)

In [456]:
transformed2

array([[ 0.54382444,  0.19917898,  2.68702115, ..., -0.22967618,
         0.30614704,  0.83849438],
       [-0.43746953, -1.59836903, -0.62861719, ..., -0.08175695,
        -0.5428926 ,  0.5264464 ],
       [-1.07775837,  0.43451735,  0.04663664, ..., -0.04793152,
        -0.07896823, -1.44571025],
       ...,
       [ 0.03709308,  0.42603361, -1.22209808, ..., -0.22409323,
         0.71728112, -0.92905278],
       [ 0.43718839, -2.54951448, -0.71214363, ...,  0.80042912,
         0.46501258,  1.03501511],
       [ 1.0667406 ,  1.78547082,  0.52499686, ...,  2.09912946,
        -1.44104277, -0.62640879]])

In [457]:
x_train3,x_test3,y_train3,y_test3=train_test_split(transformed2,y,test_size=0.3)

In [458]:
x_train3.shape

(350, 10)

In [459]:
x_test3.shape

(150, 10)

In [460]:
y_train3.shape

(350,)

In [461]:
y_test3.shape

(150,)

In [462]:
model3=LogisticRegression()

In [463]:
model3.fit(x_train3,y_train3)

LogisticRegression()

In [464]:
y_pred3=model3.predict(x_test3)

In [465]:
y_pred3

array([6, 7, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 5, 6, 6,
       5, 6, 6, 6, 6, 5, 6, 6, 7, 6, 5, 5, 6, 6, 5, 6, 6, 6, 5, 5, 5, 6,
       6, 6, 6, 6, 6, 6, 7, 6, 7, 6, 6, 6, 6, 5, 6, 6, 6, 5, 6, 4, 6, 6,
       6, 6, 7, 6, 5, 6, 5, 5, 5, 6, 6, 7, 7, 6, 6, 5, 6, 6, 6, 6, 5, 6,
       6, 7, 5, 5, 5, 6, 6, 5, 5, 6, 7, 6, 5, 6, 6, 6, 5, 6, 6, 8, 6, 6,
       5, 5, 7, 6, 7, 6, 6, 6, 5, 7, 6, 6, 6, 5, 6, 7, 5, 6, 5, 6, 5, 6,
       6, 8, 5, 7, 6, 5, 6, 7, 6, 6, 5, 6, 6, 7, 7, 6, 6, 5], dtype=int64)

In [466]:
model3.score(x_train3,y_train3)

0.54

In [467]:
model3.score(x_test3,y_test3)

0.5533333333333333

In [468]:
print("error for model3 ",mean_absolute_error(y_pred3,y_test3))

error for model3  0.49333333333333335


# Analysis for all the trauncate values

In [469]:
#Only logistic without SVD
print("training accuracy",train_acc)
print('testing accuracy',test_acc)

training accuracy 0.5857142857142857
testing accuracy 0.48


In [470]:
# after applying svd on entire dataset
print('training accuracy',model1.score(x_train1,y_train1))
print('testing accuracy',model1.score(x_test1,y_test1))

training accuracy 0.44285714285714284
testing accuracy 0.44666666666666666
