**This notebook contains code that answers a series of questions on Linear Regression, Model Validation, and Regularization.**

The CSV file `winequality_red.csv` used for this assignment contains wine quality information based on physicochemical tests.

In [1]:
from sklearn.linear_model import LinearRegression as LR

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the red-wine quality data; fields in the file are separated by ';'.
work = pd.read_csv(
    './data/winequality_red.csv',
    sep=';',
)

# Preview the first five rows to confirm the columns parsed as expected.
work.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [24]:
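# Question 1: linear regression on the standardized features vs. on the first two principal components (compare R^2)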

In [25]:
# Split the frame into response and predictors: 'quality' is the regression
# target (as a NumPy array); every other column is a feature.
wine_target = work['quality'].to_numpy()
wine_features = work.drop('quality', axis=1)

# Show the feature frame.
wine_features

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2


In [26]:
# Display the target array (the 'quality' column as a NumPy array) as a quick check.
wine_target

array([5, 5, 5, ..., 6, 5, 6])

In [27]:
from sklearn.preprocessing import StandardScaler as SS

# Standardize every feature column; the scaled matrix is what the PCA,
# regression and t-SNE cells for Questions 1, 2 and 4 consume.
ss = SS()
ss.fit(wine_features)
standard_features = ss.transform(wine_features)

In [28]:
from sklearn.decomposition import PCA

# No n_components is given, so the decomposition keeps every component of
# the standardized feature matrix.
pca = PCA()
pca_work = pca.fit_transform(standard_features)

# Share of the total variance explained by each component, in order.
print(pca.explained_variance_ratio_)

[0.28173931 0.1750827  0.1409585  0.11029387 0.08720837 0.05996439
 0.05307193 0.03845061 0.0313311  0.01648483 0.00541439]


In [29]:
# Baseline model: ordinary least squares on all standardized features.
# `LR` (LinearRegression) is already imported in the notebook's first cell,
# so it is not re-imported here.
lin_reg = LR()
lin_reg.fit(standard_features, wine_target)

# R^2 measured on the same data the model was fit on (in-sample).
lin_reg.score(standard_features, wine_target)

0.3605517030386882

In [30]:
# Keep only the first two principal-component scores (columns 0 and 1).
wine_2 = pca_work[:, 0:2]
wine_2

array([[-1.61952988,  0.45095009],
       [-0.79916993,  1.85655306],
       [-0.74847909,  0.88203886],
       ...,
       [-1.45612897,  0.31174559],
       [-2.27051793,  0.97979111],
       [-0.42697475, -0.53669021]])

In [31]:
# Reduced model: least squares on the two leading principal components only.
lin_reg_2 = LR().fit(wine_2, wine_target)

# In-sample R^2 of the two-component model.
lin_reg_2.score(wine_2, wine_target)

0.1617931176126587

In [32]:
# Question 2: in-sample mean squared error of the two models fit for Question 1

In [33]:
# In-sample predictions of the full-feature model.
y_pred = lin_reg.predict(standard_features)

# Residuals: observed quality minus predicted quality.
errors = wine_target - y_pred

# Mean squared error = average of the squared residuals.
mse = np.square(errors).mean()
mse

0.416767167221408

In [34]:

# In-sample predictions of the two-component model.
# NOTE: `y_pred`, `errors` and `mse` are rebound here, replacing the values
# computed for the full-feature model.
y_pred = lin_reg_2.predict(wine_2)

# Residuals: observed quality minus predicted quality.
errors = wine_target - y_pred

# Mean squared error = average of the squared residuals.
mse = np.mean(np.square(errors))
mse

0.5463101701546887

In [None]:
# Question 3: repeat the PCA / linear-regression comparison on the raw (unstandardized) features

In [35]:
from sklearn.decomposition import PCA

# Same decomposition as for Question 1, but on the raw (unstandardized)
# feature frame.
# NOTE: this rebinds `pca` and `pca_work`, replacing the results that were
# computed from the standardized features.
pca = PCA()
pca_work = pca.fit_transform(wine_features)

# Share of the total variance explained by each component, in order.
print(pca.explained_variance_ratio_)

[9.46576976e-01 4.83683046e-02 2.58917183e-03 1.51896798e-03
 8.73553990e-04 3.45607218e-05 1.93627614e-05 9.47278079e-06
 8.41376628e-06 1.21472798e-06 4.68762788e-10]


In [36]:
# Ordinary least squares on the raw (unstandardized) features.
# `LR` (LinearRegression) is already imported in the notebook's first cell,
# so it is not re-imported here.
# NOTE: this rebinds `lin_reg`, replacing the model that was fit on the
# standardized features.
lin_reg = LR()
lin_reg.fit(wine_features, wine_target)

# R^2 measured on the same data the model was fit on (in-sample).
lin_reg.score(wine_features, wine_target)

0.3605517030386882

In [37]:
# Keep only the first two principal-component scores (columns 0 and 1).
# NOTE: this rebinds `wine_2`, replacing the earlier two-column slice.
wine_2 = pca_work[:, 0:2]
wine_2

array([[-13.22490501,  -2.02389981],
       [ 22.03772361,   4.40832155],
       [  7.16267333,  -2.50146086],
       ...,
       [ -3.43135351,  14.27124349],
       [  1.13287834,  16.31616732],
       [ -3.8743766 ,   3.12634754]])

In [38]:
# Least squares on the two-column `wine_2` slice.
# NOTE: this rebinds `lin_reg_2`, replacing the earlier two-component model.
lin_reg_2 = LR().fit(wine_2, wine_target)

# In-sample R^2 of this two-component model.
lin_reg_2.score(wine_2, wine_target)

0.04397044200057165

In [None]:
# Question 4: linear regression on a t-SNE embedding of the standardized features

In [39]:
from sklearn.manifold import TSNE

In [40]:
# t-SNE with its default two output dimensions, spelled out explicitly;
# the seed is fixed so the embedding can be reproduced.
tsne = TSNE(n_components=2, random_state=146)

In [42]:
# Embed the standardized feature matrix with the t-SNE estimator configured above.
tsne_work = tsne.fit_transform(standard_features)



In [43]:
# Least squares using the t-SNE embedding as the predictors.
lin_reg_3 = LR().fit(tsne_work, wine_target)

# In-sample R^2 of the t-SNE-based model.
lin_reg_3.score(tsne_work, wine_target)

0.22672351247519673