Initializing.....

In [1]:
# Mount Google Drive at /content/drive; the CSV files read below live under that mount point.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
# Directory (inside the mounted Google Drive) that holds the input CSV files.
# Kept in one place so the notebook can be pointed at another location easily.
DATA_DIR="/content/drive/MyDrive/Colab Notebooks/data"
wine_data=pd.read_csv(DATA_DIR+"/X.csv")    # feature table
wine_target=pd.read_csv(DATA_DIR+"/T.csv")  # target table

# Columns used as regression inputs, in the order the weights are reported.
wine_features=[
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
]
X=wine_data[wine_features]
Y=wine_target

Define functions......

In [3]:
def split_DataFrame(X,N):
    """Split a pandas object into its first N rows and the remaining rows.

    Works for both DataFrame and Series: a single positional slice selects
    rows on either type, whereas the two-axis form `iloc[:N,:]` is rejected
    by a Series.

    Parameters:
        X: pandas DataFrame or Series to split.
        N: number of leading rows that go into the first part.

    Returns:
        (first_part, second_part), each of the same type as X.
    """
    df_1=X.iloc[:N]
    df_2=X.iloc[N:]
    return df_1, df_2

In [4]:
def predict(W0,W1,X):
    """Evaluate the linear model W1 . X + W0 for a single sample.

    Parameters:
        W0: intercept term added to the product.
        W1: weight vector; presumably 1-D with one entry per feature.
        X:  feature array of one sample; it is viewed as a column vector.

    Returns:
        The result of matmul(W1, X as column) + W0. For a 1-D W1 this is a
        one-element array.
    """
    column=X.reshape(len(X),1)  # (D,) -> (D,1) so matmul contracts over features
    weighted_sum=np.matmul(W1,column)
    return weighted_sum+W0

In [5]:
def Error_numpy(W0,W,X,Y):
    """Root-mean-square error of the linear model over a data set.

    Parameters:
        W0: intercept passed to predict.
        W:  weight vector passed to predict.
        X:  array indexed by sample (X[i] is the i-th sample's features).
        Y:  array of targets indexed the same way; Y.size is taken as the
            number of samples.

    Returns:
        sqrt(sum of squared residuals / Y.size). The value keeps the array
        shape produced by predict(...) - Y[i].
    """
    n_samples=Y.size
    # Residuals are squared one sample at a time and accumulated in order.
    squared_residuals=((predict(W0,W,X[k])-Y[k])**2 for k in range(n_samples))
    return np.sqrt(sum(squared_residuals)/n_samples)

In [6]:
def regression_M1(X,Y):
    """Fit a first-order (M=1) least-squares linear model with an intercept.

    Solves the normal equations W = pinv(A^T A) A^T Y, where A is the feature
    matrix with an extra column of ones appended for the intercept.

    Parameters:
        X: pandas DataFrame of features, one row per sample.
        Y: pandas object of targets with the same number of rows.

    Returns:
        (intercept, W): intercept is the last row of the solution; W holds the
        remaining coefficients flattened to 1-D (np.delete without an axis
        flattens its input).
    """
    X_numpy=X.to_numpy()  #for pandas.dataframe to numpy.ndarray
    Y_numpy=Y.to_numpy()
    N=len(X_numpy)

    # Column of ones appended last, so the last weight is the intercept.
    design=np.append(X_numpy,np.ones((N,1)),1)
#--------Calculating W-------
    design_trans=np.transpose(design)
    gram_pinv=np.linalg.pinv(np.matmul(design_trans,design))
    W=np.matmul(np.matmul(gram_pinv,design_trans),Y_numpy)

    intercept=W[-1]
    W=np.delete(W,-1)
    return (intercept, W)

------------Training and Testing stage-----------

In [7]:
# First 1500 rows are used for training, the remainder for testing.
X_train, X_test=split_DataFrame(X,1500)
Y_train, Y_test=split_DataFrame(Y,1500)

# Sanity check: print the shape of every partition, features first.
for part in (X_train, X_test, Y_train, Y_test):
    print(part.shape)


(1500, 11)
(99, 11)
(1500, 1)
(99, 1)


In [8]:
# Fit the M=1 model on the training split and report RMSE on both splits.
W0, W=regression_M1(X_train,Y_train)
print(W)
for split_name, X_part, Y_part in (("Training", X_train, Y_train), ("Test", X_test, Y_test)):
    print(split_name+" Error for M=1:",Error_numpy(W0, W, X_part.to_numpy(), Y_part.to_numpy()))

[ 2.68702621e-02 -1.12838019e+00 -2.06141685e-01  1.22000584e-02
 -1.77718503e+00  4.29357454e-03 -3.18953315e-03 -1.81795124e+01
 -3.98142390e-01  8.92474793e-01  2.77147239e-01]
Training Error for M=1: [0.64937841]
Test Error for M=1: [0.58611124]


Above are the errors of the "continuous estimated target".  
But quality is discrete!  
Let's see the error of the "discrete estimated target".

In [9]:
def discrete_Error_numpy(W0,W,X,Y):
    """Root-mean-square error after rounding each prediction to an integer.

    Same computation as the continuous RMSE, except every prediction is
    rounded with np.round before the residual is taken.

    Parameters:
        W0: intercept passed to predict.
        W:  weight vector passed to predict.
        X:  array indexed by sample (X[i] is the i-th sample's features).
        Y:  array of targets indexed the same way; Y.size is taken as the
            number of samples.

    Returns:
        sqrt(sum of squared rounded residuals / Y.size).
    """
    count=0
    N=Y.size

    for i in range(N):
        y_hat=predict(W0,W,X[i])
        y_val=y_hat.item()  # single-element prediction -> Python scalar
        # np.round is used instead of the np.round_ alias, which NumPy 2.0 removed.
        tmp=np.round(y_val)-Y[i] #rounding estimation into discrete
        count+=tmp**2
    return np.sqrt(count/N)

In [10]:
# RMSE of the M=1 model when predictions are rounded before scoring.
for split_name, X_part, Y_part in (("Training", X_train, Y_train), ("Test", X_test, Y_test)):
    print(split_name+" Discrete Error for M=1:",discrete_Error_numpy(W0, W, X_part.to_numpy(), Y_part.to_numpy()))

Training Discrete Error for M=1: [0.71039895]
Test Discrete Error for M=1: [0.63564173]


This is the error we would incur  
in a real quality-estimation problem.

For the M=2 part we define two more functions

In [11]:
def X_of_M2(X):
    """Build the second-order (M=2) feature matrix for a DataFrame.

    The result contains the D original columns followed by all D*D pairwise
    products x_i * x_j, ordered with i as the outer index and j as the inner
    index. Both (i,j) and (j,i) are included, so mixed products appear twice.

    Parameters:
        X: pandas DataFrame of features, one row per sample.

    Returns:
        numpy array of shape (N, D + D*D).
    """
    X_numpy=X.to_numpy()
    N, D=X_numpy.shape
    # Broadcasting gives products[n,i,j] = x[n,i]*x[n,j]; the C-order reshape
    # lays them out as column i*D+j, matching a nested i-then-j loop.
    products=(X_numpy[:,:,None]*X_numpy[:,None,:]).reshape(N,D*D)
    return np.append(X_numpy,products,1)

def regression_M2(X,Y):
    """Fit a second-order (M=2) least-squares model with an intercept.

    Expands X with X_of_M2, appends a column of ones, and solves the normal
    equations W = pinv(H^T H) H^T Y. The pseudo-inverse matters here: the
    expansion holds every mixed product twice, so H^T H is rank-deficient
    whenever there is more than one feature.

    Parameters:
        X: pandas DataFrame of features, one row per sample.
        Y: pandas object of targets with the same number of rows.

    Returns:
        (intercept, W): intercept is the last row of the solution; W holds the
        remaining coefficients flattened to 1-D, in the column order produced
        by X_of_M2.
    """
    H=X_of_M2(X)
    N=len(H)
    # Column of ones appended last, so the last weight is the intercept.
    H=np.append(H,np.ones((N,1)),1)
    Y_numpy=Y.to_numpy()
    H_trans=np.transpose(H)
    gram_pinv=np.linalg.pinv(np.matmul(H_trans,H))
    W=np.matmul(np.matmul(gram_pinv,H_trans),Y_numpy)
    intercept=W[-1]
    W=np.delete(W,-1)
    return intercept, W

In [12]:
# Fit the M=2 model, then expand both splits to the same feature layout for scoring.
W02, W2=regression_M2(X_train,Y_train)
H_train, H_test=X_of_M2(X_train), X_of_M2(X_test)

for split_name, H_part, Y_part in (("Training", H_train, Y_train), ("Test", H_test, Y_test)):
    print(split_name+" Error for M=2:",Error_numpy(W02, W2, H_part, Y_part.to_numpy()))


Training Error for M=2: [0.60803652]
Test Error for M=2: [0.69149784]


In [13]:
# These are the rounded-prediction (discrete) errors, so label them as such
# to keep them distinguishable from the continuous M=2 errors.
print("Training Discrete Error for M=2:",discrete_Error_numpy(W02, W2, H_train, Y_train.to_numpy()))
print("Test Discrete Error for M=2:",discrete_Error_numpy(W02, W2, H_test, Y_test.to_numpy()))


Training Error for M=2: [0.67872429]
Test Error for M=2: [0.77198419]


In the M=1 case: the Test Error is lower than the Training Error.  
In the M=2 case: the Test Error is much larger than the Training Error.  
I think over-fitting occurs in the M=2 case.  

In [14]:
# Side-by-side summary: for each model, continuous then discrete RMSE,
# each on the training split followed by the test split.
print("Overall Comparison")
for banner, tag, w0_model, w_model, train_inputs, test_inputs in (
    ("--------M=1--------", "M=1", W0, W, X_train.to_numpy(), X_test.to_numpy()),
    ("------M=2---------", "M=2", W02, W2, H_train, H_test),
):
    print(banner)
    for metric_name, metric in (("Error", Error_numpy), ("Discrete Error", discrete_Error_numpy)):
        print("Training "+metric_name+" for "+tag+":",metric(w0_model, w_model, train_inputs, Y_train.to_numpy()))
        print("Test "+metric_name+" for "+tag+":",metric(w0_model, w_model, test_inputs, Y_test.to_numpy()))




Overall Comparison
--------M=1--------
Training Error for M=1: [0.64937841]
Test Error for M=1: [0.58611124]
Training Discrete Error for M=1: [0.71039895]
Test Discrete Error for M=1: [0.63564173]
------M=2---------
Training Error for M=2: [0.60803652]
Test Error for M=2: [0.69149784]
Training Discrete Error for M=2: [0.67872429]
Test Discrete Error for M=2: [0.77198419]


**----------------------------------ABOVE IS HW1 2.1 (a)----------------------------------------**  
**----------------------------------BELOW IS HW1 2.1 (b)----------------------------------------**

If the most contributive feature is removed from the data,  
there should be a significant increase in error when we train and test.  
So I repeat this process for each feature  
and determine which one is the most contributive feature.

In [15]:
# Leave-one-feature-out study: refit the M=1 model with each feature removed
# and report the resulting training/test RMSE.
features=wine_data.columns
for f in features:
  # `columns=` replaces the positional axis argument (`drop(f,1)`), which
  # pandas 2.0 no longer accepts.
  tmp_X_train=X_train.drop(columns=f)
  tmp_X_test=X_test.drop(columns=f)
  # NOTE: this rebinds the global W0/W, so cells that use the full M=1 model
  # must be re-fitted before being re-run after this loop.
  (W0, W)=regression_M1(tmp_X_train,Y_train)
  print("Drop the feature of ",f)
  print("Training Error for M=1:",Error_numpy(W0, W, tmp_X_train.to_numpy(), Y_train.to_numpy()))
  print("Test Error for M=1:",Error_numpy(W0, W, tmp_X_test.to_numpy(), Y_test.to_numpy()))
  print("--------------------------------------------")

Drop the feature of  fixed acidity
Training Error for M=1: [0.64959047]
Test Error for M=1: [0.58593832]
--------------------------------------------
Drop the feature of  volatile acidity
Training Error for M=1: [0.66698363]
Test Error for M=1: [0.57624083]
--------------------------------------------
Drop the feature of  citric acid
Training Error for M=1: [0.64976774]
Test Error for M=1: [0.58508197]
--------------------------------------------
Drop the feature of  residual sugar


  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


Training Error for M=1: [0.64951042]
Test Error for M=1: [0.58793132]
--------------------------------------------
Drop the feature of  chlorides
Training Error for M=1: [0.65307769]
Test Error for M=1: [0.59595937]
--------------------------------------------
Drop the feature of  free sulfur dioxide
Training Error for M=1: [0.65015382]
Test Error for M=1: [0.58771637]
--------------------------------------------
Drop the feature of  total sulfur dioxide
Training Error for M=1: [0.65319777]
Test Error for M=1: [0.59468725]
--------------------------------------------
Drop the feature of  density
Training Error for M=1: [0.64952297]
Test Error for M=1: [0.58620034]
--------------------------------------------
Drop the feature of  pH
Training Error for M=1: [0.65026622]
Test Error for M=1: [0.58796814]
--------------------------------------------
Drop the feature of  sulphates
Training Error for M=1: [0.6619623]
Test Error for M=1: [0.60481471]
-------------------------------------------

The result above shows that no single feature has a decisive effect on the error.  
If we must choose, "alcohol" and "sulphates" are both strong candidates for the most contributive feature,  
and "volatile acidity" is less likely to be the most contributive feature.  
So, for further determination, consider the linear regression process with only  
"one" feature.


In [16]:
# Single-feature study: fit the M=1 model on each feature alone and report RMSE.
for f in features:
  # Double brackets keep the selected column as a one-column DataFrame,
  # which is the 2-D input regression_M1 works with.
  tmp_X_train, tmp_X_test=X_train[[f]], X_test[[f]]

  W0, W=regression_M1(tmp_X_train,Y_train)
  print("With the feature of ",f)
  for split_name, X_part, Y_part in (("Training", tmp_X_train, Y_train), ("Test", tmp_X_test, Y_test)):
    print(split_name+" Error for M=1:",Error_numpy(W0, W, X_part.to_numpy(), Y_part.to_numpy()))
  print("--------------------------------------------")


With the feature of  fixed acidity
Training Error for M=1: [0.8073495]
Test Error for M=1: [0.70015728]
--------------------------------------------
With the feature of  volatile acidity
Training Error for M=1: [0.74774663]
Test Error for M=1: [0.67103462]
--------------------------------------------
With the feature of  citric acid
Training Error for M=1: [0.79311528]
Test Error for M=1: [0.67594117]
--------------------------------------------
With the feature of  residual sugar
Training Error for M=1: [0.81437883]
Test Error for M=1: [0.69012085]
--------------------------------------------
With the feature of  chlorides
Training Error for M=1: [0.80757907]
Test Error for M=1: [0.68594153]
--------------------------------------------
With the feature of  free sulfur dioxide
Training Error for M=1: [0.81338093]
Test Error for M=1: [0.68981913]
--------------------------------------------
With the feature of  total sulfur dioxide
Training Error for M=1: [0.8000929]
Test Error for M=1:

Above are the results of regression with each single feature.  
After observation, I found that "alcohol" is much more related to quality than the others.  
So, I assume "alcohol" is the most contributive feature.  
Alternatively, we could perform cross-validation to get a more precise result.