## Importing the libraries

In [56]:
import pandas as pd
import numpy as np
from sklearn import datasets, linear_model, ensemble
from sklearn.metrics import mean_squared_error, confusion_matrix
from sklearn.model_selection import cross_validate, ShuffleSplit, GridSearchCV
from math import sqrt

## Importing the Data

In [57]:
# Training CSV: contains the `quality` column that is used as the target below.
winequality_white_training_df = pd.read_csv(
    'ECEN689-Fall2018/Challenges/4Files/winequality-white-training.csv'
)
print(winequality_white_training_df.shape)  # (rows, columns) sanity check

# Testing CSV: rows for which predictions are exported at the end of the notebook.
winequality_white_testing_df = pd.read_csv(
    'ECEN689-Fall2018/Challenges/4Files/winequality-white-testing.csv'
)
print(winequality_white_testing_df.shape)  # (rows, columns) sanity check

(3898, 13)
(1000, 12)


In [58]:
# Positional columns 1..11 are used as the feature matrix for both frames
# (column 0 is skipped -- presumably the `Id` column; verify against the CSV header).
x_train = winequality_white_training_df.iloc[:, 1:12]
x_test = winequality_white_testing_df.iloc[:, 1:12]

# Regression target: the `quality` column of the training frame.
y_train = winequality_white_training_df['quality']

## Linear Regression using Gradient Descent. We have built a vectorized implementation of the Gradient Descent algorithm.

In [59]:
def linear_regression(X, y, epochs, learning_rate, seed=None):
    """Fit ``y ~ X @ w + b`` by batch gradient descent on the mean squared error.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix (a DataFrame, ndarray or nested list).
    y : array-like of length n_samples
        Target values. Only this argument is used as the target; no
        notebook-level variable is read.
    epochs : int
        Number of full-batch gradient steps. ``0`` returns the random
        initial parameters together with their cost.
    learning_rate : float
        Step size applied to both the weight and the bias gradient.
    seed : int or None, optional
        When given, the initial parameters are drawn from a private
        ``RandomState(seed)`` so the run is reproducible. When ``None``
        the global numpy random state is used.

    Returns
    -------
    w : ndarray of shape (n_features, 1)
        Fitted weights.
    b : ndarray of shape (1, 1)
        Fitted bias.
    cost : float
        Mean squared error of the returned ``w`` and ``b`` on ``(X, y)``.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D, is empty, or its row count differs from ``len(y)``.
    """
    features = np.asarray(X, dtype=float)
    if features.ndim != 2:
        raise ValueError("X must be 2-dimensional (n_samples, n_features)")
    n_samples, n_features = features.shape
    if n_samples == 0:
        raise ValueError("X must contain at least one sample")

    # Work with features as (n_features, n_samples) and targets as a row vector.
    features = features.T
    targets = np.asarray(y, dtype=float).reshape(1, -1)
    if targets.shape[1] != n_samples:
        raise ValueError("X and y must contain the same number of samples")

    # Uniform initialisation in [-1, 1).
    rng = np.random if seed is None else np.random.RandomState(seed)
    w = 2 * rng.random_sample((n_features, 1)) - 1
    b = 2 * rng.random_sample((1, 1)) - 1

    # d(MSE)/dw = (2/n) * X^T (pred - y); the sign must be positive so that
    # subtracting learning_rate * gradient moves downhill.
    grad_scale = 2.0 / n_samples
    for _ in range(epochs):
        residual = np.dot(w.T, features) + b - targets
        dw = grad_scale * np.dot(features, residual.T)
        db = grad_scale * np.sum(residual)
        w = w - learning_rate * dw
        b = b - learning_rate * db

    # Scalar cost of the parameters actually returned (also defined for epochs == 0).
    final_residual = np.dot(w.T, features) + b - targets
    cost = float(np.mean(final_residual ** 2))
    return w, b, cost

## Calculating the RMSE using gradient descent model

In [50]:
# Train the hand-written gradient-descent model on the training frame.
# NOTE(review): with a step size of 1e-20 the parameters stay essentially at
# their random initial values, so the RMSE below mostly reflects the
# initialisation rather than a fitted model.
w, b, cost = linear_regression(x_train, y_train, 100000, 1e-20)

# Rebuild the (n_features, n_samples) / (1, n_samples) layout the model uses.
x = np.array(x_train).T
y = np.array(y_train).reshape(1, -1)

# Predictions of the returned parameters on the training data.
z = w.T.dot(x) + b

mse = mean_squared_error(z, y)
rmse = sqrt(mse)
print("RMSE:", rmse)

RMSE: 48.28224362927311


## Linear Regression using ordinary least squares (imported from the sklearn library), and the RMSE calculated using this implementation

In [60]:
# Ordinary least squares fit on the raw features.
# The `normalize` argument is intentionally not passed: it was removed from
# LinearRegression in scikit-learn 1.2 (passing it raises TypeError there), and
# for an unpenalised least-squares fit it does not change the predictions.
model = linear_model.LinearRegression()
model.fit(x_train, y_train)

# In-sample predictions: the RMSE below is a training error, not a held-out score.
predicted = model.predict(x_train)

# mean_squared_error is documented as (y_true, y_pred).
mse = mean_squared_error(y_train, predicted)
rmse = sqrt(mse)
print("RMSE:", rmse)

RMSE: 0.7519290595321102


## The RMSE using gradient descent is much higher than the RMSE using the ordinary least squares (OLS) implementation. So, we will use the OLS implementation for our final model.

In [68]:
# Collect the fitted parameters into one flat list:
# position 0 holds the model intercept, positions 1.. hold the per-feature
# weights in the same order as the columns of x_train.
coeff = [model.intercept_, *model.coef_]
print(coeff)

[144.8751767915973, 0.061925583232178914, -1.8314952326128604, 0.03158697850982829, 0.07873759640554084, -0.48921409378957514, 0.003584869912324266, -0.0003279348016380929, -144.9890055590131, 0.7212388269877488, 0.6240793674834471, 0.1931686923285479]


## Calculating the final predictions using the model

In [None]:
# Turn the [intercept, weights...] list into a column vector.
coeff = np.array(coeff).reshape(-1, 1)

# Test features: the same positional column slice (1..11) used for training.
x = np.array(winequality_white_testing_df.iloc[:, 1:12])

# Apply the fitted linear model by hand: features . weights + intercept.
result = x.dot(model.coef_) + model.intercept_
print(result)

## Exporting model parameters for white wine

In [64]:
# One row per model parameter, with Ids 0..11 paired positionally with `coeff`.
n = range(0, 12)
df = pd.DataFrame({'Id': list(n)})
df['parameter'] = coeff

# Write the parameter table without the DataFrame index column.
df.to_csv('winequality-white-parameters.csv', index=False)

## Exporting model predictions for white wine on test data

In [65]:
# Pair each test-row Id with its predicted quality and write the submission file.
df1 = pd.DataFrame({
    'Id': winequality_white_testing_df['Id'],
    'quality': result,
})
df1.to_csv('winequality-white-solution.csv', index=False)