In [1]:
import numpy as np
import pandas as pd

In [48]:
class Linear_Regression():
    def __init__(self, alpha = 1e-10 , num_iter = 10000, early_stop = 1e-50, intercept = True, init_weight = None):
        
        
        """
            Some initializations, if neccesary
            
            attributes: 
                        alpha: Learning Rate, default 1e-10
                        num_iter: Number of Iterations to update coefficient with training data
                        early_stop: Constant control early_stop.
                        intercept: Bool, If we are going to fit a intercept, default True.
                        init_weight: Matrix (n x 1), input init_weight for testing.
                        
            
        """
        
        self.model_name = 'Linear Regression'
        
        self.alpha = alpha
        self.num_iter = num_iter
        self.early_stop = early_stop
        self.intercept = intercept
        self.init_weight = init_weight  ### For testing correctness.
        
    
    def fit(self, X_train, y_train):
        """
            Save the datasets in our model, and perform gradient descent.
            
            Parameter:
                X_train: Matrix or 2-D array. Input feature matrix.
                Y_train: Matrix or 2-D array. Input target value.
                
                
        """
        
        self.X = np.mat(X_train)
        self.y = np.mat(y_train).T
        
        ### To be used when data are normalized
        #self.min_target = min(y_train)
        #self.max_target = max(y_train)
        #self.y = min_max_normaliz(self.y)
        
        if self.intercept:
            ones = np.ones((self.X.shape[0], 1))
            self.X = np.hstack((ones, self.X))  # add a column of 1s at the beginning
            
        self.coef = np.random.uniform(-1, 1, size=(self.X.shape[1], 1))  # initialize coef with uniform distribution
        
        self.gradient_descent()  # call the gradient_descent function to train
        
        #self.coef = self.init_weight #### Please change this after you get the example right.
        
    def gradient(self):
        """
            Helper function to calculate the gradient respect to coefficient.
        """
        
        self.grad_coef = np.dot(self.X.T, np.dot(self.X, self.coef) - self.y) / self.X.shape[0]
        
    def gradient_descent(self):
        
        
        """
            Training function
            
        """
        
        self.loss = []
        
        for i in range(self.num_iter):

            self.gradient()
            
            previous_y_hat = self.X.dot(self.coef)
            pre_error = np.mean(np.square(previous_y_hat - self.y))
            
            temp_coef = self.coef - self.alpha * self.grad_coef
            
            current_y_hat = self.X.dot(temp_coef)
            current_error = np.mean(np.square(current_y_hat - self.y))
            
            if current_error <= pre_error:
                self.alpha *= 1.3
                self.coef = temp_coef
            else:
                self.alpha *= 0.9
                
            self.loss.append(current_error)
            
            ### This is the early stop, don't modify following three lines.
            if (abs(pre_error - current_error) < self.early_stop) | (abs(abs(pre_error - current_error) / pre_error) < self.early_stop):
                self.coef = temp_coef
                print(f'Iteration = {i}')
                return self
                
            if i % 10000 == 0:
                print('Iteration: ' +  str(i))
                print('Coef: '+ str(self.coef))
                print('Loss: ' + str(current_error))
                
        return self

    
    def ind_predict(self, x: list):
        """
            Predict the value based on its feature vector x.

            Parameter:
            x: Matrix, array or list. Input feature point.
            
            Return:
                result: prediction of given data point
        """
        
        """    
        x = np.hstack((np.ones((1,1)), np.mat(x))) if self.intercept else np.mat(x)
        result = x.dot(self.coef)
        return result
        """
        """
        result = np.dot(x, self.coef)
        return result
        """
        x = np.mat(x).T
        result = float(x.T * self.coef)
        return result

        
    
    def predict(self, X):
        """
            X is a matrix or 2-D numpy array, represnting testing instances. 
            Each testing instance is a feature vector. 
            
            Parameter:
            X: Matrix, array or list. Input feature point.
            
            Return:
                ret: prediction of given data matrix
        """
        
        ret = []
        X = np.mat(X)

        if self.intercept:
            ones = np.ones((X.shape[0], 1))
            X = np.hstack((ones, X))

        for x in X:
            pred = self.ind_predict(x)
            ### To be used when data are normalized
            #pred =  pred * (self.max_target - self.min_target) + self.min_target
            ret.append(pred)

        return ret
        

In [31]:
def min_max_normaliz(lst):
    """
    Helper function for normalize for faster training.
    """
    maximum = np.max(lst)
    minimum = np.min(lst)

    return (lst - minimum) / (maximum - minimum)

### We generate some easy data for testing. We should fit a line with, $Y = 30 * X + 20$

In [32]:
X = np.array(np.mat(np.arange(1, 1000, 5)).T)
y = np.array((30 * X)).flatten() +  20

#### Do NOT modify the following line, just run it when you are done.  You can also try different initialization, you will notice different coef at the end.

In [5]:
clf = Linear_Regression(alpha = 10, num_iter = 1000000, init_weight= np.mat([15,25]).T)
clf.fit(X,y)

Iteration: 0
Coef: [[ 0.58701559]
 [-0.94003496]]
Loss: 3.5044696913028385e+21
Iteration: 10000
Coef: [[ 0.71613063]
 [30.02903548]]
Loss: 93.38825416199975
Iteration: 20000
Coef: [[ 0.79965782]
 [30.02877068]]
Loss: 92.58090742761162
Iteration: 30000
Coef: [[ 0.88282014]
 [30.02878589]]
Loss: 91.7793963244437
Iteration: 40000
Coef: [[ 0.96565234]
 [30.02866208]]
Loss: 90.98522893288757
Iteration: 50000
Coef: [[ 1.048096  ]
 [30.02840573]]
Loss: 90.19866371152816
Iteration: 60000
Coef: [[ 1.13018272]
 [30.02841018]]
Loss: 89.4207343576167
Iteration: 70000
Coef: [[ 1.21191364]
 [30.02816516]]
Loss: 88.6468132898496
Iteration: 80000
Coef: [[ 1.29329389]
 [30.02817106]]
Loss: 87.88066182391543
Iteration: 90000
Coef: [[ 1.37431832]
 [30.02791727]]
Loss: 87.12018684683369
Iteration: 100000
Coef: [[ 1.45502219]
 [30.02778977]]
Loss: 86.36710981548184
Iteration: 110000
Coef: [[ 1.53534409]
 [30.02779415]]
Loss: 85.62135649815387
Iteration: 120000
Coef: [[ 1.61532372]
 [30.02755272]]
Loss: 84.

KeyboardInterrupt: 

####  As the number of iteration increase, you should notice the coeficient converges to [20, 30]. 
#### It maybe very slow update. Feel free to stop.

In [None]:
clf.coef

In [None]:
np.array(clf.predict(X))

In [None]:
y

#### Please try to normalize the X and fit again with normalized X. You should find something interesting. Also think about what you should do for predicting.

In [33]:
X_norm = min_max_normaliz(X)

In [34]:
clf_norm = Linear_Regression(num_iter = 100000, init_weight= np.mat([15,25]).T)
clf_norm.fit(X_norm, y)

Iteration: 0
Coef: [[-0.90158397]
 [-0.70814329]]
Loss: 3.3279164328676285
Iteration = 1351


In [35]:
clf_norm.coef

matrix([[1.43949362e-23],
        [1.00000000e+00]])

In [36]:
np.array(clf_norm.predict(X_norm))

array([   50.,   200.,   350.,   500.,   650.,   800.,   950.,  1100.,
        1250.,  1400.,  1550.,  1700.,  1850.,  2000.,  2150.,  2300.,
        2450.,  2600.,  2750.,  2900.,  3050.,  3200.,  3350.,  3500.,
        3650.,  3800.,  3950.,  4100.,  4250.,  4400.,  4550.,  4700.,
        4850.,  5000.,  5150.,  5300.,  5450.,  5600.,  5750.,  5900.,
        6050.,  6200.,  6350.,  6500.,  6650.,  6800.,  6950.,  7100.,
        7250.,  7400.,  7550.,  7700.,  7850.,  8000.,  8150.,  8300.,
        8450.,  8600.,  8750.,  8900.,  9050.,  9200.,  9350.,  9500.,
        9650.,  9800.,  9950., 10100., 10250., 10400., 10550., 10700.,
       10850., 11000., 11150., 11300., 11450., 11600., 11750., 11900.,
       12050., 12200., 12350., 12500., 12650., 12800., 12950., 13100.,
       13250., 13400., 13550., 13700., 13850., 14000., 14150., 14300.,
       14450., 14600., 14750., 14900., 15050., 15200., 15350., 15500.,
       15650., 15800., 15950., 16100., 16250., 16400., 16550., 16700.,
      

In [37]:
y

array([   50,   200,   350,   500,   650,   800,   950,  1100,  1250,
        1400,  1550,  1700,  1850,  2000,  2150,  2300,  2450,  2600,
        2750,  2900,  3050,  3200,  3350,  3500,  3650,  3800,  3950,
        4100,  4250,  4400,  4550,  4700,  4850,  5000,  5150,  5300,
        5450,  5600,  5750,  5900,  6050,  6200,  6350,  6500,  6650,
        6800,  6950,  7100,  7250,  7400,  7550,  7700,  7850,  8000,
        8150,  8300,  8450,  8600,  8750,  8900,  9050,  9200,  9350,
        9500,  9650,  9800,  9950, 10100, 10250, 10400, 10550, 10700,
       10850, 11000, 11150, 11300, 11450, 11600, 11750, 11900, 12050,
       12200, 12350, 12500, 12650, 12800, 12950, 13100, 13250, 13400,
       13550, 13700, 13850, 14000, 14150, 14300, 14450, 14600, 14750,
       14900, 15050, 15200, 15350, 15500, 15650, 15800, 15950, 16100,
       16250, 16400, 16550, 16700, 16850, 17000, 17150, 17300, 17450,
       17600, 17750, 17900, 18050, 18200, 18350, 18500, 18650, 18800,
       18950, 19100,

##### You can also try this with the wine dataset we use in HW1. Try fit this function to that dataset with same features. If you look closely to the updates of coefficients. What do you find? This could be mentioned in your report. 

In [38]:
from sklearn.linear_model import LinearRegression

In [39]:
url_Wine = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
wine = pd.read_csv(url_Wine, delimiter=';')
X = wine[['density','alcohol']]
y = wine.quality

In [40]:
lr = LinearRegression()
lr.fit(X,y)
## Squared Error with sklearn.
sum((lr.predict(X) - y)**2)

800.6676988774342

In [41]:
lr.coef_

array([34.82170159,  0.39144139])

In [42]:
lr.intercept_

-33.15237986168703

In [45]:
X

Unnamed: 0,density,alcohol
0,0.99780,9.4
1,0.99680,9.8
2,0.99700,9.8
3,0.99800,9.8
4,0.99780,9.4
...,...,...
1594,0.99490,10.5
1595,0.99512,11.2
1596,0.99574,11.0
1597,0.99547,10.2


#### You will notice different coefficients, but the loss is very close to each other like 805. In your report, briefly discuss this problem.

In [49]:
clf = Linear_Regression(alpha = 1, num_iter = 500000)
clf.fit(X, y)

Iteration: 0
Coef: [[-0.45934637]
 [ 0.59925034]
 [ 0.8261422 ]]
Loss: 121442.11329671436
Iteration: 10000
Coef: [[0.08743834]
 [1.15426921]
 [0.42102826]]
Loss: 0.5078543356240083
Iteration: 20000
Coef: [[0.28883188]
 [1.3615057 ]
 [0.3827437 ]]
Loss: 0.5042234605400849
Iteration: 30000
Coef: [[0.35785727]
 [1.43555273]
 [0.36899584]]
Loss: 0.5037768505123259
Iteration: 40000
Coef: [[0.38055945]
 [1.46298858]
 [0.364308  ]]
Loss: 0.5037217650511678
Iteration: 50000
Coef: [[0.38701903]
 [1.47408254]
 [0.36263455]]
Loss: 0.5037146595696798
Iteration: 60000
Coef: [[0.38778835]
 [1.47944907]
 [0.36204398]]
Loss: 0.5037134306218052
Iteration: 70000
Coef: [[0.38656696]
 [1.48281291]
 [0.36184756]]
Loss: 0.5037129214526338
Iteration: 80000
Coef: [[0.38464677]
 [1.4854731 ]
 [0.36177256]]
Loss: 0.5037124999137044
Iteration: 90000
Coef: [[0.38248321]
 [1.48788826]
 [0.36175461]]
Loss: 0.5037120898900128
Iteration: 100000
Coef: [[0.38023293]
 [1.49021739]
 [0.36174789]]
Loss: 0.5037116813371492

In [50]:
sum((clf.predict(X) - y)**2)

805.408939546096