### Coursera Machine Learning Exercise 1

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

import os

**2 Linear regression with one variable**

In [None]:
# Locate the data file relative to the current working directory.
# os.path.join picks the right separator for the host OS instead of
# relying on a hard-coded '/'.
path = os.path.join(os.getcwd(), 'data', 'ex1data1.txt')
# The file has no header row, so the column names are supplied explicitly.
data = pd.read_csv(path, header=None, names=['Population', 'Profit'])
data.head()

**2.1 Plotting the Data**

In [None]:
# Feature matrix: the double brackets keep X a one-column DataFrame.
X = data[['Population']]
# Target vector: the Profit column as a Series.
y = data['Profit']
# Number of training examples.
m = y.shape[0]

Seaborn is a statistical plotting library. Using its `regplot` function, we can plot the data points together with their line of best fit.

In [None]:
# Scatter the data and overlay seaborn's fitted regression line.
# Columns are referenced by name via `data=` rather than passing a
# one-column DataFrame as `x`, which regplot only handles by squeezing it.
sns.regplot(x='Population', y='Profit', data=data)

**2.2 Gradient Descent**

In [None]:
def computeCost(X, y, theta):
    """Return the halved mean squared error of a linear model.

    This is also known as the L2 / sum-of-squared-errors cost:
    ``J(theta) = sum((X . theta^T - y) ** 2) / 2 / m``.

    The inputs are converted to plain arrays and combined with an explicit
    matrix product, so the result is the same for ``np.matrix`` and regular
    ``ndarray`` arguments (for which ``*`` would multiply element-wise).

    Args:
        X: (m, n) design matrix, one row per training example.
        y: the m target values, as a column, a row, or a flat vector.
        theta: the n model parameters, as a row, a column, or a flat vector.

    Returns:
        The scalar cost as a NumPy float.
    """
    features = np.asarray(X, dtype=float)
    # Force column vectors so the subtraction below can never broadcast
    # an (m, 1) prediction against an (m,) target into an (m, m) array.
    targets = np.asarray(y, dtype=float).reshape(-1, 1)
    params = np.asarray(theta, dtype=float).reshape(-1, 1)

    residual = features.dot(params) - targets
    return np.sum(np.power(residual, 2)) / 2 / targets.shape[0]

In [None]:
def computeCostVectorized(X,y,theta):
    """Return the halved mean squared error using a single inner product.

    Computes ``(1 / 2 / m) * r^T . r`` where ``r = X . theta^T - y``.
    The residual is evaluated once and the inputs are converted to plain
    arrays, so the function works for both ``np.matrix`` and ``ndarray``
    arguments.

    Args:
        X: (m, n) design matrix, one row per training example.
        y: the m target values, as a column, a row, or a flat vector.
        theta: the n model parameters, as a row, a column, or a flat vector.

    Returns:
        The scalar cost as a Python float.
    """
    features = np.asarray(X, dtype=float)
    targets = np.asarray(y, dtype=float).reshape(-1, 1)
    params = np.asarray(theta, dtype=float).reshape(-1, 1)

    residual = features.dot(params) - targets
    scale = 1. / 2. / targets.shape[0]
    # (1, m) . (m, 1) gives a 1x1 result; item(0) unwraps it to a scalar.
    return (scale * residual.T).dot(residual).item(0)

Add the bias column of ones ($x_0 = 1$) to our feature matrix $X$.

In [None]:
# Prepend a constant column named 'Bias' (all ones) as the first column of X.
# Note: insert() modifies X in place.
X.insert(loc=0, column='Bias', value=1)

In [None]:
# Switch from pandas objects to NumPy matrices so that `*` means matrix product.
X = np.matrix(X.values)
# y.values is flat, so np.matrix makes it a single row; transpose it into a column.
y = np.matrix(y.values).transpose()

In [None]:
# Sanity check: show the dimensions of the feature matrix and the target column.
print(np.shape(X), ',', np.shape(y))

In [None]:
# Initial parameters: a 1x2 float matrix of zeros (one entry per column of X).
theta = np.matrix(np.zeros(2))
theta

With $\theta = [0, 0]$ the cost is 32.07; let's see if we can do better than this.

In [None]:
# Cost of the all-zero starting parameters.
print(computeCost(X=X, y=y, theta=theta))

In [None]:
# Same cost via the inner-product formulation; it should match the value above.
print(computeCostVectorized(X=X, y=y, theta=theta))

**2.2.4 Gradient descent**

In [None]:
def plotBestLine(X, y, theta):
    """Scatter the training data and draw the fitted line on top of it.

    The line ``theta_0 + theta_1 * x`` is evaluated at 500 evenly spaced
    points on [0, 25] in a single vectorized expression. Inputs are
    flattened to plain 1-D arrays first, so the function accepts either
    ``np.matrix`` or ``ndarray`` arguments.

    Args:
        X: (m, 2) design matrix; column 1 holds the feature that is plotted.
        y: the m target values.
        theta: exactly two parameters (intercept, slope).
    """
    intercept, slope = np.asarray(theta, dtype=float).ravel()

    regression_x = np.linspace(0, 25, 500)
    regression_y = intercept + slope * regression_x

    plt.scatter(np.asarray(X)[:, 1], np.asarray(y).ravel())
    plt.plot(regression_x, regression_y)

In [None]:
def gradientDescent(X, y, theta, alpha, iters):
    """Fit linear-regression parameters with batch gradient descent.

    Every iteration updates all parameters simultaneously:
    ``theta := theta - (alpha / m) * X^T . (X . theta^T - y)``.

    The parameters are always held as floats, so an integer-typed starting
    ``theta`` no longer truncates each update to zero. The caller's
    ``theta`` is never modified.

    Args:
        X: (m, n) design matrix, one row per training example.
        y: the m target values, as a column, a row, or a flat vector.
        theta: the n starting parameters, as a row, a column, or a flat vector.
        alpha: learning rate.
        iters: number of update steps to perform.

    Returns:
        A ``(theta, cost)`` tuple. ``theta`` holds the final parameters as
        a 1 x n row (an ``np.matrix`` when the starting ``theta`` was one,
        otherwise an ``ndarray``). ``cost`` is a length-``iters`` array in
        which entry ``k`` is the halved mean squared error (the same
        formula as ``computeCost``) after update ``k``.
    """
    return_matrix = isinstance(theta, np.matrix)

    features = np.asarray(X, dtype=float)
    targets = np.asarray(y, dtype=float).reshape(-1, 1)
    # np.array (not asarray) so we work on a float copy of the start point.
    params = np.array(theta, dtype=float).reshape(-1, 1)

    m = features.shape[0]
    cost = np.zeros(iters)

    # The residual of the current parameters is carried across iterations,
    # so each step needs only one prediction pass.
    residual = features.dot(params) - targets
    for step in range(iters):
        params = params - (alpha / m) * features.T.dot(residual)
        residual = features.dot(params) - targets
        cost[step] = np.sum(np.square(residual)) / 2 / m

    fitted = params.T
    if return_matrix:
        fitted = np.asmatrix(fitted)
    return fitted, cost

In [None]:
# Start from float zeros so the parameter matrix is float-typed from the
# first update onwards.
theta = np.matrix(np.array([0., 0.]))
# Run the gradientDescent routine defined above: learning rate 0.0001, 400 steps.
theta, cost = gradientDescent(X, y, theta, 0.0001, 400)

# Overlay the fitted line on the training data.
plotBestLine(X,y,theta)

In [None]:
# Cost recorded after each gradient-descent step, plotted against the step index.
plt.plot(range(len(cost)), cost)

**3 Linear regression with multiple variables**

In [None]:
# Locate the multi-variable data file relative to the working directory;
# os.path.join avoids hard-coding the path separator.
path = os.path.join(os.getcwd(), 'data', 'ex1data2.txt')
# The file has no header row, so the column names are supplied explicitly.
data2 = pd.read_csv(path, header=None, names=['Size', 'Bedrooms', 'Price'])
data2.head()

In [None]:
def featureNormalize(X):
    """Standardize every column to zero mean and unit standard deviation.

    Each column has its mean subtracted and is divided by its standard
    deviation (both taken along axis 0). A constant column has a standard
    deviation of zero; it is divided by 1 instead, so it comes out as all
    zeros rather than NaN.

    Args:
        X: 2-D array-like (for example a DataFrame or ndarray), one
            column per feature.

    Returns:
        The normalized data, of the same type and shape as ``X``.
    """
    centered = X - np.mean(X, axis=0)
    spread = np.std(X, axis=0)
    # Swap zero spreads for 1 so that constant columns do not divide by zero.
    safe_spread = np.where(spread == 0, 1.0, spread)
    return centered / safe_spread

In [None]:
# Standardize every column of the frame; Price is a column, so the target is rescaled too.
data2 = featureNormalize(X=data2)

In [None]:
# Feature frame for the multi-variable model.
X2 = data2[['Size', 'Bedrooms']]
# Prepend the constant 'Bias' column (all ones) as the first column, in place.
X2.insert(loc=0, column='Bias', value=1)

In [None]:
# Convert the feature frame to a NumPy matrix so that `*` means matrix product.
X2 = np.matrix(data=X2.values)

In [None]:
# Selecting with a list keeps Price two-dimensional, so y2 is already a column matrix.
y2 = np.matrix(data2.loc[:, ['Price']].values)

In [None]:
# One zero-initialised parameter per column of X2.
theta2 = np.matrix(np.zeros(X2.shape[1]))
print(theta2)
# Pass theta2 (sized for X2), not the two-parameter theta from part 2,
# and call the gradientDescent routine defined earlier in this notebook.
theta2, cost2 = gradientDescent(X2, y2, theta2, 0.01, 1000)

In [None]:
# Cost of the fitted multi-variable parameters.
computeCost(X=X2, y=y2, theta=theta2)

In [None]:
# Cost recorded after each gradient-descent step, plotted against the step index.
plt.plot(range(len(cost2)), cost2)

**Sklearn Method for Part 2**

In [None]:
from sklearn import linear_model
cls = linear_model.LinearRegression()
# Hand scikit-learn plain ndarrays: recent releases refuse np.matrix inputs.
cls.fit(np.asarray(X), np.asarray(y))

In [None]:
# Draw scikit-learn's fitted line over the training data. Everything is
# converted to plain 1-D arrays first: predict() does not accept np.matrix
# in recent scikit-learn releases, and matplotlib expects flat sequences.
plt.plot(np.asarray(X)[:, 1], cls.predict(np.asarray(X)).flatten())
plt.scatter(np.asarray(X)[:, 1], np.asarray(y).ravel())