# Elements of Machine Learning, Hoja de Trabajo 4: Regresión Polinomial
## Raul Castellanos 20180052

# Dataset de Kaggle: King County House Prices
Link to kc_house_data: https://www.kaggle.com/shivachandel/kc-house-data

### Libraries

In [30]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt

### Loading the dataset

In [31]:
# Read the house-price CSV from the working directory into a DataFrame and
# display its first rows as the cell output.
kc = pd.read_csv("kc_house_data.csv")
kc.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180.0,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170.0,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770.0,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050.0,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680.0,0,1987,0,98074,47.6168,-122.045,1800,7503


### Creating Cost Functions

In [32]:
def linear_cost(X, y, theta):
    """Return the halved mean squared error of the linear model ``X @ theta``.

    Args:
        X: Design matrix with one row per sample.
        y: Targets, one per sample. A flat ``(m,)`` vector and an ``(m, 1)``
            column are treated alike: ``y`` is reshaped to the shape of the
            predictions whenever both hold the same number of elements.
        theta: Model parameters, as a vector or a column.

    Returns:
        ``sum((y - X @ theta) ** 2) / (2 * len(X))``.
    """
    h = X @ theta
    y = np.asarray(y)
    # Without this alignment an (m,) target minus an (m, 1) prediction
    # broadcasts to an (m, m) matrix, which silently yields a wrong cost.
    if y.shape != h.shape and y.size == h.size:
        y = y.reshape(h.shape)
    return ((y - h) ** 2).sum() / (2 * len(X))

In [33]:
def linear_cost_gradient(X, y, theta):
    """Return the gradient of the halved mean squared error w.r.t. ``theta``.

    Args:
        X: Design matrix with one row per sample.
        y: Targets, one per sample. Reshaped to the shape of the predictions
            whenever both hold the same number of elements, so ``(m,)`` and
            ``(m, 1)`` targets give the same result.
        theta: Model parameters, as a vector or a column.

    Returns:
        ``X.T @ (X @ theta - y) / len(X)``, with the same shape as ``theta``
        once ``y`` is aligned with the predictions.
    """
    h = X @ theta
    y = np.asarray(y)
    # Without this alignment an (m, 1) prediction minus an (m,) target
    # broadcasts to (m, m) and the "gradient" comes out as (n, m).
    if y.shape != h.shape and y.size == h.size:
        y = y.reshape(h.shape)
    return (X.T @ (h - y)) / len(X)

In [34]:
def gradient_descent(
    X, y, theta_0, linear_cost, linear_cost_gradient,
    learning_rate=0.0001, threshold=0.001, max_iter=10000,
):
    """Minimise a cost with fixed-step batch gradient descent.

    Args:
        X: Design matrix, passed through to the cost and gradient callables.
        y: Targets, passed through to the cost and gradient callables.
        theta_0: Starting parameters. Not modified; every update builds a
            new array.
        linear_cost: Callable ``(X, y, theta) -> cost``.
        linear_cost_gradient: Callable ``(X, y, theta) -> gradient``.
        learning_rate: Step size applied to the gradient at each update.
        threshold: Iteration stops once the gradient norm is no larger
            than this value.
        max_iter: Upper bound on the number of updates.

    Returns:
        A tuple ``(theta, costs, thetas)``: the final parameters, the cost
        after each update, and a copy of the parameters after each update.
    """
    theta = theta_0
    iteration = 0
    costs = []
    thetas = []

    # Evaluate the gradient once per parameter value and reuse it for both
    # the stopping test and the update; each evaluation is a full pass over
    # the data, so recomputing it for the same theta doubles the work.
    gradient = linear_cost_gradient(X, y, theta)
    while np.linalg.norm(gradient) > threshold and iteration < max_iter:
        iteration += 1
        theta = theta - (learning_rate * gradient)
        costs.append(linear_cost(X, y, theta))
        thetas.append(theta.copy())
        gradient = linear_cost_gradient(X, y, theta)

    return theta, costs, thetas

### Splitting X and Y, adding constant column and polynomial variable

In [35]:
# Add the squared living-area feature and a constant column of ones
# (the intercept term of the design matrix).
kc['sqft_living2'] = kc['sqft_living'] ** 2
kc['cons'] = 1

In [36]:
# Target vector (price) and degree-2 design matrix: intercept, x, x^2.
y2 = kc['price'].to_numpy()
X2 = kc[['cons','sqft_living','sqft_living2']].to_numpy()

In [37]:
# Sanity check: y2 is 1-D with one entry per row of X2.
print(y2.shape, X2.shape)

(21613,) (21613, 3)


### Creating Thetas Array

In [38]:
# m = number of samples (rows), n = number of features (columns).
m,n = X2.shape

In [39]:
# Random initial parameters in [0, 1), one per feature, as an (n, 1) column.
# NOTE(review): y2 is 1-D, shape (m,), while theta_0 is a column (n, 1);
# subtracting an (m,) array from the (m, 1) prediction X @ theta broadcasts
# to (m, m) - confirm that the cost/gradient functions align these shapes.
theta_0 = np.random.rand(n,1)
print(theta_0.shape)

(3, 1)


### Shuffle data for Cross-Validation

In [40]:
# Shuffle features and targets with ONE shared permutation. Two separate
# np.random.shuffle calls draw two different orderings, which detaches every
# row of X2 from its own price in y2 and leaves nothing to learn.
shuffled_idx = np.random.permutation(len(X2))
X2 = X2[shuffled_idx]
y2 = y2[shuffled_idx]

In [41]:
# Sequential 60% / 20% / 20% split of the rows into training,
# cross-validation and test sets.
X_train, X_cv, X_test = X2[:12968], X2[12968:17290], X2[17290:21613]
print(X_train.shape, X_cv.shape, X_test.shape)

(12968, 3) (4322, 3) (4323, 3)


In [42]:
# Targets cut at the same 60% / 20% / 20% row boundaries used for the features.
y_train, y_cv, y_test = y2[:12968], y2[12968:17290], y2[17290:21613]
print(y_train.shape, y_cv.shape, y_test.shape)

(12968,) (4322,) (4323,)


### Polynomial Regression, 2nd Order

In [None]:
# Fit the degree-2 model on the training split; the hyperparameters are
# spelled out by keyword so each number's role is visible.
theta2, costs2, thetas2 = gradient_descent(
    X_train, y_train, theta_0, linear_cost, linear_cost_gradient,
    learning_rate=0.00000000001, threshold=0.0001, max_iter=50,
)

In [None]:
# Plot the training cost per iteration. The degree-2 history is bound to
# `costs2`; no variable named `costs` is defined in the cells shown, so the
# original reference would raise NameError.
plt.plot(costs2)

In [None]:
# Report the last recorded training cost of the degree-2 fit (`costs2`;
# `costs` is not defined in the cells shown).
print("El costo minimo (Polinomio Grado 2) es de:",costs2[-1])

### Polynomial Regression, 3rd Order

In [None]:
# Add the cubed living-area feature.
kc['sqft_living3'] = kc['sqft_living'] ** 3

In [None]:
# Target vector (price) and degree-3 design matrix: intercept, x, x^2, x^3.
y3 = kc['price'].to_numpy()
X3 = kc[['cons','sqft_living','sqft_living2','sqft_living3']].to_numpy()

In [None]:
# m = number of samples, n = number of features; draw fresh random initial
# parameters in [0, 1) as an (n, 1) column for the degree-3 model.
m,n = X3.shape
theta_0 = np.random.rand(n,1)
print(theta_0.shape)

In [None]:
# Shuffle features and targets with ONE shared permutation. Two separate
# np.random.shuffle calls draw two different orderings, which detaches every
# row of X3 from its own price in y3.
shuffled_idx = np.random.permutation(len(X3))
X3 = X3[shuffled_idx]
y3 = y3[shuffled_idx]

In [None]:
# Sequential 60% / 20% / 20% split of the rows into training,
# cross-validation and test sets.
X_train, X_cv, X_test = X3[:12968], X3[12968:17290], X3[17290:21613]
print(X_train.shape, X_cv.shape, X_test.shape)

In [None]:
# Targets cut at the same 60% / 20% / 20% row boundaries used for the features.
y_train, y_cv, y_test = y3[:12968], y3[12968:17290], y3[17290:21613]
print(y_train.shape, y_cv.shape, y_test.shape)

In [None]:
# Fit the degree-3 model on the training split; the hyperparameters are
# spelled out by keyword so each number's role is visible.
theta3, costs3, thetas3 = gradient_descent(
    X_train, y_train, theta_0, linear_cost, linear_cost_gradient,
    learning_rate=0.00000000000000001, threshold=0.0000001, max_iter=1000,
)

In [None]:
# Plot the training cost per iteration. The degree-3 history is bound to
# `costs3`; no variable named `costs` is defined in the cells shown, so the
# original reference would raise NameError.
plt.plot(costs3)

In [None]:
# Report the last recorded training cost of the degree-3 fit (`costs3`;
# `costs` is not defined in the cells shown).
print("El costo minimo (Polinomio Grado 3) es de:",costs3[-1])

### Polynomial Regression, 4th Order

In [None]:
# Add the fourth-power living-area feature.
kc['sqft_living4'] = kc['sqft_living'] ** 4

In [None]:
# Target vector (price) and degree-4 design matrix: intercept, x, ..., x^4.
y4 = kc['price'].to_numpy()
X4 = kc[['cons','sqft_living','sqft_living2','sqft_living3','sqft_living4']].to_numpy()

In [None]:
# m = number of samples, n = number of features; draw fresh random initial
# parameters in [0, 1) as an (n, 1) column for the degree-4 model.
m,n = X4.shape
theta_0 = np.random.rand(n,1)
print(theta_0.shape)

In [None]:
# Shuffle features and targets with ONE shared permutation. Two separate
# np.random.shuffle calls draw two different orderings, which detaches every
# row of X4 from its own price in y4.
shuffled_idx = np.random.permutation(len(X4))
X4 = X4[shuffled_idx]
y4 = y4[shuffled_idx]

In [None]:
# Sequential 60% / 20% / 20% split of the rows into training,
# cross-validation and test sets.
X_train, X_cv, X_test = X4[:12968], X4[12968:17290], X4[17290:21613]
print(X_train.shape, X_cv.shape, X_test.shape)

In [None]:
# Targets cut at the same 60% / 20% / 20% row boundaries used for the features.
y_train, y_cv, y_test = y4[:12968], y4[12968:17290], y4[17290:21613]
print(y_train.shape, y_cv.shape, y_test.shape)

In [None]:
# Fit the degree-4 model on the training split; the hyperparameters are
# spelled out by keyword so each number's role is visible.
theta4, costs4, thetas4 = gradient_descent(
    X_train, y_train, theta_0, linear_cost, linear_cost_gradient,
    learning_rate=0.00000000000001, threshold=0.0001, max_iter=1000,
)

In [None]:
# Plot the training cost per iteration. The degree-4 history is bound to
# `costs4`; no variable named `costs` is defined in the cells shown, so the
# original reference would raise NameError.
plt.plot(costs4)

In [None]:
# Report the last recorded training cost of the degree-4 fit (`costs4`;
# `costs` is not defined in the cells shown).
print("El costo minimo (Polinomio Grado 4) es de:",costs4[-1])

### Polynomial Regression, 5th Order

In [None]:
# Add the fifth-power living-area feature. Cast to float first: sqft_living
# appears to be an integer column (the preview shows 1180, 2570, ...), and any
# value above roughly 6,200 raised to the 5th power exceeds the int64 maximum
# (about 9.22e18), silently wrapping around instead of raising an error.
kc['sqft_living5'] = kc['sqft_living'].astype(float) ** 5

In [None]:
# Target vector (price) and degree-5 design matrix: intercept, x, ..., x^5.
y5 = kc['price'].to_numpy()
X5 = kc[['cons','sqft_living','sqft_living2','sqft_living3','sqft_living4','sqft_living5']].to_numpy()

In [None]:
# m = number of samples, n = number of features; draw fresh random initial
# parameters in [0, 1) as an (n, 1) column for the degree-5 model.
m,n = X5.shape
theta_0 = np.random.rand(n,1)
print(theta_0.shape)

In [None]:
# Shuffle features and targets with ONE shared permutation. Two separate
# np.random.shuffle calls draw two different orderings, which detaches every
# row of X5 from its own price in y5.
shuffled_idx = np.random.permutation(len(X5))
X5 = X5[shuffled_idx]
y5 = y5[shuffled_idx]

In [None]:
# Sequential 60% / 20% / 20% split of the rows into training,
# cross-validation and test sets.
X_train, X_cv, X_test = X5[:12968], X5[12968:17290], X5[17290:21613]
print(X_train.shape, X_cv.shape, X_test.shape)

In [None]:
# Targets cut at the same 60% / 20% / 20% row boundaries used for the features.
y_train, y_cv, y_test = y5[:12968], y5[12968:17290], y5[17290:21613]
print(y_train.shape, y_cv.shape, y_test.shape)

In [None]:
# Fit the degree-5 model on the training split; the hyperparameters are
# spelled out by keyword so each number's role is visible.
theta5, costs5, thetas5 = gradient_descent(
    X_train, y_train, theta_0, linear_cost, linear_cost_gradient,
    learning_rate=0.000000000000001, threshold=0.0001, max_iter=1000,
)

In [None]:
# Plot the training cost per iteration. The degree-5 history is bound to
# `costs5`; no variable named `costs` is defined in the cells shown, so the
# original reference would raise NameError.
plt.plot(costs5)

In [None]:
# Report the last recorded training cost of the degree-5 fit (`costs5`;
# `costs` is not defined in the cells shown).
print("El costo minimo (Polinomio Grado 5) es de:",costs5[-1])