# Homework 2

In [1]:
# import
import numpy as np
import pandas as pd

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
# download the laptops dataset and report how many rows it has
url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv'
df = pd.read_csv(url)
df.shape[0]

2160

In [3]:
# normalize the column names: lowercase, spaces replaced by underscores
df.columns = [name.lower().replace(' ', '_') for name in df.columns]

# keep only the necessary columns (three features plus the target)
needed_columns = ['ram', 'storage', 'screen', 'final_price']
df = df[needed_columns]
df.head(10)

Unnamed: 0,ram,storage,screen,final_price
0,8,512,15.6,1009.0
1,8,256,15.6,299.0
2,8,256,15.6,789.0
3,16,1000,15.6,1199.0
4,16,512,15.6,669.01
5,32,1000,17.3,1699.0
6,8,256,14.0,909.0
7,8,512,15.6,809.01
8,8,256,15.6,519.0
9,16,512,16.1,1149.0


In [4]:
# Q1: number of missing values in each column
df.isnull().sum(axis=0)

ram            0
storage        0
screen         4
final_price    0
dtype: int64

In [5]:
# Q2: median of the 'ram' column, shown as a plain Python float
float(df['ram'].median())

16.0

In [6]:
# prepare and split the dataset: shuffle the rows, then 60% train / 20% val / 20% test
np.random.seed(42)

n = len(df)
n_val = n_test = int(0.2 * n)
n_train = n - n_val - n_test

idx = np.arange(n)
np.random.shuffle(idx)

df_random = df.iloc[idx]

df_train = df_random.iloc[:n_train].copy()
df_val = df_random.iloc[n_train:n_train+n_val].copy()
df_test = df_random.iloc[n_train+n_val:].copy()

# Targets stay on the raw price scale, so every RMSE below is in price units.
# This cell used to compute np.log1p(final_price) targets first, but those were
# immediately overwritten by the raw values and never used; the dead assignments
# were removed.
# NOTE(review): final_price has a long tail. If a log-scale target is wanted,
# use np.log1p here and apply np.expm1 to the predictions before reporting RMSE.
y_train = df_train.final_price.values
y_val = df_val.final_price.values
y_test = df_test.final_price.values

# remove the target from the feature frames so it cannot leak into X
for i in (df_train, df_val, df_test):
    del i['final_price']

## Q3: Train linear regression models with two different options for filling missing values

### Prerequisite

In [7]:
# define linear regression model
def train_linear_regression(X, y):
    """Fit a linear regression with the normal equation.

    A column of ones is prepended to ``X`` so the first weight acts as
    the bias term, then ``w = (X^T X)^-1 X^T y`` is evaluated.

    Parameters:
        X: 2-D feature array, one row per sample.
        y: target values, one per row of ``X``.

    Returns:
        (w_0, w): the bias weight and the array of feature weights.

    Raises:
        numpy.linalg.LinAlgError: if ``X^T X`` is singular, because it is
        inverted with ``np.linalg.inv``.
    """
    bias_column = np.ones(X.shape[0])
    design = np.column_stack([bias_column, X])

    gram = design.T.dot(design)
    gram_inv = np.linalg.inv(gram)
    weights = gram_inv.dot(design.T).dot(y)

    bias, coefs = weights[0], weights[1:]
    return bias, coefs

# other required functions wrapper
def prepare_X(df, base, fill_val):
    """Build the feature matrix for a model.

    Selects the ``base`` columns from ``df``, replaces every missing value
    in them with ``fill_val`` and returns the result as a NumPy array.
    ``df`` itself is not modified.

    Note: ``fill_val`` is applied to all selected columns, so a
    column-specific statistic (e.g. a mean) is only meaningful while a
    single column contains missing values.
    """
    return df[base].fillna(fill_val).values
    
def rmse(y, y_pred):
    """Return the root mean squared error between targets ``y`` and predictions ``y_pred``."""
    squared_residuals = (y_pred - y) ** 2
    return np.sqrt(squared_residuals.mean())

def get_pred(w_0, X, w):
    """Return linear-model predictions: the product of ``X`` and ``w`` shifted by the bias ``w_0``."""
    weighted_sum = X.dot(w)
    return weighted_sum + w_0
    
# feature names: every column still present in the training frame
feature_columns = df_train.columns
base = feature_columns.values
print(base)

['ram' 'storage' 'screen']


### Fill with 0

In [8]:
# build both feature matrices with missing values replaced by 0
X_train_0 = prepare_X(df_train, base, 0)
X_val_0 = prepare_X(df_val, base, 0)

# fit on the training split
w_0, w = train_linear_regression(X_train_0, y_train)

# score on the validation split
y_pred = get_pred(w_0, X_val_0, w)
rmse_val_0 = rmse(y_val, y_pred)
print(round(rmse_val_0, 2))

597.36


### Fill with mean

In [9]:
# The imputation value is computed from the training split only and reused for
# validation. Filling validation NAs with the validation mean would let
# validation data shape the preprocessing, and would feed the model features
# prepared differently from the ones it was trained on.
# 'screen' is the only column with missing values, so a single scalar is enough.
screen_mean = df_train.screen.mean()

# train model
X_train_mean = prepare_X(df_train, base, screen_mean)
w_0, w = train_linear_regression(X_train_mean, y_train)

# validate model (same fill value as in training)
X_val_mean = prepare_X(df_val, base, screen_mean)
y_pred = get_pred(w_0, X_val_mean, w)
print(round(rmse(y_val, y_pred), 2))

600.26


## Q4: train a regularized linear regression

In [10]:
# define linear regression model with regularization
def train_linear_regression_reg(X, y, r=0.0):
    """Fit a regularized linear regression with the normal equation.

    A column of ones is prepended to ``X`` for the bias term, ``r`` is added
    to every diagonal entry of ``X^T X`` (the bias entry included), and
    ``w = (X^T X + r*I)^-1 X^T y`` is evaluated.

    Parameters:
        X: 2-D feature array, one row per sample.
        y: target values, one per row of ``X``.
        r: regularization strength; the default 0.0 adds nothing to the diagonal.

    Returns:
        (w_0, w): the bias weight and the array of feature weights.
    """
    n_rows = X.shape[0]
    design = np.column_stack([np.ones(n_rows), X])

    gram = design.T.dot(design)
    gram = gram + r * np.eye(gram.shape[0])

    weights = np.linalg.inv(gram).dot(design.T).dot(y)
    return weights[0], weights[1:]

# feature matrices for the sweep; missing values are filled with 0
X_train = prepare_X(df_train, base, 0)
X_val = prepare_X(df_val, base, 0)

# validation RMSE for each regularization strength
for r in [0, 0.01, 0.1, 1, 5, 10, 100]:
    w_0, w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = get_pred(w_0, X_val, w)
    score = rmse(y_val, y_pred)
    print('%6s %.2f' % (r, score))

     0 597.36
  0.01 597.36
   0.1 597.35
     1 597.21
     5 597.01
    10 597.06
   100 597.90


## Q5: Standard deviation of the validation RMSE scores across different seeds

In [27]:
# validation RMSE collected for ten different shuffles of the data
std_rmse = []
for seed in range(10):
    # rebuild the index on every iteration so each seed shuffles the same starting order
    idx = np.arange(n)
    np.random.seed(seed)
    np.random.shuffle(idx)

    # prepare dataset: same 60/20/20 split sizes as before
    df_random = df.iloc[idx]

    val_end = n_train + n_val
    df_train = df_random.iloc[:n_train].copy()
    df_val = df_random.iloc[n_train:val_end].copy()
    df_test = df_random.iloc[val_end:].copy()

    # obtain y target
    y_train = df_train.final_price.values
    y_val = df_val.final_price.values
    y_test = df_test.final_price.values

    # drop the target from the feature frames
    df_train = df_train.drop(columns='final_price')
    df_val = df_val.drop(columns='final_price')
    df_test = df_test.drop(columns='final_price')

    # train model with missing values filled with 0
    X_train = prepare_X(df_train, base, 0)
    w_0, w = train_linear_regression(X_train, y_train)

    # validate model
    X_val = prepare_X(df_val, base, 0)
    y_pred = get_pred(w_0, X_val, w)
    std_rmse.append(rmse(y_val, y_pred))

# spread of the scores across seeds
print(round(np.std(std_rmse), 3))

29.176


In [37]:
# reshuffle with seed 9 and rebuild the three splits
np.random.seed(9)

idx = np.arange(n)
np.random.shuffle(idx)

# prepare dataset
df_random = df.iloc[idx]

val_end = n_train + n_val
df_train = df_random.iloc[:n_train].copy()
df_val = df_random.iloc[n_train:val_end].copy()
df_test = df_random.iloc[val_end:].copy()

# obtain y target
y_train = df_train.final_price.values
y_val = df_val.final_price.values
y_test = df_test.final_price.values

# drop the target from the feature frames
df_train = df_train.drop(columns='final_price')
df_val = df_val.drop(columns='final_price')
df_test = df_test.drop(columns='final_price')

# train model on train + validation combined, missing values filled with 0
df_combine = pd.concat([df_train, df_val])
y_combine = np.concatenate([y_train, y_val])
X = prepare_X(df_combine, base, 0)
w_0, w = train_linear_regression_reg(X, y_combine, r=0.001)

# evaluate on the held-out test split
X_test = prepare_X(df_test, base, 0)
y_pred = get_pred(w_0, X_test, w)
print(round(rmse(y_test, y_pred), 2))

608.61
