In [4]:
import pandas as pd
import numpy as np

###Load the data.

In [6]:
FILEPATH = "laptops.csv"
df = pd.read_csv(FILEPATH)
df.columns = df.columns.str.lower().str.replace(' ', '_')
df.head()

Unnamed: 0,laptop,status,brand,model,cpu,ram,storage,storage_type,gpu,screen,touch,final_price
0,ASUS ExpertBook B1 B1502CBA-EJ0436X Intel Core...,New,Asus,ExpertBook,Intel Core i5,8,512,SSD,,15.6,No,1009.0
1,Alurin Go Start Intel Celeron N4020/8GB/256GB ...,New,Alurin,Go,Intel Celeron,8,256,SSD,,15.6,No,299.0
2,ASUS ExpertBook B1 B1502CBA-EJ0424X Intel Core...,New,Asus,ExpertBook,Intel Core i3,8,256,SSD,,15.6,No,789.0
3,MSI Katana GF66 12UC-082XES Intel Core i7-1270...,New,MSI,Katana,Intel Core i7,16,1000,SSD,RTX 3050,15.6,No,1199.0
4,HP 15S-FQ5085NS Intel Core i5-1235U/16GB/512GB...,New,HP,15S,Intel Core i5,16,512,SSD,,15.6,No,669.01


###Preparing the dataset

In [11]:
df = df[['ram','storage','screen','final_price']]
df.head()

Unnamed: 0,ram,storage,screen,final_price
0,8,512,15.6,1009.0
1,8,256,15.6,299.0
2,8,256,15.6,789.0
3,16,1000,15.6,1199.0
4,16,512,15.6,669.01


Question 1
There's one column with missing values. What is it?

- 'ram'
- 'storage'
- *'screen'*
- 'final_price'

Answer: **screen**

In [12]:
df.isnull().sum()

ram            0
storage        0
screen         4
final_price    0
dtype: int64

Question 2
What's the median (50% percentile) for variable 'ram'?

- 8
- *16*
- 24
- 32

Answer: **16**

In [13]:
median = df['ram'].median()
print(median)

16.0


Shuffle the dataset (the filtered one you created above), use seed 42.

In [14]:
n = np.arange(len(df))
np.random.seed(42)
np.random.shuffle(n) 

Split your data in train/val/test sets, with 60%/20%/20% distribution.

In [15]:
n_val = int(len(df)*0.2)
print(n_val)
n_test = int(len(df)*0.2)
print(n_test)
n_train = len(df) - n_val - n_test
print(n_train)

432
432
1296


###Question 3

We need to deal with missing values for the column from Q1.

We have two options: fill it with 0 or with the mean of this variable.

Try both options.

For each, train a linear regression model without regularization using the code from the lessons.

For computing the mean, use the training only!

Use the validation dataset to evaluate the models and compare the RMSE of each option.

Round the RMSE scores to 2 decimal digits using round(score, 2) Which option gives better RMSE?

In [16]:
#Fill na with 0
df_fillna_zero = df.fillna(0)
print(df_fillna_zero)

      ram  storage  screen  final_price
0       8      512    15.6      1009.00
1       8      256    15.6       299.00
2       8      256    15.6       789.00
3      16     1000    15.6      1199.00
4      16      512    15.6       669.01
...   ...      ...     ...          ...
2155   16     1000    17.3      2699.99
2156   16     1000    17.3      2899.99
2157   32     1000    17.3      3399.99
2158   16     1000    13.4      1899.99
2159   16      256    13.4      1699.99

[2160 rows x 4 columns]


In [17]:
df_fillna_mean = df.fillna(df['screen'].mean())
print(df_fillna_mean)

      ram  storage  screen  final_price
0       8      512    15.6      1009.00
1       8      256    15.6       299.00
2       8      256    15.6       789.00
3      16     1000    15.6      1199.00
4      16      512    15.6       669.01
...   ...      ...     ...          ...
2155   16     1000    17.3      2699.99
2156   16     1000    17.3      2899.99
2157   32     1000    17.3      3399.99
2158   16     1000    13.4      1899.99
2159   16      256    13.4      1699.99

[2160 rows x 4 columns]


Dataframe when fillna with zero

In [18]:
df_train_fz = df_fillna_zero.iloc[n[:n_train]]
df_val_fz = df_fillna_zero.iloc[n[n_train:n_train+n_val]]
df_test_fz = df_fillna_zero.iloc[n[n_train+n_val:]]

In [19]:
X_train_fz = df_train_fz.drop('final_price', axis=1).values
X_val_fz = df_val_fz.drop('final_price', axis=1).values
X_test_fz = df_test_fz.drop('final_price', axis=1).values

In [20]:
y_train_fz = df_train_fz['final_price'].values
y_val_fz = df_val_fz['final_price'].values
y_test_fz = df_test_fz['final_price'].values

Dataframe when fillna with "screen" mean

In [21]:
df_train_fm = df_fillna_zero.iloc[n[:n_train]]
df_val_fm = df_fillna_zero.iloc[n[n_train:n_train+n_val]]
df_test_fm = df_fillna_zero.iloc[n[n_train+n_val:]]

In [22]:
X_train_fm = df_train_fm.drop('final_price', axis=1).values
X_val_fm = df_val_fm.drop('final_price', axis=1).values
X_test_fm = df_test_fm.drop('final_price', axis=1).values

In [23]:
y_train_fm = df_train_fm['final_price'].values
y_val_fm = df_val_fm['final_price'].values
y_test_fm = df_test_fm['final_price'].values

Linear Regression When fillna is zero

In [24]:

# Features
X = X_train_fz

# Target
y = y_train_fz

In [25]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)


    return w[0], w[1:] 

In [26]:
w0, w = train_linear_regression(X, y)
print(w0)
print(w)

676.8954853003862
[ 43.70681555   0.86506867 -36.10399833]


Validation when fillna is zero

In [27]:
def rmse(y_val, y_pred):
    error = y_val - y_pred
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [28]:
X_val = X_val_fz
y_val = y_val_fz
y_pred = w0 + X_val.dot(w)

In [29]:
rmse_fz = rmse(y_val, y_pred)
print(rmse_fz)

597.3635593619622


Linear Regression When fillna is mean

In [30]:
# Features
X = X_train_fm

# Target
y = y_train_fm

In [31]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)

    return w[0], w[1:]

In [32]:
w0, w = train_linear_regression(X, y)
print(w0)
print(w)

676.8954853003862
[ 43.70681555   0.86506867 -36.10399833]


Validation when fillna is mean

In [33]:
def rmse(y, y_pred):
    error = y - y_pred
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [34]:
X_val = X_val_fm
y_val = y_val_fm
y_pred = w0 + X_val.dot(w)

In [35]:
rmse_fm = rmse(y_val, y_pred)
print(rmse_fm)

597.3635593619622


In [36]:
print(rmse_fz, rmse_fm)

597.3635593619622 597.3635593619622


Answer: **Both are equally good**

###Question 4

For this question, fill the NAs with 0.

Try different values of r from this list: [0, 0.01, 0.1, 1, 5, 10, 100].

Use RMSE to evaluate the model on the validation dataset.

Round the RMSE scores to 2 decimal digits.

Which r gives the best RMSE?

In [37]:
df = df.fillna(0)
X_train = df.drop('final_price', axis=1).values
X_val = df.drop('final_price', axis=1).values
X_test = df.drop('final_price', axis=1).values

y_train = df['final_price'].values
y_val = df['final_price'].values
y_test = df['final_price'].values

In [38]:
def train_linear_regression_regu(X, y, r):
    ones = np.ones(X.shape[0]) 
    X = np.column_stack([ones, X]) 

    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0]) 

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y) 

    return w[0], w[1:]

In [39]:
def rmse(y, y_pred):
    error = y - y_pred
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [40]:
reg = [0, 0.01, 0.1, 1, 5, 10, 100]
for r in reg:

    w0, w = train_linear_regression_regu(X_train, y_train, r)


    y_pred = w0 + X_val.dot(w)  #

    error = rmse(y_val, y_pred)

    print(f"r = {r}")
    print(f"w0 = {w0:.2f}")
    print(f"RMSE = {error:.2f}")

r = 0
w0 = 615.51
RMSE = 590.47
r = 0.01
w0 = 615.14
RMSE = 590.47
r = 0.1
w0 = 611.82
RMSE = 590.47
r = 1
w0 = 580.46
RMSE = 590.48
r = 5
w0 = 472.80
RMSE = 590.60
r = 10
w0 = 383.85
RMSE = 590.82
r = 100
w0 = 87.82
RMSE = 592.28


###Question 5

We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.

- Try different seed values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].

- For each seed, do the train/validation/test split with 60%/20%/20% distribution.

- Fill the missing values with 0 and train a model without regularization.

- For each seed, evaluate the model on the validation dataset and collect the RMSE scores.

- What's the standard deviation of all the scores? To compute the standard deviation, use np.std.

- Round the result to 3 decimal digits (round(std, 3))

In [41]:
def split_data(df, seed):
    n = np.arange(len(df))
    np.random.seed(seed)
    np.random.shuffle(n)

    n_val = int(len(df) * 0.2)
    n_test = int(len(df) * 0.2)
    n_train = len(df) - n_val - n_test

    train_idx = n[:n_train]
    val_idx = n[n_train:n_train + n_val]
    test_idx = n[n_train + n_val:]

    train_df = df.iloc[train_idx]
    val_df = df.iloc[val_idx]
    test_df = df.iloc[test_idx]

    return train_df, val_df, test_df

In [42]:
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

rmse_scores = []

for seed in seeds:
    
    train_df, val_df, test_df = split_data(df, seed)

    train_df = train_df.fillna(0)
    val_df = val_df.fillna(0)
    test_df = test_df.fillna(0)

    X_train = train_df.drop('final_price', axis=1).values
    X_val = val_df.drop('final_price', axis=1).values
    X_test = test_df.drop('final_price', axis=1).values

    y_train = train_df['final_price'].values
    y_val = val_df['final_price'].values
    y_test = test_df['final_price'].values

    w0, w = train_linear_regression_regu(X_train, y_train, r=0)

    y_pred_val = w0 + X_val.dot(w)

    val_rmse = rmse(y_val, y_pred_val)

    rmse_scores.append(val_rmse)

    print(f"{seed}, RMSE: {val_rmse:.2f}")

0, RMSE: 565.45
1, RMSE: 636.80
2, RMSE: 588.96
3, RMSE: 597.81
4, RMSE: 571.96
5, RMSE: 573.24
6, RMSE: 647.34
7, RMSE: 550.44
8, RMSE: 587.33
9, RMSE: 576.10


In [46]:
np.std(rmse_scores).round(3)

np.float64(29.176)

###Question 6

- Split the dataset like previously, use seed 9.

- Combine train and validation datasets.

- Fill the missing values with 0 and train a model with r=0.001.

- What's the RMSE on the test dataset?

In [47]:
df2 = pd.read_csv(FILEPATH)

# Make column names consistent
df2.columns = df2.columns.str.lower().str.replace(' ', '_')

df2 = df2[['ram','storage','screen','final_price']]
df2.head()

Unnamed: 0,ram,storage,screen,final_price
0,8,512,15.6,1009.0
1,8,256,15.6,299.0
2,8,256,15.6,789.0
3,16,1000,15.6,1199.0
4,16,512,15.6,669.01


In [48]:
n = np.arange(len(df2))
np.random.seed(9)
np.random.shuffle(n)

n_val = int(len(df2)*0.2)
n_test = int(len(df2)*0.2)
n_train = len(df2) - n_test

test_idx = n[:n_test]
train_idx = n[n_test:]

df2 = df2.fillna(0)
train_df = df2.iloc[train_idx]
test_df = df2.iloc[test_idx]

In [49]:
test_df.head()

Unnamed: 0,ram,storage,screen,final_price
2003,32,1000,15.6,1592.89
502,8,512,13.3,1349.0
896,8,512,15.6,549.0
356,32,1000,17.3,2999.0
499,16,1000,16.0,1889.0


In [50]:
X_train = train_df.drop('final_price', axis=1).values
X_test = test_df.drop('final_price', axis=1).values

y_train = train_df['final_price'].values
y_test = test_df['final_price'].values

w0, w = train_linear_regression_regu(X_train, y_train, r=0.001)

y_pred = w0 + X_test.dot(w)

test_rmse = rmse(y_test, y_pred)

print(f"Test RMSE: {test_rmse:.2f}")

Test RMSE: 535.08
