In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv("laptops.csv")

In [3]:
df.head()

Unnamed: 0,Laptop,Status,Brand,Model,CPU,RAM,Storage,Storage type,GPU,Screen,Touch,Final Price
0,ASUS ExpertBook B1 B1502CBA-EJ0436X Intel Core...,New,Asus,ExpertBook,Intel Core i5,8,512,SSD,,15.6,No,1009.0
1,Alurin Go Start Intel Celeron N4020/8GB/256GB ...,New,Alurin,Go,Intel Celeron,8,256,SSD,,15.6,No,299.0
2,ASUS ExpertBook B1 B1502CBA-EJ0424X Intel Core...,New,Asus,ExpertBook,Intel Core i3,8,256,SSD,,15.6,No,789.0
3,MSI Katana GF66 12UC-082XES Intel Core i7-1270...,New,MSI,Katana,Intel Core i7,16,1000,SSD,RTX 3050,15.6,No,1199.0
4,HP 15S-FQ5085NS Intel Core i5-1235U/16GB/512GB...,New,HP,15S,Intel Core i5,16,512,SSD,,15.6,No,669.01


In [4]:
df.columns = df.columns.str.lower().str.replace(' ', '_')
df = df[['ram','storage','screen','final_price']]

In [5]:
df

Unnamed: 0,ram,storage,screen,final_price
0,8,512,15.6,1009.00
1,8,256,15.6,299.00
2,8,256,15.6,789.00
3,16,1000,15.6,1199.00
4,16,512,15.6,669.01
...,...,...,...,...
2155,16,1000,17.3,2699.99
2156,16,1000,17.3,2899.99
2157,32,1000,17.3,3399.99
2158,16,1000,13.4,1899.99


df.info()

Question 1 : 

There's one column with missing values. What is it?

In [6]:
df.isnull().sum()

ram            0
storage        0
screen         4
final_price    0
dtype: int64

Question 2 :


What's the median (50% percentile) for variable 'ram'?

In [7]:
df.ram.median()

np.float64(16.0)

Prepare and split the dataset

Shuffle the dataset (the filtered one you created above), use seed 42.
Split your data in train/val/test sets, with 60%/20%/20% distribution.
Use the same code as in the lectures

Question 3 :

We need to deal with missing values for the column from Q1.
We have two options: fill it with 0 or with the mean of this variable.
Try both options. For each, train a linear regression model without regularization using the code from the lessons.
For computing the mean, use the training only!
Use the validation dataset to evaluate the models and compare the RMSE of each option.
Round the RMSE scores to 2 decimal digits using round(score, 2)
Which option gives better RMSE?

In [8]:
np.random.seed(42)
n = len(df)
n_test = int(0.2 * n)
n_valid = int(0.2 * n)
n_train = n -(n_valid + n_test)

idx = np.arange(n)
np.random.shuffle(idx)
df_shuffled = df.iloc[idx]
df_test = df_shuffled[:n_train].copy()
df_val = df_shuffled[n_train : n_train + n_valid].copy()
df_train = df_shuffled[n_train + n_valid :].copy()
# df_train = df.iloc[idx[:n_train]]
# df_val = df.iloc[idx[n_train:n_train+n_val]]
# df_test = df.iloc[idx[n_train+n_val:]]

In [9]:
df_train.head()

Unnamed: 0,ram,storage,screen,final_price
1925,32,1000,15.6,4805.65
612,16,512,13.0,2189.0
443,16,512,15.6,1349.0
1002,16,512,15.0,2175.83
1077,16,1000,15.6,1248.59


In [10]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 432 entries, 1925 to 860
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ram          432 non-null    int64  
 1   storage      432 non-null    int64  
 2   screen       432 non-null    float64
 3   final_price  432 non-null    float64
dtypes: float64(2), int64(2)
memory usage: 16.9 KB


In [11]:
y_train = df_train.final_price.values
y_val = df_val.final_price.values
y_test = df_test.final_price.values


del df_train['final_price']
del df_val['final_price']
del df_test['final_price']

In [12]:
# Step 2: Find rows with any null values
rows_with_nulls = df_train[df_train.isnull().any(axis=1)]

print(rows_with_nulls)

Empty DataFrame
Columns: [ram, storage, screen]
Index: []


In [13]:
def prepare_X(df):
    df = df.copy()
    df = df.fillna(0)
    X = df.values
    return X

In [14]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)

    return w[0], w[1:]

In [15]:
def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [16]:
X_train = prepare_X(df_train)
w_0, w = train_linear_regression(X_train, y_train)

y_pred = w_0 + X_train.dot(w)
print('train:', np.round(rmse(y_train, y_pred)))

X_val = prepare_X(df_val)
y_pred = w_0 + X_val.dot(w)
print('validation:', np.round(rmse(y_val, y_pred)))

train: 546.0
validation: 600.0


In [17]:
score = np.round(rmse(y_val, y_pred) , 2 )
score

np.float64(599.84)

In [18]:
def prepare_X(df):
    df = df.copy()
    df = df.fillna(df_train.mean())
    X = df.values
    return X

In [19]:
X_train = prepare_X(df_train)
w_0, w = train_linear_regression(X_train, y_train)

y_pred = w_0 + X_train.dot(w)
print('train:', np.round(rmse(y_train, y_pred)))

X_val = prepare_X(df_val)
y_pred = w_0 + X_val.dot(w)
print('validation:', np.round(rmse(y_val, y_pred)))

train: 546.0
validation: 602.0


Question 4

Now let's train a regularized linear regression.
For this question, fill the NAs with 0.
Try different values of r from this list: [0, 0.01, 0.1, 1, 5, 10, 100].

Use RMSE to evaluate the model on the validation dataset.
Round the RMSE scores to 2 decimal digits.
Which r gives the best RMSE?

If there are multiple options, select the smallest r

In [20]:
def prepare_X(df):
    df = df.copy()
    df = df.fillna(0)
    X = df.values
    return X

In [21]:
def train_linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)

    return w[0], w[1:]

In [22]:
X_train = prepare_X(df_train)
X_val = prepare_X(df_val)

for r in [0, 0.01, 0.1, 1, 5, 10, 100]:
    w_0, w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = w_0 + X_val.dot(w)
    print('%6s' %r, np.round(rmse(y_val, y_pred),2))

     0 599.84
  0.01 599.84
   0.1 599.81
     1 599.79
     5 600.24
    10 600.54
   100 601.04


Question 5 :

We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.

Try different seed values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].
For each seed, do the train/validation/test split with 60%/20%/20% distribution.
Fill the missing values with 0 and train a model without regularization.

For each seed, evaluate the model on the validation dataset and collect the RMSE scores.
What's the standard deviation of all the scores? To compute the standard deviation, use np.std.
Round the result to 3 decimal digits (round(std, 3))

What's the value of std?

In [25]:
def prepare_X(df):
    df = df.copy()
    df = df.fillna(0)
    X = df.values
    return X
rms_scores = np.empty(0)

for seed in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
    np.random.seed(seed)
    idx = np.arange(n)
    np.random.shuffle(idx)
    df_shuffled = df.iloc[idx]

    df_train = df_shuffled.iloc[:n_train].copy()
    df_val = df_shuffled.iloc[n_train:n_train+n_valid].copy()
    df_test = df_shuffled.iloc[n_train+n_valid:].copy()


    y_train = df_train.final_price.values
    y_val = df_val.final_price.values
    y_test = df_test.final_price.values


    del df_train['final_price']
    del df_val['final_price']
    del df_test['final_price']

    X_train = prepare_X(df_train)
    w_0, w = train_linear_regression(X_train, y_train)

    print(f" \n The Seed is {seed}")

    X_val = prepare_X(df_val)
    y_pred = w_0 + X_val.dot(w)
    rms = rmse(y_val, y_pred)
    print('validation:', rms)
    rms_scores = np.append(rms_scores,rms)


print(np.round (np.std(rms_scores),3))

 
 The Seed is 0
validation: 565.4520868770983
 
 The Seed is 1
validation: 636.7985423056728
 
 The Seed is 2
validation: 588.9558697907958
 
 The Seed is 3
validation: 597.8148920012524
 
 The Seed is 4
validation: 571.9627915111038
 
 The Seed is 5
validation: 573.2383256618933
 
 The Seed is 6
validation: 647.343832840719
 
 The Seed is 7
validation: 550.4398184485982
 
 The Seed is 8
validation: 587.3335036169883
 
 The Seed is 9
validation: 576.1017929433114
29.176


Question 6 :

Split the dataset like previously, use seed 9.

Combine train and validation datasets.
Fill the missing values with 0 and train a model with r=0.001.
What's the RMSE on the test dataset?

In [30]:
def prepare_X(df):
    df = df.copy()
    df = df.fillna(0)
    X = df.values
    return X


np.random.seed(9)
idx = np.arange(n)
np.random.shuffle(idx)
df_shuffled = df.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_valid].copy()
df_test = df_shuffled.iloc[n_train+n_valid:].copy()

y_train = df_train.final_price.values
y_val = df_val.final_price.values
y_test = df_test.final_price.values

df_train = pd.concat([df_train, df_val], ignore_index=True)
y_train = np.concatenate((y_train,y_val))


del df_train['final_price']
del df_test['final_price']

X_train = prepare_X(df_train)
w_0, w = train_linear_regression_reg(X_train, y_train,r=0.001)


X_test = prepare_X(df_test)
y_pred = w_0 + X_test.dot(w)
rms = rmse(y_test, y_pred)
print('RMSE in test data  :', np.round(rms,2))

RMSE in test data  : 608.61
