In [4]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv

--2024-10-03 22:32:01--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv
Распознаётся raw.githubusercontent.com (raw.githubusercontent.com)… 2606:50c0:8001::154, 2606:50c0:8003::154, 2606:50c0:8002::154, ...
Подключение к raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8001::154|:443... ^C


In [1]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('laptops.csv')
print(df.shape)
df.head()

(2160, 12)


Unnamed: 0,Laptop,Status,Brand,Model,CPU,RAM,Storage,Storage type,GPU,Screen,Touch,Final Price
0,ASUS ExpertBook B1 B1502CBA-EJ0436X Intel Core...,New,Asus,ExpertBook,Intel Core i5,8,512,SSD,,15.6,No,1009.0
1,Alurin Go Start Intel Celeron N4020/8GB/256GB ...,New,Alurin,Go,Intel Celeron,8,256,SSD,,15.6,No,299.0
2,ASUS ExpertBook B1 B1502CBA-EJ0424X Intel Core...,New,Asus,ExpertBook,Intel Core i3,8,256,SSD,,15.6,No,789.0
3,MSI Katana GF66 12UC-082XES Intel Core i7-1270...,New,MSI,Katana,Intel Core i7,16,1000,SSD,RTX 3050,15.6,No,1199.0
4,HP 15S-FQ5085NS Intel Core i5-1235U/16GB/512GB...,New,HP,15S,Intel Core i5,16,512,SSD,,15.6,No,669.01


In [3]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [4]:
COLUMNS_TO_USE = [
    'ram',
    'storage',
    'screen',
    'final_price'
]
df_slice = df[COLUMNS_TO_USE]

In [5]:
df_slice.isna().sum()

ram            0
storage        0
screen         4
final_price    0
dtype: int64

In [6]:
df_slice['ram'].median()

16.0

In [7]:
np.random.seed(42)

n = len(df_slice)

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df_slice.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()

y_train = df_train.pop('final_price')
y_val = df_val.pop('final_price')
y_test = df_test.pop('final_price')

We need to deal with missing values for the column from Q1.
We have two options: fill it with 0 or with the mean of this variable.
Try both options. For each, train a linear regression model without regularization using the code from the lessons.
For computing the mean, use the training only!
Use the validation dataset to evaluate the models and compare the RMSE of each option.
Round the RMSE scores to 2 decimal digits using round(score, 2)
Which option gives better RMSE?

In [17]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]


def train_linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]


def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [18]:
mean_screen_val = df_train['screen'].mean()
values_to_fillna = [0, mean_screen_val]

for val in values_to_fillna:
    X_train = df_train.fillna({'screen': val}).values
    w_0, w = train_linear_regression(X_train, y_train)

    y_pred = w_0 + X_train.dot(w)
    print('train:', rmse(y_train, y_pred))

    X_val = df_val.fillna({'screen': val}).values
    y_pred = w_0 + X_val.dot(w)
    print('validation:', rmse(y_val, y_pred))

train: 602.3920520606714
validation: 597.3635593619621
train: 602.1844959403247
validation: 600.2650313748422


Now let's train a regularized linear regression.
For this question, fill the NAs with 0.
Try different values of r from this list: [0, 0.01, 0.1, 1, 5, 10, 100].
Use RMSE to evaluate the model on the validation dataset.
Round the RMSE scores to 2 decimal digits.
Which r gives the best RMSE?
If there are multiple options, select the smallest r.

In [24]:
X_train = df_train.fillna({'screen': 0}).values
X_val = df_val.fillna({'screen': 0}).values

for r in [0, 0.01, 0.1, 1, 10, 100]:
    w_0, w = train_linear_regression_reg(X_train, y_train, r=r)

    y_pred = w_0 + X_train.dot(w)
    print('train:', round(rmse(y_train, y_pred), 2))

    y_pred = w_0 + X_val.dot(w)
    print('validation:', round(rmse(y_val, y_pred), 2))

train: 602.39
validation: 597.36
train: 602.39
validation: 597.36
train: 602.39
validation: 597.35
train: 602.41
validation: 597.21
train: 603.13
validation: 597.06
train: 604.99
validation: 597.9


We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
Try different seed values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].
For each seed, do the train/validation/test split with 60%/20%/20% distribution.
Fill the missing values with 0 and train a model without regularization.
For each seed, evaluate the model on the validation dataset and collect the RMSE scores.
What's the standard deviation of all the scores? To compute the standard deviation, use np.std.
Round the result to 3 decimal digits (round(std, 3))

In [27]:
rmse_values_arr = []

for seed in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
    np.random.seed(seed)

    n = len(df_slice)

    n_val = int(0.2 * n)
    n_test = int(0.2 * n)
    n_train = n - (n_val + n_test)

    idx = np.arange(n)
    np.random.shuffle(idx)

    df_shuffled = df_slice.iloc[idx]

    df_train = df_shuffled.iloc[:n_train].copy()
    df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
    df_test = df_shuffled.iloc[n_train+n_val:].copy()

    y_train = df_train.pop('final_price')
    y_val = df_val.pop('final_price')
    y_test = df_test.pop('final_price')
    
    X_train = df_train.fillna({'screen': 0}).values
    X_val = df_val.fillna({'screen': 0}).values
    
    w_0, w = train_linear_regression(X_train, y_train)
    y_pred = w_0 + X_val.dot(w)
    rmse_values_arr.append(rmse(y_val, y_pred))
    
np.std(rmse_values_arr), rmse_values_arr

(29.176491258292383,
 [565.4520868770995,
  636.7985423056726,
  588.955869790796,
  597.8148920012515,
  571.962791511103,
  573.238325661895,
  647.3438328407194,
  550.4398184485962,
  587.3335036169963,
  576.101792943313])

Split the dataset like previously, use seed 9.
Combine train and validation datasets.
Fill the missing values with 0 and train a model with r=0.001.
What's the RMSE on the test dataset?
Options:

598.60
608.60
618.60
628.60

In [30]:
np.random.seed(9)

n = len(df_slice)

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df_slice.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()

y_train = df_train.pop('final_price')
y_val = df_val.pop('final_price')
y_test = df_test.pop('final_price')

X_train = df_train.fillna({'screen': 0}).values
X_val = df_val.fillna({'screen': 0}).values
X_test = df_test.fillna({'screen': 0}).values

X_train_ttl = np.vstack([X_train, X_val])
y_train_ttl = np.hstack([y_train, y_val])

w_0, w = train_linear_regression(X_train_ttl, y_train_ttl)
y_pred = w_0 + X_test.dot(w)

print(round(rmse(y_test, y_pred), 2))

608.61
