### Imports

In [305]:
import numpy as np
import pandas as pd

In [306]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv

--2024-10-09 23:46:38--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 298573 (292K) [text/plain]
Saving to: ‘laptops.csv.1’


2024-10-09 23:46:38 (5,68 MB/s) - ‘laptops.csv.1’ saved [298573/298573]



### Preparing the dataset 

First, we'll normalize the names of the columns:

In [307]:
# Load the laptops dataset from the CSV file in the working directory.
df = pd.read_csv("laptops.csv")

In [308]:
# Column names exactly as they appear in the CSV header.
df.columns

Index(['Laptop', 'Status', 'Brand', 'Model', 'CPU', 'RAM', 'Storage',
       'Storage type', 'GPU', 'Screen', 'Touch', 'Final Price'],
      dtype='object')

In [309]:
# Normalize column names: lower-case, with spaces replaced by underscores.
df.columns = [name.lower().replace(' ', '_') for name in df.columns]

In [310]:
# Confirm the renaming: names should now be lower-case with underscores.
df.columns

Index(['laptop', 'status', 'brand', 'model', 'cpu', 'ram', 'storage',
       'storage_type', 'gpu', 'screen', 'touch', 'final_price'],
      dtype='object')

In [311]:
# Peek at the first five rows.
df.head(5)

Unnamed: 0,laptop,status,brand,model,cpu,ram,storage,storage_type,gpu,screen,touch,final_price
0,ASUS ExpertBook B1 B1502CBA-EJ0436X Intel Core...,New,Asus,ExpertBook,Intel Core i5,8,512,SSD,,15.6,No,1009.0
1,Alurin Go Start Intel Celeron N4020/8GB/256GB ...,New,Alurin,Go,Intel Celeron,8,256,SSD,,15.6,No,299.0
2,ASUS ExpertBook B1 B1502CBA-EJ0424X Intel Core...,New,Asus,ExpertBook,Intel Core i3,8,256,SSD,,15.6,No,789.0
3,MSI Katana GF66 12UC-082XES Intel Core i7-1270...,New,MSI,Katana,Intel Core i7,16,1000,SSD,RTX 3050,15.6,No,1199.0
4,HP 15S-FQ5085NS Intel Core i5-1235U/16GB/512GB...,New,HP,15S,Intel Core i5,16,512,SSD,,15.6,No,669.01


### EDA

* Look at the `final_price` variable. Does it have a long tail? 


### Question 1

There's one column with missing values. What is it?


* `'ram'`
* `'storage'`
* `'screen'`
* `'final_price'`


In [312]:
# Number of missing values in each column.
df.isna().sum()

laptop             0
status             0
brand              0
model              0
cpu                0
ram                0
storage            0
storage_type      42
gpu             1371
screen             4
touch              0
final_price        0
dtype: int64


* `'ram'`
* `'storage'`
* **`'screen'`**
* `'final_price'`


### Question 2

What's the median (50% percentile) for variable `'ram'`?

- 8
- 16
- 24
- 32

In [313]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,ram,storage,screen,final_price
count,2160.0,2160.0,2156.0,2160.0
mean,15.413889,596.294444,15.168112,1312.638509
std,9.867815,361.220506,1.203329,911.475417
min,4.0,0.0,10.1,201.05
25%,8.0,256.0,14.0,661.0825
50%,16.0,512.0,15.6,1031.945
75%,16.0,1000.0,15.6,1708.97
max,128.0,4000.0,18.0,7150.47


In [314]:
# Summary for `ram` alone; the 50% row is the median asked for in Question 2.
df[["ram"]].describe()

Unnamed: 0,ram
count,2160.0
mean,15.413889
std,9.867815
min,4.0
25%,8.0
50%,16.0
75%,16.0
max,128.0


- 8
- **16**
- 24
- 32

### Prepare and split the dataset

* Shuffle the dataset (the filtered one you created above), use seed `42`.
* Split your data in train/val/test sets, with 60%/20%/20% distribution.

Use the same code as in the lectures

In [315]:
def split_train_val_test(df, test_size=0.2, val_size=0.2, seed=42):
    """Shuffle ``df`` and split it into train / validation / test frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; it is not modified.
    test_size : float, default 0.2
        Fraction of the rows assigned to the test part.
    val_size : float, default 0.2
        Fraction of the rows assigned to the validation part.
    seed : int, default 42
        Seed passed to ``np.random.seed`` before the row positions are shuffled.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_val, df_test)``, each with a fresh ``0..k-1`` index.
        The training part receives every row not taken by the other two.

    Raises
    ------
    ValueError
        If a fraction is negative or the two fractions sum to more than 1.
    """
    if val_size < 0 or test_size < 0 or val_size + test_size > 1:
        raise ValueError(
            f"val_size and test_size must be non-negative and sum to at most 1, "
            f"got val_size={val_size}, test_size={test_size}"
        )

    # Sizes: honour the requested fractions instead of a hard-coded 0.2.
    n = len(df)
    n_val = int(n * val_size)
    n_test = int(n * test_size)
    n_train = n - n_val - n_test

    def share(part):
        # Guard the empty-frame case so the log line cannot divide by zero.
        return part / n if n else 0.0

    print(
        f"total: {n}, train: {n_train} ({share(n_train):.0%}), "
        f"val: {n_val} ({share(n_val):.0%}), "
        f"test: {n_test} ({share(n_test):.0%}) seed: {seed}"
    )

    # Indices: kept on the legacy global NumPy RNG so that a given seed keeps
    # producing the same shuffle as before.
    idx = np.arange(n)
    np.random.seed(seed)
    np.random.shuffle(idx)

    # Partition the shuffled positions into three consecutive slices.
    val_end = n_train + n_val
    df_train = df.iloc[idx[:n_train]]
    df_val = df.iloc[idx[n_train:val_end]]
    df_test = df.iloc[idx[val_end:]]

    # Reset index so each part is numbered from 0.
    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    return df_train, df_val, df_test

In [316]:
# 60/20/20 split using the seed the assignment prescribes (42).
df_train, df_val, df_test = split_train_val_test(df, test_size=0.2, val_size=0.2, seed=42)

total: 2160, train: 1296 (0.6%), val: 432 (0.2%), test: 432 (0.2%) seed: 42


In [317]:
# Sanity check: shapes of the validation and test frames.
df_val.shape, df_test.shape

((432, 12), (432, 12))

In [318]:
# Targets are the raw prices; no log1p transform is applied.
y_train, y_val, y_test = (
    part.final_price.values for part in (df_train, df_val, df_test)
)

### Question 3


* We need to deal with missing values for the column from Q1.
* We have two options: fill it with 0 or with the mean of this variable.
* Try both options. For each, train a linear regression model without regularization using the code from the lessons.
* For computing the mean, use the training only!
* Use the validation dataset to evaluate the models and compare the RMSE of each option.
* Round the RMSE scores to 2 decimal digits using `round(score, 2)`
* Which option gives better RMSE?

Options:

- With 0
- With mean
- Both are equally good

In [319]:
# Names of every column that contains at least one missing value.
nan_cols = df.columns[df.isnull().any().to_numpy()].tolist()
nan_cols

['storage_type', 'gpu', 'screen']

In [320]:
def rmse(y, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Both arguments are combined element-wise (``y - y_pred``), squared,
    averaged with ``.mean()`` and passed through ``np.sqrt``.
    """
    residual = y - y_pred
    return np.sqrt((residual ** 2).mean())

In [321]:
def train_linear_regression(X, y):
    """Fit ordinary least squares via the normal equation.

    A column of ones is prepended to ``X`` so the first weight acts as the
    bias. The weights are computed as ``inv(A.T A) A.T y`` with ``A`` being
    the extended matrix.

    Returns
    -------
    tuple
        ``(w0, w)``: the bias term and the array of feature weights.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])
    gram = design.T @ design
    weights = np.linalg.inv(gram) @ design.T @ y
    return weights[0], weights[1:]

In [322]:
# Feature columns that prepare_X / prepare_X_with_mean select to build the model matrix.
BASE = ['ram', 'storage', 'screen']

#### With 0

In [323]:
def prepare_X(df):
    """Return the ``BASE`` columns of ``df`` as an array, with missing values set to 0."""
    return df[BASE].fillna(0).values

In [324]:
# Q3, option "0": zero-filled features, plain (unregularized) least squares.
X_train = prepare_X(df_train)
X_val = prepare_X(df_val)

w0, w = train_linear_regression(X_train, y_train)
y_pred = X_val.dot(w) + w0
score = rmse(y_val, y_pred)

round(score, 2)

597.36

In [325]:
#zero_columns = np.any(X_train == 0, axis=0)
#zero_columns

#### Mean

In [326]:
def prepare_X_with_mean(df, means=None):
    """Return the ``BASE`` columns of ``df`` as an array, imputing missing values with means.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``BASE`` columns.
    means : pandas.Series or mapping, optional
        Fill value per column. Pass the means computed on the training frame
        when preparing validation or test data, so the imputation does not use
        statistics of the split being evaluated. When omitted, the means are
        computed from ``df`` itself.

    Returns
    -------
    numpy.ndarray
        The values of the selected columns after filling.
    """
    df_num = df[BASE]
    if means is None:
        means = df_num.mean()
    # fillna with a Series/mapping fills each column with the value under its label.
    return df_num.fillna(means).values

In [327]:
# Q3, option "mean": impute with means computed on the TRAINING part only and
# train the same unregularized model as the zero-fill run, so the two RMSEs
# are directly comparable.
train_means = df_train[BASE].mean()

X_train = prepare_X_with_mean(df_train)
w0, w = train_linear_regression(X_train, y_train)

# Validation rows are filled with the training means, not with 0 and not with
# the validation set's own means.
X_val = df_val[BASE].fillna(train_means).values
y_pred = w0 + X_val.dot(w)
score = rmse(y_val, y_pred)

round(score, 2)

597.74

In [328]:
#zero_columns = np.any(X_train == df.screen.mean(), axis=0)
#df_train.isnull().sum()

Options:

- **With 0**
- With mean
- Both are equally good

### Question 4



* Now let's train a regularized linear regression.
* For this question, fill the NAs with 0. 
* Try different values of `r` from this list: `[0, 0.01, 0.1, 1, 5, 10, 100]`.
* Use RMSE to evaluate the model on the validation dataset.
* Round the RMSE scores to 2 decimal digits.
* Which `r` gives the best RMSE?

If there are multiple options, select the smallest `r`.

Options:

- 0
- 0.01
- 1
- 10
- 100

In [329]:
def train_linear_regression_reg(X, y, r=0.001):
    """Fit least squares via the normal equation with ``r`` added to the diagonal.

    A column of ones is prepended to ``X`` so the first weight acts as the
    bias. ``r * I`` is added to the whole ``A.T A`` matrix, so the bias entry
    on the diagonal is shifted by ``r`` as well.

    Returns
    -------
    tuple
        ``(w0, w)``: the bias term and the array of feature weights.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])

    gram = design.T @ design
    gram = gram + r * np.eye(gram.shape[0])

    weights = np.linalg.inv(gram) @ design.T @ y
    return weights[0], weights[1:]

In [330]:
rs = [0, 0.01, 0.1, 1, 5, 10, 100]

# The zero-filled feature matrices do not depend on r, so build them once
# instead of on every pass through the loop.
X_train = prepare_X(df_train)
X_val = prepare_X(df_val)

for r in rs:
    w0, w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = w0 + X_val.dot(w)
    score = rmse(y_val, y_pred)

    # Columns: r, bias term, raw RMSE, RMSE rounded to 2 decimals.
    print(r, w0, score, round(score, 2))

0 676.8954853003995 597.3635593619622 597.36
0.01 676.2736817205587 597.3616819856013 597.36
0.1 670.7284836314063 597.345159296362 597.35
1 619.9030834108207 597.2121215589519 597.21
5 463.7771697142356 597.0111186297033 597.01
10 352.79713367679835 597.0587680661115 597.06
100 66.92071440181994 597.9032640603043 597.9


In [338]:
# Re-check a single regularization strength (the first entry of rs) outside the sweep.
# NOTE(review): this cell relies on df_train / df_val / y_train / y_val still holding
# the seed-42 split. The recorded output (576.10...) equals the seed-9 entry of
# `results` rather than the 597.36... the sweep reports for r=0, so it appears to
# have been executed after the seed loop overwrote those variables. Re-run the
# notebook top to bottom to confirm.
r = rs[0]
print(f"r={r}")
X_train = prepare_X(df_train)
w0, w = train_linear_regression_reg(X_train, y_train, r=r)

X_val = prepare_X(df_val)
y_pred = w0 + X_val.dot(w)
score = rmse(y_val, y_pred)
score

r=0


576.1017929433108

- 0
- 0.01
- 1
- **10**
- 100

### Question 5 


* We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
* Try different seed values: `[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]`.
* For each seed, do the train/validation/test split with 60%/20%/20% distribution.
* Fill the missing values with 0 and train a model without regularization.
* For each seed, evaluate the model on the validation dataset and collect the RMSE scores. 
* What's the standard deviation of all the scores? To compute the standard deviation, use `np.std`.
* Round the result to 3 decimal digits (`round(std, 3)`)

What's the value of std?

- 19.176
- 29.176
- 39.176
- 49.176

> Note: Standard deviation shows how different the values are.
> If it's low, then all values are approximately the same.
> If it's high, the values are different. 
> If standard deviation of scores is low, then our model is *stable*.

In [332]:
# Q5: repeat the split / fit / validate cycle for seeds 0..9 and collect the
# validation RMSE of each run. Note that this loop rebinds df_train, df_val,
# df_test and the y_* arrays on every iteration; after it finishes they hold
# the seed-9 split.
seed_list = list(range(10))
results = []
for seed in seed_list:
    df_train, df_val, df_test = split_train_val_test(df, test_size=0.2, val_size=0.2, seed=seed)
    y_train, y_val, y_test = (
        part.final_price.values for part in (df_train, df_val, df_test)
    )

    X_train = prepare_X(df_train)
    X_val = prepare_X(df_val)

    w0, w = train_linear_regression(X_train, y_train)
    y_pred = X_val.dot(w) + w0
    score = rmse(y_val, y_pred)

    results.append(score)

total: 2160, train: 1296 (0.6%), val: 432 (0.2%), test: 432 (0.2%) seed: 0
total: 2160, train: 1296 (0.6%), val: 432 (0.2%), test: 432 (0.2%) seed: 1
total: 2160, train: 1296 (0.6%), val: 432 (0.2%), test: 432 (0.2%) seed: 2
total: 2160, train: 1296 (0.6%), val: 432 (0.2%), test: 432 (0.2%) seed: 3
total: 2160, train: 1296 (0.6%), val: 432 (0.2%), test: 432 (0.2%) seed: 4
total: 2160, train: 1296 (0.6%), val: 432 (0.2%), test: 432 (0.2%) seed: 5
total: 2160, train: 1296 (0.6%), val: 432 (0.2%), test: 432 (0.2%) seed: 6
total: 2160, train: 1296 (0.6%), val: 432 (0.2%), test: 432 (0.2%) seed: 7
total: 2160, train: 1296 (0.6%), val: 432 (0.2%), test: 432 (0.2%) seed: 8
total: 2160, train: 1296 (0.6%), val: 432 (0.2%), test: 432 (0.2%) seed: 9


In [333]:
# One validation RMSE per seed, in the order of seed_list.
results

[565.4520868771027,
 636.7985423056726,
 588.9558697907962,
 597.8148920012521,
 571.962791511102,
 573.2383256618949,
 647.3438328407208,
 550.4398184485952,
 587.333503616991,
 576.1017929433108]

In [334]:
# Spread of the validation RMSE across the seeds.
res = np.std(results)
round(res, 3)

29.176


- 19.176
- **29.176**
- 39.176
- 49.176

### Question 6


* Split the dataset like previously, use seed 9.
* Combine train and validation datasets.
* Fill the missing values with 0 and train a model with `r=0.001`. 
* What's the RMSE on the test dataset?

Options:

- 598.60
- 608.60
- 618.60
- 628.60

In [335]:
# Q6: re-split with seed 9, then merge train and validation into one training frame.
seed, r = 9, 0.001
df_train, df_val, df_test = split_train_val_test(df, test_size=0.2, val_size=0.2, seed=seed)
df_full_train = pd.concat([df_train, df_val], ignore_index=True)

# Raw-price targets for each part; the combined target follows the same
# train-then-validation order as df_full_train.
y_train, y_val, y_test = (
    part.final_price.values for part in (df_train, df_val, df_test)
)
y_full_train = np.concatenate([y_train, y_val])


total: 2160, train: 1296 (0.6%), val: 432 (0.2%), test: 432 (0.2%) seed: 9


In [336]:
# Fit on train+validation with zero-filled features and the regularization strength r set above.
X_full_train = prepare_X(df_full_train)
w0, w = train_linear_regression_reg(X_full_train, y_full_train, r=r)

In [337]:
# Score the final model on the held-out test part.
X_test = prepare_X(df_test)
y_pred = X_test.dot(w) + w0
score = rmse(y_test, y_pred)
print(score)

608.609982204956


Options:

- 598.60
- **608.60**
- 618.60
- 628.60