In [1]:
import numpy as np
import pandas as pd

In [2]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv

--2024-10-08 14:34:01--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 298573 (292K) [text/plain]
Saving to: ‘laptops.csv’


2024-10-08 14:34:02 (10.1 MB/s) - ‘laptops.csv’ saved [298573/298573]



In [3]:
df = pd.read_csv('laptops.csv')

In [5]:
df.head()

Unnamed: 0,Laptop,Status,Brand,Model,CPU,RAM,Storage,Storage type,GPU,Screen,Touch,Final Price
0,ASUS ExpertBook B1 B1502CBA-EJ0436X Intel Core...,New,Asus,ExpertBook,Intel Core i5,8,512,SSD,,15.6,No,1009.0
1,Alurin Go Start Intel Celeron N4020/8GB/256GB ...,New,Alurin,Go,Intel Celeron,8,256,SSD,,15.6,No,299.0
2,ASUS ExpertBook B1 B1502CBA-EJ0424X Intel Core...,New,Asus,ExpertBook,Intel Core i3,8,256,SSD,,15.6,No,789.0
3,MSI Katana GF66 12UC-082XES Intel Core i7-1270...,New,MSI,Katana,Intel Core i7,16,1000,SSD,RTX 3050,15.6,No,1199.0
4,HP 15S-FQ5085NS Intel Core i5-1235U/16GB/512GB...,New,HP,15S,Intel Core i5,16,512,SSD,,15.6,No,669.01


In [6]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [7]:
columns = ['ram', 'storage', 'screen', 'final_price']

In [8]:
df[columns].head()

Unnamed: 0,ram,storage,screen,final_price
0,8,512,15.6,1009.0
1,8,256,15.6,299.0
2,8,256,15.6,789.0
3,16,1000,15.6,1199.0
4,16,512,15.6,669.01


In [11]:
df[columns].isnull().sum()

Unnamed: 0,0
ram,0
storage,0
screen,4
final_price,0


In [12]:
df['ram'].describe()

Unnamed: 0,ram
count,2160.0
mean,15.413889
std,9.867815
min,4.0
25%,8.0
50%,16.0
75%,16.0
max,128.0


In [17]:
np.random.seed(42)
n = np.arange(len(df[columns]))
np.random.shuffle(n)
n_train = int(len(df[columns]) * 0.6)
n_val = int((len(df[columns]) - n_train) // 2)
n_test = int((len(df[columns]) - n_train) // 2)
print(n_train + n_val + n_test, len(df[columns]))

2160 2160


In [22]:
df_train = df[columns].iloc[n[:n_train]]
df_val = df[columns].iloc[n[n_train:n_train+n_val]]
df_test = df[columns].iloc[n[n_train + n_val:]]
y_train = df_train["final_price"]
y_val = df_val["final_price"]
y_test = df_test["final_price"]

del df_train["final_price"]
del df_val["final_price"]
del df_test["final_price"]

In [19]:
df_train_zeros = df_train.fillna(0)

In [25]:
df_train_mean = df_train["screen"].fillna(df_train["screen"].mean())

In [32]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)

    return w[0], w[1:]

def prepare_X_mean(df):
    df = df.copy()
    df["screen"] = df["screen"].fillna(df_train["screen"].mean())
    return df

def prepare_X_zeros(df):
    df = df.copy()
    df["screen"] = df["screen"].fillna(0)
    return df

def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

def train_linear_regression_reg(X, y, r=0.001):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])

    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)

    return w_full[0], w_full[1:]

600.27

In [30]:
X_train_zeros = prepare_X_zeros(df_train)
w0, w = train_linear_regression(X_train_zeros, y_train)
X_val_zeros = prepare_X_mean(df_val)
val_zero_pred = w0 + X_val_zeros.dot(w)
round(rmse(y_val, val_zero_pred), 2)

599.53

In [33]:
regularized  = [0, 0.01, 0.1, 1, 5, 10, 100]
for r in regularized:
    X_train = prepare_X_zeros(df_train)
    w0, w = train_linear_regression_reg(X_train, y_train, r)
    X_val = prepare_X_zeros(df_val)
    train_zero_pred = w0 + X_val.dot(w)
    print(r, round(rmse(y_val, train_zero_pred), 2))

0 597.36
0.01 597.36
0.1 597.35
1 597.21
5 597.01
10 597.06
100 597.9


In [36]:
def split_data(seed = 42):
    np.random.seed(seed)
    n = np.arange(len(df))
    np.random.shuffle(n)

    n_train = int(len(df[columns]) * 0.6)
    n_val = int((len(df[columns]) - n_train) // 2)
    n_test = int((len(df[columns]) - n_train) // 2)

    df_train = df[columns].iloc[n[:n_train]]
    df_val = df[columns].iloc[n[n_train:n_train+n_val]]
    df_test = df[columns].iloc[n[n_train + n_val:]]
    y_train = df_train["final_price"]
    y_val = df_val["final_price"]
    y_test = df_test["final_price"]

    del df_train["final_price"]
    del df_val["final_price"]
    del df_test["final_price"]
    return df_train, df_val, df_test, y_train, y_val, y_test

In [37]:
error = []
for i in range(10):
    df_train, df_val, df_test, Y_train, Y_val, Y_test = split_data(i)
    X_train = prepare_X_zeros(df_train)
    w0, w = train_linear_regression(X_train, Y_train)
    X_val = prepare_X_zeros(df_val)
    val_pred = w0 + X_val.dot(w)
    print(i, rmse(Y_val, val_pred))
    error.append(rmse(Y_val, val_pred))

0 565.4520868771027
1 636.7985423056726
2 588.9558697907962
3 597.8148920012521
4 571.962791511102
5 573.2383256618949
6 647.3438328407208
7 550.4398184485952
8 587.333503616991
9 576.1017929433108


In [38]:
round(np.std(error), 3)

29.176

In [39]:
df_train, df_val, df_test, Y_train, Y_val, Y_test = split_data(9)
r=0.001
df_tr_val = pd.concat([df_train, df_val])
Y_tr_val = pd.concat([Y_train, Y_val])
X_tr_val = df_tr_val.fillna(0)
w0, w = train_linear_regression_reg(X_tr_val, Y_tr_val)
X_test = df_test.fillna(0)
test_pred = w0 + X_test.dot(w)

rmse(Y_test, test_pred)

608.609982204956