In [58]:
import pandas as pd
import numpy as np

In [59]:
# NOTE(review): absolute, machine-specific path - the notebook only runs where this exact file exists.
laptop_df = pd.read_csv(r'C:\Users\Виталий\PycharmProjects\ml_zoomcamp_2024\laptops.csv')
# Normalise the headers: lower-case them and replace spaces with underscores.
laptop_df = laptop_df.rename(columns=lambda name: name.lower().replace(' ', '_'))
laptop_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2160 entries, 0 to 2159
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   laptop        2160 non-null   object 
 1   status        2160 non-null   object 
 2   brand         2160 non-null   object 
 3   model         2160 non-null   object 
 4   cpu           2160 non-null   object 
 5   ram           2160 non-null   int64  
 6   storage       2160 non-null   int64  
 7   storage_type  2118 non-null   object 
 8   gpu           789 non-null    object 
 9   screen        2156 non-null   float64
 10  touch         2160 non-null   object 
 11  final_price   2160 non-null   float64
dtypes: float64(2), int64(2), object(8)
memory usage: 202.6+ KB


In [60]:
# Keep only the three numeric features plus the target column.
base_columns = ['ram', 'storage', 'screen', 'final_price']
laptop_df = laptop_df.loc[:, base_columns]
laptop_df.head()

Unnamed: 0,ram,storage,screen,final_price
0,8,512,15.6,1009.0
1,8,256,15.6,299.0
2,8,256,15.6,789.0
3,16,1000,15.6,1199.0
4,16,512,15.6,669.01


## Question 1

In [61]:
# Per-column flag: True where the column contains at least one missing value.
has_nan = laptop_df.isna().any(axis=0)
df_nan_columns = laptop_df.loc[:, has_nan]
print(df_nan_columns.columns)

Index(['screen'], dtype='object')


## Question 2

In [62]:
# Median of the `ram` column over the full laptop_df.
median_ram = laptop_df.ram.median()
print(f"Median of RAM: {median_ram}")

Median fo RAM: 16.0


## Prepare the dataset

In [63]:
n = len(laptop_df)

# 20% each for validation and test (rounded down); training takes whatever remains.
n_val = n_test = int(n * 0.2)
n_train = n - (n_val + n_test)
print(n_train, n_val, n_test)

1296 432 432


## Shuffle

In [64]:
# Fix the global NumPy seed so the in-place shuffle below is reproducible.
np.random.seed(42)
idx = np.arange(n)
np.random.shuffle(idx)  # permutes the row positions in place
idx

array([2079,  668, 2073, ..., 1130, 1294,  860])

## Split

In [65]:
# Cut the shuffled positions into three consecutive runs: train | validation | test.
train_idx, valid_idx, test_idx = np.split(idx, [n_train, n_train + n_val])
train_ds = laptop_df.iloc[train_idx]
valid_ds = laptop_df.iloc[valid_idx]
test_ds = laptop_df.iloc[test_idx]

In [66]:
# Preview the training split; at this point it still carries the shuffled original row labels.
train_ds.head()

Unnamed: 0,ram,storage,screen,final_price
2079,32,1000,15.6,1123.29
668,4,64,14.1,201.05
2073,32,1000,14.0,997.74
1113,16,512,13.3,1016.0
788,32,1000,16.0,2739.0


In [67]:
# Replace the shuffled row labels of every split with a fresh 0..len-1 index.
train_ds, valid_ds, test_ds = (
    part.reset_index(drop=True) for part in (train_ds, valid_ds, test_ds)
)
train_ds.head()

Unnamed: 0,ram,storage,screen,final_price
0,32,1000,15.6,1123.29
1,4,64,14.1,201.05
2,32,1000,14.0,997.74
3,16,512,13.3,1016.0
4,32,1000,16.0,2739.0


In [68]:
# Pull the target out of each split (removing the column from the frame in place)
# and move it to log(1 + price) space.
# Re-running this cell fails because `final_price` is already gone from the frames.
y_train = np.log1p(train_ds.pop('final_price').values)
y_val = np.log1p(valid_ds.pop('final_price').values)
y_test = np.log1p(test_ds.pop('final_price').values)

## Question 3

In [69]:
def linear_regression(X, y):
    """Fit ordinary least squares with an intercept via the normal equations.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix without a bias column; a column of ones is prepended here.
    y : array-like of shape (n_samples,)
        Target values.

    Returns
    -------
    tuple
        ``(w_0, w)``: the intercept and the array of per-feature weights.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``X^T X`` is singular.
    """
    X = np.asarray(X)
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    XTX = X.T.dot(X)
    # Solve (X^T X) w = X^T y directly; forming the explicit inverse is slower
    # and numerically less stable.
    w = np.linalg.solve(XTX, X.T.dot(y))

    return w[0], w[1:]

In [70]:
def rmse(y, y_pred):
    """Return the root mean squared error between targets and predictions.

    ``y_pred - y`` must produce an object with a ``.mean()`` method
    (e.g. a NumPy array or a pandas Series).
    """
    squared_residuals = (y_pred - y) ** 2
    return np.sqrt(squared_residuals.mean())

## Filling with 0

In [71]:
# Imputation option 1: every missing value in the feature frames becomes 0.
train_ds_filling_zero, valid_ds_filling_zero = (
    part.fillna(0) for part in (train_ds, valid_ds)
)

In [72]:
# Fit on the zero-filled training features, then score on the zero-filled validation features.
w_0, w = linear_regression(train_ds_filling_zero, y_train)
y_pred_val = valid_ds_filling_zero @ w + w_0
np.round(rmse(y_val, y_pred_val), 2)

0.43

## Filling with the mean

In [73]:
# The imputation value is the `screen` mean of the training split; the same
# value is reused for the validation split.
mean = train_ds.screen.mean()

# Restrict the fill to `screen`: a bare fillna(mean) would also write the
# screen mean into missing values of any other column.
valid_ds_filling_mean = valid_ds.fillna({'screen': mean})
train_ds_filling_mean = train_ds.fillna({'screen': mean})

In [74]:
# Fit on the mean-filled training features, then score on the mean-filled validation features.
w_0, w = linear_regression(train_ds_filling_mean, y_train)
y_pred_val = valid_ds_filling_mean @ w + w_0
np.round(rmse(y_val, y_pred_val), 2)

0.43

## Question 4

In [75]:
def regularization_linear_regression(X, y, r):
    """Fit ridge-style regularized linear regression via the normal equations.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix without a bias column; a column of ones is prepended here.
    y : array-like of shape (n_samples,)
        Target values.
    r : float
        Regularization strength. It is added to every diagonal entry of
        ``X^T X``, including the entry that belongs to the intercept.

    Returns
    -------
    tuple
        ``(w_0, w)``: the intercept and the array of per-feature weights.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the regularized matrix is singular.
    """
    X = np.asarray(X)
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])
    # Solve the linear system directly; forming the explicit inverse is slower
    # and numerically less stable.
    w = np.linalg.solve(XTX, X.T.dot(y))

    return w[0], w[1:]

In [76]:
# Sweep the regularization strength and report the validation RMSE and intercept for each value.
r_list = [0, 0.01, 0.1, 1, 5, 10, 100]
for r in r_list:
    w_0, w = regularization_linear_regression(train_ds_filling_mean, y_train, r)
    y_pred_val = valid_ds_filling_mean @ w + w_0
    score = np.round(rmse(y_val, y_pred_val), 2)
    print(r, score, w_0)

0 0.43 6.855566544134909
0.01 0.43 6.84671294871298
0.1 0.43 6.768052054759943
1 0.43 6.070945624525375
5 0.46 4.16800800981948
10 0.5 2.998775295987043
100 0.61 0.5168386904731157


## Question 5

In [77]:
seeds_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

# Recompute the split sizes here so this cell does not rely on values left
# over from an earlier cell.
n = len(laptop_df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

rmse_list = []
for seed in seeds_list:
    # Reshuffle the row positions with this seed.
    idx = np.arange(n)
    np.random.seed(seed)
    np.random.shuffle(idx)

    # Only the train and validation parts are needed for this question.
    train_ds = laptop_df.iloc[idx[:n_train]].reset_index(drop=True)
    valid_ds = laptop_df.iloc[idx[n_train:n_train + n_val]].reset_index(drop=True)

    # NOTE(review): unlike the Question 3/4 cells, the target stays in raw
    # price units here (no log1p) - confirm this is intended.
    y_train = train_ds.final_price.values
    y_val = valid_ds.final_price.values

    train_ds = train_ds.drop(columns='final_price')
    valid_ds = valid_ds.drop(columns='final_price')

    valid_ds_filling_zero = valid_ds.fillna(0)
    train_ds_filling_zero = train_ds.fillna(0)

    w_0, w = linear_regression(train_ds_filling_zero, y_train)
    y_pred_val = w_0 + valid_ds_filling_zero.dot(w)
    # Store the score at full precision: rounding each RMSE before taking the
    # standard deviation skews the aggregated result.
    rmse_value = rmse(y_val, y_pred_val)
    rmse_list.append(rmse_value)

In [78]:
# Show the per-seed scores, then their spread rounded to 3 decimals.
print(rmse_list)
np.std(rmse_list).round(3)

[565.452, 636.799, 588.956, 597.815, 571.963, 573.238, 647.344, 550.44, 587.334, 576.102]


29.177

## Question 6

In [79]:
n = len(laptop_df)

# 20% each for validation and test (rounded down); training takes the remainder.
n_val = n_test = int(n * 0.2)
n_train = n - n_val - n_test

np.random.seed(9)
idx = np.arange(n)
np.random.shuffle(idx)

train_ds = laptop_df.iloc[idx[:n_train]]
valid_ds = laptop_df.iloc[idx[n_train:n_train + n_val]]
test_ds = laptop_df.iloc[idx[n_train + n_val:]].reset_index(drop=True)

# The final model is fit on train + validation combined, with a fresh 0..len-1 index.
train_ds = pd.concat([train_ds, valid_ds]).reset_index(drop=True)

# NOTE(review): the target stays in raw price units here (no log1p), unlike
# the Question 3/4 cells - confirm this is intended.
# pop() returns the column and removes it from the frame in place.
y_train = train_ds.pop('final_price').values
y_test = test_ds.pop('final_price').values

train_ds_filling_zero = train_ds.fillna(0)
test_ds_filling_zero = test_ds.fillna(0)

w_0, w = regularization_linear_regression(train_ds_filling_zero, y_train, 0.001)
# Despite the name, these are predictions for the test split.
y_pred_val = test_ds_filling_zero @ w + w_0

print(np.round(rmse(y_test, y_pred_val), 2))

608.61
