In [1]:
import numpy as np

import pandas as pd

In [2]:
df = pd.read_csv('laptops.csv')

In [3]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2160 entries, 0 to 2159
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   laptop        2160 non-null   object 
 1   status        2160 non-null   object 
 2   brand         2160 non-null   object 
 3   model         2160 non-null   object 
 4   cpu           2160 non-null   object 
 5   ram           2160 non-null   int64  
 6   storage       2160 non-null   int64  
 7   storage_type  2118 non-null   object 
 8   gpu           789 non-null    object 
 9   screen        2156 non-null   float64
 10  touch         2160 non-null   object 
 11  final_price   2160 non-null   float64
dtypes: float64(2), int64(2), object(8)
memory usage: 202.6+ KB


In [5]:
df = df[['ram', 'storage', 'screen', 'final_price']]

In [6]:
df['final_price'].describe()

count    2160.000000
mean     1312.638509
std       911.475417
min       201.050000
25%       661.082500
50%      1031.945000
75%      1708.970000
max      7150.470000
Name: final_price, dtype: float64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2160 entries, 0 to 2159
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ram          2160 non-null   int64  
 1   storage      2160 non-null   int64  
 2   screen       2156 non-null   float64
 3   final_price  2160 non-null   float64
dtypes: float64(2), int64(2)
memory usage: 67.6 KB


In [8]:
df.isnull().sum()

ram            0
storage        0
screen         4
final_price    0
dtype: int64

In [9]:
df['ram'].median()

16.0

In [10]:
np.random.seed(42)

n = len(df)

n_val = int(0.2 * n)

n_test = int(0.2 * n)

n_train = n - (n_val + n_test)

idx = np.arange(n)

np.random.shuffle(idx)

df_shuffled = df.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()

df_val = df_shuffled.iloc[n_train: n_train + n_val].copy()

df_test = df_shuffled.iloc[n_train + n_val: ].copy()

y_train = df_train['final_price'].values

y_val = df_val['final_price'].values

In [11]:
def prepare_X(df):

    df_num = df[['ram', 'storage', 'screen']]

    df_num = df_num.fillna(0)

    X = df_num.values

    return X

In [12]:
def train_linear_regression(X, y):

    ones = np.ones(X.shape[0])

    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)

    XTX_inv = np.linalg.inv(XTX)

    w = XTX_inv.dot(X.T).dot(y)

    return w[0], w[1:]

In [13]:
X_train = prepare_X(df_train)

w_0, w = train_linear_regression(X_train, y_train)

In [14]:
X_val = prepare_X(df_val)

y_pred = w_0 + X_val.dot(w)

In [15]:
def rmse(y, y_pred):

    error = y_pred - y

    mse = (error ** 2).mean()

    return round(np.sqrt(mse), 2)

In [16]:
rmse(y_val, y_pred)

597.36

In [17]:
mean = df_train['screen'].mean()

In [18]:
def prepare_X2(df):

    df_num = df[['ram', 'storage', 'screen']]

    df_num = df_num.fillna(mean)

    X = df_num.values

    return X

In [19]:
X_train2 = prepare_X2(df_train)

w_0, w = train_linear_regression(X_train2, y_train)

In [20]:
X_val2 = prepare_X2(df_val)

y_pred2 = w_0 + X_val2.dot(w)

In [21]:
rmse(y_val, y_pred2)

600.27

In [22]:
def train_linear_regression_reg(X, y, r = 0.0):

    ones = np.ones(X.shape[0])

    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)

    reg = r * np.eye(XTX.shape[0])

    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)

    w = XTX_inv.dot(X.T).dot(y)

    return w[0], w[1:]

In [23]:
 X_train3 = prepare_X(df_train)

 X_val3 = prepare_X(df_val)

In [24]:
r = [0, 0.01, 0.1, 1, 5, 10, 100]

for i in r:

    w_0, w = train_linear_regression_reg(X_train3, y_train, r = i)

    y_pred3 = w_0 + X_val3.dot(w)

    rmse_values = rmse(y_val, y_pred3)

    print(f' for r = {i}, rmse = {rmse_values}')

 for r = 0, rmse = 597.36
 for r = 0.01, rmse = 597.36
 for r = 0.1, rmse = 597.35
 for r = 1, rmse = 597.21
 for r = 5, rmse = 597.01
 for r = 10, rmse = 597.06
 for r = 100, rmse = 597.9


In [25]:
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [26]:
rmse_list = []

for i in seeds:

    np.random.seed(i)
    
    np.random.shuffle(idx)
    
    df_shuffled = df.iloc[idx]
    
    df_train = df_shuffled.iloc[:n_train].copy()
    
    df_val = df_shuffled.iloc[n_train: n_train + n_val].copy()
    
    df_test = df_shuffled.iloc[n_train + n_val: ].copy()
    
    y_train = df_train['final_price'].values
    
    y_val = df_val['final_price'].values
    
    X_train = prepare_X(df_train)

    w_0, w = train_linear_regression(X_train, y_train)
    
    X_val = prepare_X(df_val)

    y_pred = w_0 + X_val.dot(w)

    rmse_value = rmse(y_val, y_pred)

    rmse_list.append(rmse_value)

In [27]:
rmse_list

[650.09, 610.85, 554.12, 584.52, 582.52, 623.52, 608.45, 563.1, 574.91, 591.61]

In [28]:
import statistics

std_dev = round(statistics.stdev(rmse_list), 3)

In [29]:
std_dev

29.147

In [30]:
np.random.seed(9)

np.random.shuffle(idx)
    
df_shuffled = df.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()

df_val = df_shuffled.iloc[n_train: n_train + n_val].copy()

df_test = df_shuffled.iloc[n_train + n_val: ].copy()

df_train = pd.concat([df_train, df_val], axis = 0)

y_train = df_train['final_price'].values

y_test = df_test['final_price'].values

X_train = prepare_X(df_train)

w_0, w = train_linear_regression_reg(X_train, y_train, r = 0.001)

X_test = prepare_X(df_test)

y_pred = w_0 + X_test.dot(w)

rmse_value = rmse(y_test, y_pred)

print(rmse_value)

581.04
