### 2.1 Import libraries

In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as srn

## 2.2 Read and explore data

In [43]:
df = pd.read_csv('laptops.csv')
df.head()
columns = list(df.columns)
df.shape
for col in columns:
    print(f'Unique Count [{col}]:', df[col].unique().shape[0], '|', f'Null values: [{col}]', df[col].isnull().sum())
df['Storage type'].value_counts()
df['GPU'].value_counts()
df['Screen'].value_counts()


Unique Count [Laptop]: 2160 | Null values: [Laptop] 0
Unique Count [Status]: 2 | Null values: [Status] 0
Unique Count [Brand]: 27 | Null values: [Brand] 0
Unique Count [Model]: 121 | Null values: [Model] 0
Unique Count [CPU]: 28 | Null values: [CPU] 0
Unique Count [RAM]: 9 | Null values: [RAM] 0
Unique Count [Storage]: 12 | Null values: [Storage] 0
Unique Count [Storage type]: 3 | Null values: [Storage type] 42
Unique Count [GPU]: 45 | Null values: [GPU] 1371
Unique Count [Screen]: 30 | Null values: [Screen] 4
Unique Count [Touch]: 2 | Null values: [Touch] 0
Unique Count [Final Price]: 1440 | Null values: [Final Price] 0


Screen
15.60    1009
14.00     392
16.00     174
17.30     161
13.30     131
16.10      48
17.00      33
13.00      27
15.00      21
13.40      19
13.50      19
11.60      16
14.20      14
12.30      13
14.10      11
13.60      11
16.20      10
15.30       8
10.50       7
12.40       6
14.40       6
15.40       5
12.00       4
18.00       3
14.50       3
13.90       2
12.50       1
10.95       1
10.10       1
Name: count, dtype: int64

In [44]:
df.columns = df.columns.str.lower().str.replace(' ', '_')
df.columns

Index(['laptop', 'status', 'brand', 'model', 'cpu', 'ram', 'storage',
       'storage_type', 'gpu', 'screen', 'touch', 'final_price'],
      dtype='object')

In [45]:
df = df[['ram','storage','screen','final_price']]
df.ram.describe()

count    2160.000000
mean       15.413889
std         9.867815
min         4.000000
25%         8.000000
50%        16.000000
75%        16.000000
max       128.000000
Name: ram, dtype: float64

### Shuffle Dataset

In [75]:
ind = np.arange(df.shape[0])
np.random.seed(9)
np.random.shuffle(ind)
df = df.loc[ind]
shuffled_df = df.reset_index(drop=True)
target = np.log1p(shuffled_df.final_price)
features = shuffled_df.drop('final_price', axis=1)
length = shuffled_df.shape[0]
val_len = test_len = int(0.2 * length)
train_len =  length - val_len - test_len
features
train_df = features.iloc[:train_len]
train_target = target.iloc[:train_len]
val_df = features.iloc[train_len:train_len + val_len]
val_target = target.iloc[train_len:train_len + val_len]
test_df = features.iloc[train_len + val_len:train_len + val_len + test_len]
test_target = target.iloc[train_len + val_len:train_len + val_len + test_len]
train_df = train_df.reset_index(drop=True)
train_target = train_target.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
val_target = val_target.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
test_target = test_target.reset_index(drop=True)

In [76]:
def prepare_using_mean(df):
    df =  df.copy()
    columns = ['ram','storage','screen']
    for col in columns:
        df[col] = df[col].fillna(df[col].mean())
    return df.values


def prepare_using_null(df):
    df =  df.copy()
    df = df.fillna(0)
    return df.values


def train_model(X, y, r=0.1):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones,X])
    XTX = X.T.dot(X)
    XTX = XTX + np.eye(XTX.shape[0]) * r
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    return w[0], w[1:]
    

def rmse(y_predict,y_target):
    error_sqr = (y_target - y_predict) ** 2
    mean = error_sqr.mean()
    return np.sqrt(mean)

    
    

In [77]:
X_train1 = prepare_using_mean(train_df)
X_train2 = prepare_using_null(train_df)
w0_1, w_1 = train_model(X_train1, train_target)
w0_2, w_2 = train_model(X_train2, train_target)
X_val1 = prepare_using_mean(val_df)
X_val2 = prepare_using_null(val_df)
val_predict1 = w0_1 + X_val1.dot(w_1)
val_predict2 = w0_2 + X_val2.dot(w_2)
rmse(val_predict1, val_target), rmse(val_predict2, val_target)

(np.float64(0.40821572942762985), np.float64(0.4086976364337416))

### Regularization

In [78]:
for r in [0, 0.01, 0.1, 1, 5, 10, 100]:
    X_train = prepare_using_null(train_df)
    w0, w = train_model(X_train, train_target,r)
    X_val = prepare_using_null(val_df)
    val_predict = w0 + X_val1.dot(w)
    print(rmse(val_predict, val_target))

0.4076787526157211
0.4077373365059483
0.4083011489068823
0.41667958170305036
0.46846867415246296
0.5179916877129596
0.6544364872359453
