In [7]:
import pandas as pd

data = pd.read_csv('data2.csv')

data.columns = data.columns.str.lower().str.replace(' ','_')

df = data[['ram', 'storage', 'screen', 'final_price']]

df.shape

(2160, 4)

Question 1
There's one column with missing values. What is it?

- 'ram'
- 'storage'
- 'screen' X
- 'final_price'

In [9]:
df.isnull().sum()

ram            0
storage        0
screen         4
final_price    0
dtype: int64

Question 2
What's the median (50% percentile) for variable 'ram'?

- 8
- 16 X
- 24
- 32

In [10]:
df['ram'].median()

np.float64(16.0)

Question 3
- We need to deal with missing values for the column from Q1.
- We have two options: fill it with 0 or with the mean of this variable.
- Try both options. For each, train a linear regression model without regularization using the code from the lessons.
- For computing the mean, use the training only!
- Use the validation dataset to evaluate the models and compare the RMSE of each option.
- Round the RMSE scores to 2 decimal digits using round(score, 2)
- Which option gives better RMSE?

Options:

- With 0
- With mean
- Both are equally good X

In [48]:
import numpy as np

n = len(df)

n_val = int(n*0.2)
n_test = int(n*0.2)
n_train = n - n_val - n_test

idx = np.arange(n)

np.random.seed(42)
np.random.shuffle(idx)

df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train+n_val]]
df_test = df.iloc[idx[n_train+n_val:]]

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [55]:
X_train = df_train.drop(columns=['final_price'])
y_train = df_train['final_price']

# 597.36 with 0 and X_train['screen'].mean
X_train.fillna(0, inplace=True)


X_val = df_val.drop(columns=['final_price'])
y_val = df_val['final_price']

X_test = df_test.drop(columns=['final_price'])
y_test = df_test['final_price']

X_train.isnull().sum()

ram        0
storage    0
screen     0
dtype: int64

In [56]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

model.fit(X_train, y_train)

y_pred_val = model.predict(X_val)

rmse_val = np.sqrt(np.mean((y_val - y_pred_val) ** 2)).round(2)

rmse_val


np.float64(597.36)

Question 4
- Now let's train a regularized linear regression.
- For this question, fill the NAs with 0.
- Try different values of r from this list: [0, 0.01, 0.1, 1, 5, 10, 100].
- Use RMSE to evaluate the model on the validation dataset.
- Round the RMSE scores to 2 decimal digits.
- Which r gives the best RMSE?
- If there are multiple options, select the smallest r.

Options:

- 0
- 0.01
- 1
- 10
- 100 X

In [58]:
from sklearn.linear_model import Ridge
r_list = [0, 0.01, 0.1, 1, 5, 10, 100]
for r in r_list:
    ridge_model = Ridge(alpha=r)

    ridge_model.fit(X_train, y_train)

    y_pred_ridge_val = ridge_model.predict(X_val)

    rmse_val = np.sqrt(np.mean((y_val - y_pred_ridge_val) ** 2)).round(2)
    
    print(rmse_val, r)

597.36 0
597.36 0.01
597.36 0.1
597.36 1
597.36 5
597.36 10
597.34 100


Question 5
- We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
- Try different seed values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].
- For each seed, do the train/validation/test split with 60%/20%/20% distribution.
- Fill the missing values with 0 and train a model without regularization.
- For each seed, evaluate the model on the validation dataset and collect the RMSE scores.
- What's the standard deviation of all the scores? To compute the standard deviation, use np.std.
- Round the result to 3 decimal digits (round(std, 3))

What's the value of std?

- 19.176
- 29.176 X (33.49)
- 39.176
- 49.176

In [60]:
import numpy as np

seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

rmse_list = []

for seed in seeds:
    np.random.seed(seed)
    np.random.shuffle(idx)

    df_train = df.iloc[idx[:n_train]]
    df_val = df.iloc[idx[n_train:n_train+n_val]]
    df_test = df.iloc[idx[n_train+n_val:]]

    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    X_train = df_train.drop(columns=['final_price'])
    y_train = df_train['final_price']

    X_train.fillna(0, inplace=True)


    X_val = df_val.drop(columns=['final_price'])
    y_val = df_val['final_price']

    X_test = df_test.drop(columns=['final_price'])
    y_test = df_test['final_price']

    model = LinearRegression()

    model.fit(X_train, y_train)

    y_pred_val = model.predict(X_val)

    rmse_list.append(np.sqrt(np.mean((y_val - y_pred_val) ** 2)))

print(np.std(rmse_list).round(3))



33.49


Question 6
- Split the dataset like previously, use seed 9.
- Combine train and validation datasets.
- Fill the missing values with 0 and train a model with r=0.001.
- What's the RMSE on the test dataset?

Options:

- 598.60 X (581.23)
- 608.60
- 618.60
- 628.60

In [64]:
np.random.seed(9)
np.random.shuffle(idx)

df_train = df.iloc[idx[:n_train]]
df_test = df.iloc[idx[n_train:]]

df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

X_train = df_train.drop(columns=['final_price'])
y_train = df_train['final_price']

X_train.fillna(0, inplace=True)

X_test = df_test.drop(columns=['final_price'])
y_test = df_test['final_price']

In [65]:
ridge_model = Ridge(alpha=0.001)

ridge_model.fit(X_train, y_train)

y_pred_ridge_test = ridge_model.predict(X_test)

rmse_val = np.sqrt(np.mean((y_test - y_pred_ridge_test) ** 2)).round(2)

rmse_val

np.float64(581.23)