In [38]:
import pandas as pd
import numpy as np

In [39]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [40]:
# Read the laptops CSV directly from its raw GitHub URL (needs network access).
url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv'
df = pd.read_csv(url)

In [41]:
# Display the freshly loaded frame to check its columns and a sample of rows.
df

Unnamed: 0,Laptop,Status,Brand,Model,CPU,RAM,Storage,Storage type,GPU,Screen,Touch,Final Price
0,ASUS ExpertBook B1 B1502CBA-EJ0436X Intel Core...,New,Asus,ExpertBook,Intel Core i5,8,512,SSD,,15.6,No,1009.00
1,Alurin Go Start Intel Celeron N4020/8GB/256GB ...,New,Alurin,Go,Intel Celeron,8,256,SSD,,15.6,No,299.00
2,ASUS ExpertBook B1 B1502CBA-EJ0424X Intel Core...,New,Asus,ExpertBook,Intel Core i3,8,256,SSD,,15.6,No,789.00
3,MSI Katana GF66 12UC-082XES Intel Core i7-1270...,New,MSI,Katana,Intel Core i7,16,1000,SSD,RTX 3050,15.6,No,1199.00
4,HP 15S-FQ5085NS Intel Core i5-1235U/16GB/512GB...,New,HP,15S,Intel Core i5,16,512,SSD,,15.6,No,669.01
...,...,...,...,...,...,...,...,...,...,...,...,...
2155,Razer Blade 17 FHD 360Hz Intel Core i7-11800H/...,Refurbished,Razer,Blade,Intel Core i7,16,1000,SSD,RTX 3060,17.3,No,2699.99
2156,Razer Blade 17 FHD 360Hz Intel Core i7-11800H/...,Refurbished,Razer,Blade,Intel Core i7,16,1000,SSD,RTX 3070,17.3,No,2899.99
2157,Razer Blade 17 FHD 360Hz Intel Core i7-11800H/...,Refurbished,Razer,Blade,Intel Core i7,32,1000,SSD,RTX 3080,17.3,No,3399.99
2158,Razer Book 13 Intel Evo Core i7-1165G7/16GB/1T...,Refurbished,Razer,Book,Intel Evo Core i7,16,1000,SSD,,13.4,Yes,1899.99


In [42]:
# Normalize the headers: lowercase everything and replace spaces with underscores
# (e.g. 'Final Price' -> 'final_price').
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [43]:
# Keep only the three feature columns and the target ('final_price') used below.
columns = ['ram', 'storage', 'screen', 'final_price']
df = df[columns]

### Q1

In [44]:
# Q1: count missing values per column to see which features need imputation.
missing_values = df.isnull().sum()
print(missing_values)

ram            0
storage        0
screen         4
final_price    0
dtype: int64


### Q2

In [45]:
# Q2: median of the 'ram' column over the whole dataset (computed before any split).
median_ram = df['ram'].median()
print(median_ram)

16.0


In [46]:
# Shuffle all rows with a fixed seed and renumber the index from 0.
# NOTE(review): this rebinds `df` to a permutation of itself, so running the cell a
# second time without reloading the data yields a different row order than one run.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [47]:
# Two-stage split: hold out 40% of the rows, then halve that hold-out into validation
# and test, giving 60% / 20% / 20% overall.
df_train, df_temp = train_test_split(df, test_size=0.4, random_state=42)
df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=42)

In [48]:
# Sanity-check the number of rows in each split.
print(f'Training set size: {len(df_train)}')
print(f'Validation set size: {len(df_val)}')
print(f'Test set size: {len(df_test)}')

Training set size: 1296
Validation set size: 432
Test set size: 432


In [49]:
def prepare_X_y(df, fill_value=0):
    """Split a frame into the feature matrix and the target vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'ram', 'storage', 'screen' and 'final_price'.
        The frame itself is not modified.
    fill_value : scalar, dict or pandas.Series, default 0
        Replacement for missing feature values, forwarded unchanged to
        ``DataFrame.fillna``. Pass e.g. the training-set column means to get
        mean imputation instead of the default zero imputation.

    Returns
    -------
    X : pandas.DataFrame
        The three feature columns with missing values replaced.
    y : pandas.Series
        The 'final_price' column.
    """
    X = df[['ram', 'storage', 'screen']].fillna(fill_value)
    y = df['final_price']
    return X, y

In [50]:
def rmse(y_true, y_pred):
    """Root mean squared error between the true targets and the predictions."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [51]:
# Unregularised linear regression; this same object is fitted by the cells that follow.
model = LinearRegression()

In [52]:
# Baseline: impute missing feature values with 0. Both splits go through the shared
# helper so train and validation are guaranteed to be prepared the same way.
X_train, y_train = prepare_X_y(df_train)
X_val, y_val = prepare_X_y(df_val)

model.fit(X_train, y_train)
y_pred_0 = model.predict(X_val)
# Rounded to 2 decimals because only the reported figure is compared later on.
rmse_0 = round(rmse(y_val, y_pred_0), 2)

In [54]:
# Mean imputation: the column means come from the training split only and are then
# applied to both splits, so no validation data is used to derive the fill values.
mean_values = df_train[['ram', 'storage', 'screen']].mean()
X_train_mean = df_train[['ram', 'storage', 'screen']].fillna(mean_values)
X_val_mean = df_val[['ram', 'storage', 'screen']].fillna(mean_values)

In [56]:
# Refit the same `model` object on the mean-imputed features (this replaces the fit
# obtained with zero imputation) and score it on the validation split.
model.fit(X_train_mean, y_train)
y_pred_mean = model.predict(X_val_mean)
rmse_mean = round(rmse(y_val, y_pred_mean), 2)

### Q3

In [57]:
# Q3: compare the two imputation strategies on the validation split (rounded RMSE).
print(f'RMSE when filling with 0: {rmse_0}')
print(f'RMSE when filling with mean: {rmse_mean}')

RMSE when filling with 0: 675.08
RMSE when filling with mean: 675.16


### Q4

In [63]:
from sklearn.linear_model import Ridge

In [64]:
# Candidate regularisation strengths (used as Ridge's `alpha` in the search loop), in
# ascending order, plus trackers for the best validation result found so far.
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]
best_rmse = float('inf')
best_r = None

In [69]:
# Reset the trackers inside the cell so that re-running it (possibly after later cells
# have rebound X_train / X_val) never compares against a stale best score.
best_rmse = float('inf')
best_r = None

for r in r_values:
    # scikit-learn advises against Ridge(alpha=0) for numerical reasons and recommends
    # LinearRegression for the unregularised case.
    model = LinearRegression() if r == 0 else Ridge(alpha=r)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    current_rmse = round(rmse(y_val, y_pred), 2)

    print(f'RMSE for r={r}: {current_rmse}')

    # Strict '<' on the rounded score keeps the first candidate among ties, which is
    # the smallest r because r_values is listed in ascending order.
    if current_rmse < best_rmse:
        best_rmse = current_rmse
        best_r = r

# Report the selection explicitly instead of leaving it implicit in the log above.
print(f'Best r: {best_r} (RMSE={best_rmse})')

RMSE for r=0: 675.08
RMSE for r=0.01: 675.08
RMSE for r=0.1: 675.08
RMSE for r=1: 675.08
RMSE for r=5: 675.08
RMSE for r=10: 675.08
RMSE for r=100: 675.01


### Q5

In [71]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [73]:
# NOTE(review): identical to the `rmse` helper defined in an earlier cell; presumably
# repeated so this section can be run on its own. Candidate for removal.
def rmse(y_true, y_pred):
    """Return the square root of the mean squared error between y_true and y_pred."""
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [74]:
# Seeds used to repeat the split, and the list that collects one validation RMSE per seed.
seed_values = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
rmse_scores = []

In [76]:
# Start from an empty list inside the cell so that re-running it cannot append to the
# scores collected by a previous run.
rmse_scores = []

for seed in seed_values:
    # 60% train / 20% validation / 20% test, with the same seed for both stages
    df_train, df_temp = train_test_split(df, test_size=0.4, random_state=seed)
    df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=seed)

    # Zero-impute missing features through the shared helper (same preparation as the
    # baseline cell)
    X_train, y_train = prepare_X_y(df_train)
    X_val, y_val = prepare_X_y(df_val)

    # Linear regression without regularisation
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    # Keep the unrounded RMSE; rounding happens once, on the final standard deviation
    current_rmse = rmse(y_val, y_pred)
    rmse_scores.append(current_rmse)

In [78]:
# Calculate the standard deviation of the RMSE scores across seeds.
# np.std defaults to ddof=0, i.e. the population standard deviation.
std_rmse = round(np.std(rmse_scores), 3)
print(f'Standard deviation of RMSE scores: {std_rmse}')

Standard deviation of RMSE scores: 29.442


### Q6

In [88]:
# Q6: redo the 60% / 20% / 20% split with seed 9 for both stages.
df_train, df_temp = train_test_split(df, test_size=0.4, random_state=9)
df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=9)

In [89]:
# Stack train and validation rows into one frame for the final fit (original index
# labels are kept as they are).
df_train_val = pd.concat([df_train, df_val])

In [90]:
# Zero-impute missing features for both the combined train+validation frame and the
# held-out test frame; the target is 'final_price'.
X_train_val = df_train_val[['ram', 'storage', 'screen']].fillna(0)
y_train_val = df_train_val['final_price']
X_test = df_test[['ram', 'storage', 'screen']].fillna(0)
y_test = df_test['final_price']

In [91]:
# Train a Ridge regression model with r=0.001 (passed as `alpha`) on train+validation
model = Ridge(alpha=0.001)
model.fit(X_train_val, y_train_val)

# Predict on the held-out test set
y_pred_test = model.predict(X_test)

# Calculate RMSE on the test set, rounded to 2 decimals for reporting
rmse_test = round(rmse(y_test, y_pred_test), 2)

# Print the RMSE
print(f'RMSE on the test set: {rmse_test}')

RMSE on the test set: 537.15


In [87]:
# NOTE(review): this cell carries execution count In [87] but sits after In [91], and
# it repeats the predict / RMSE / print steps of the cell directly above it. It looks
# like a leftover from out-of-order execution and is a candidate for removal.
y_pred_test = model.predict(X_test)

# Calculate RMSE on the test set
rmse_test = round(rmse(y_test, y_pred_test), 2)

# Print the RMSE
print(f'RMSE on the test set: {rmse_test}')

RMSE on the test set: 537.15


In [92]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Load the dataset again from its raw GitHub URL (needs network access); this rebinds
# `df` and discards whatever earlier cells did to it.
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv"
df = pd.read_csv(url)

# Normalize column names: lowercase, spaces replaced by underscores
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Select the required columns (three features plus the 'final_price' target)
columns = ['ram', 'storage', 'screen', 'final_price']
df = df[columns]

# RMSE helper: square root of scikit-learn's mean squared error
def rmse(y_true, y_pred):
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)

# Split the dataset into 60% train, 20% validation, and 20% test sets using seed 9
# NOTE(review): this cell works on a freshly reloaded `df` and therefore skips the
# `df.sample(frac=1, random_state=42)` shuffle applied earlier in the notebook, so the
# seed-9 split selects different rows than the earlier Q6 cells. That is presumably why
# the RMSE recorded here (602.43) differs from theirs (537.15); confirm which pipeline
# is the intended one.
df_train, df_temp = train_test_split(df, test_size=0.4, random_state=9)
df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=9)

# Combine train and validation datasets
df_train_val = pd.concat([df_train, df_val])

# Prepare input features (X) and target variable (y); missing features are filled with 0
X_train_val = df_train_val[['ram', 'storage', 'screen']].fillna(0)
y_train_val = df_train_val['final_price']
X_test = df_test[['ram', 'storage', 'screen']].fillna(0)
y_test = df_test['final_price']

# Train a Ridge regression model with r=0.001 (passed as `alpha`)
model = Ridge(alpha=0.001)
model.fit(X_train_val, y_train_val)

# Predict on the test set
y_pred_test = model.predict(X_test)

# Calculate RMSE on the test set, rounded to 2 decimals for reporting
rmse_test = round(rmse(y_test, y_pred_test), 2)

# Print the RMSE
print(f'RMSE on the test set: {rmse_test}')


RMSE on the test set: 602.43
