In [7]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [8]:
# Load the laptops dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# only runs where laptops.csv exists at exactly this location. Consider a path
# relative to the notebook or a configurable data directory.
df = pd.read_csv('C:\\Users\\Muhammad Elbaklishy\\Downloads\\laptops.csv')

In [9]:
# Normalise column names: lower-case them, then turn spaces into underscores
lowered_names = df.columns.str.lower()
df.columns = lowered_names.str.replace(' ', '_')

In [10]:
# Keep only the three predictors and the target column
selected_columns = ['ram', 'storage', 'screen', 'final_price']
df = df[selected_columns]

In [11]:
# Count the missing entries in every column
missing_values = df.isna().sum()
print(missing_values)

ram            0
storage        0
screen         4
final_price    0
dtype: int64


In [12]:
# Median amount of RAM over the whole dataset
ram_values = df['ram']
median_ram = ram_values.median()
print("Median RAM:", median_ram)

Median RAM: 16.0


In [13]:
# Shuffle the rows reproducibly, then carve out train / validation / test parts.
# Validation and test each take 20% of the rows; training gets the remainder.
n = len(df)
n_val = n_test = int(0.2 * n)
n_train = n - n_val - n_test

np.random.seed(42)
idx = np.arange(n)
np.random.shuffle(idx)
df_shuffled = df.iloc[idx]

val_end = n_train + n_val
df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:val_end].copy()
df_test = df_shuffled.iloc[val_end:].copy()

In [14]:
# Imputation variant 1: replace every missing value with 0
df_train_zero, df_val_zero = df_train.fillna(0), df_val.fillna(0)

# Feature matrix and target vector for the zero-filled training split
train_features_zero = df_train_zero[['ram', 'storage', 'screen']]
X_train_zero = train_features_zero.values
y_train_zero = df_train_zero['final_price'].values

# Function to train linear regression without regularization
def train_linear_regression(X, y):
    """Fit ordinary least squares via the normal equations.

    A column of ones is prepended to ``X`` so the model learns an intercept.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        Feature matrix without a bias column.
    y : numpy array with n_samples entries along its first axis
        Target values.

    Returns
    -------
    tuple
        ``(w_0, w)``: the intercept and the array of per-feature weights.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``X^T X`` is singular (e.g. perfectly collinear features).
    """
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    # Solve (X^T X) w = X^T y directly; this avoids forming an explicit inverse,
    # which is slower and less accurate numerically.
    w = np.linalg.solve(XTX, X.T.dot(y))

    return w[0], w[1:]

# Fit the model on the zero-filled training data
w_0_zero, w_zero = train_linear_regression(X_train_zero, y_train_zero)

# Predict on the zero-filled validation data and measure the RMSE
X_val_zero = df_val_zero[['ram', 'storage', 'screen']].values
y_pred_zero = w_0_zero + X_val_zero.dot(w_zero)
residuals_zero = df_val_zero['final_price'].values - y_pred_zero
rmse_zero = np.sqrt(np.mean(residuals_zero ** 2))

# Fill with mean
# 'screen' is the column with missing values, so it must be imputed with the
# mean of 'screen' itself (not the mean of 'ram'). The mean is computed on the
# training split only, so nothing leaks from the validation data.
mean_screen = df_train['screen'].mean()
df_train_mean = df_train.fillna({'screen': mean_screen})
df_val_mean = df_val.fillna({'screen': mean_screen})

X_train_mean = df_train_mean[['ram', 'storage', 'screen']].values
y_train_mean = df_train_mean['final_price'].values

# Train with mean-filled data
w_0_mean, w_mean = train_linear_regression(X_train_mean, y_train_mean)

# Make predictions on the mean-filled validation data and compute the RMSE
X_val_mean = df_val_mean[['ram', 'storage', 'screen']].values
y_pred_mean = w_0_mean + X_val_mean.dot(w_mean)
rmse_mean = np.sqrt(np.mean((df_val_mean['final_price'].values - y_pred_mean) ** 2))

# Report the validation error of both imputation strategies
print(f"RMSE with 0: {round(rmse_zero, 2)}")
print(f"RMSE with mean: {round(rmse_mean, 2)}")

# The strategy with the lower validation RMSE wins; equal scores are a tie
if rmse_zero < rmse_mean:
    best_option = "With 0"
elif rmse_zero > rmse_mean:
    best_option = "With mean"
else:
    best_option = "Both are equally good"
print(f"Best option: {best_option}")

RMSE with 0: 597.36
RMSE with mean: 600.36
Best option: With 0


In [15]:
def train_linear_regression_reg(X, y, r=0.0):
    """Fit ridge-regularized linear regression via the normal equations.

    A column of ones is prepended to ``X`` so the model learns an intercept.
    ``r`` is added to every diagonal entry of ``X^T X``, so the intercept is
    regularized together with the feature weights.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        Feature matrix without a bias column.
    y : numpy array with n_samples entries along its first axis
        Target values.
    r : float, default 0.0
        Regularization strength; 0 gives plain least squares.

    Returns
    -------
    tuple
        ``(w_0, w)``: the intercept and the array of per-feature weights.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the regularized ``X^T X`` is singular.
    """
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    # Solve (X^T X + r I) w = X^T y directly; this avoids forming an explicit
    # inverse, which is slower and less accurate numerically.
    w = np.linalg.solve(XTX, X.T.dot(y))

    return w[0], w[1:]

# Sweep the regularization strength and report the validation RMSE for each value
print("\nRegularized Linear Regression Results:")
y_val_zero = df_val_zero["final_price"].values
for r in (0, 0.01, 0.1, 1, 5, 10, 100):
    w_0, w = train_linear_regression_reg(X_train_zero, y_train_zero, r=r)

    # Validation predictions and their root-mean-squared error
    y_pred = w_0 + X_val_zero.dot(w)
    rmse = np.sqrt(np.mean((y_val_zero - y_pred) ** 2))
    print(f'r={r}, RMSE={rmse:.2f}')


Regularized Linear Regression Results:
r=0, RMSE=597.36
r=0.01, RMSE=597.36
r=0.1, RMSE=597.35
r=1, RMSE=597.21
r=5, RMSE=597.01
r=10, RMSE=597.06
r=100, RMSE=597.90


In [16]:
# Measure how sensitive the validation RMSE is to the random split: repeat the
# shuffle / split / train / evaluate cycle for seeds 0..9 and report the
# standard deviation of the resulting scores.
rmse_scores = []

# Split sizes do not depend on the seed, so they are computed once
n = len(df)
n_train = int(0.6 * n)  # 60% for training
n_val = int(0.2 * n)    # 20% for validation

for seed in range(10):
    # Shuffle the dataset with the given seed
    np.random.seed(seed)
    idx = np.arange(n)
    np.random.shuffle(idx)
    df_shuffled = df.iloc[idx]

    # Only the train and validation parts are needed here. Loop-local names are
    # used so the notebook-level df_train / df_val / df_test are not overwritten.
    # Missing values are filled with 0.
    train_part = df_shuffled.iloc[:n_train].fillna(0)
    val_part = df_shuffled.iloc[n_train:n_train + n_val].fillna(0)

    # Extract input features and target variable for training
    X_train = train_part[['ram', 'storage', 'screen']].values
    y_train = train_part['final_price'].values

    # Train the model (no regularization)
    w_0, w = train_linear_regression(X_train, y_train)

    # Make predictions on the validation set
    X_val = val_part[['ram', 'storage', 'screen']].values
    y_pred = w_0 + X_val.dot(w)

    # Calculate RMSE and append to the list
    rmse = np.sqrt(np.mean((val_part['final_price'].values - y_pred) ** 2))
    rmse_scores.append(rmse)

# Calculate standard deviation of RMSE scores (population std, numpy default)
std_dev = np.std(rmse_scores)
print(f"Standard deviation of RMSE scores: {round(std_dev, 3)}")

Standard deviation of RMSE scores: 29.176


In [17]:
# Final evaluation: split with seed 9, train on train + validation with
# r=0.001, and report the RMSE on the held-out test part.

# Shuffle the dataset
np.random.seed(9)
idx = np.arange(len(df))
np.random.shuffle(idx)
df_shuffled = df.iloc[idx]

# Define sizes for train, validation, and test sets
n = len(df)
n_train = int(0.6 * n)  # 60% for training
n_val = int(0.2 * n)    # 20% for validation
n_test = n - (n_train + n_val)  # Remaining for test

# Check up front that every part gets at least one row; this also guarantees
# the test set is non-empty, so no later emptiness check is needed.
if n_train < 1 or n_val < 1 or n_test < 1:
    raise ValueError("Not enough data to create training, validation, and test sets.")

# Split the dataset, filling missing values with 0 in every part
df_train = df_shuffled.iloc[:n_train].fillna(0)
df_val = df_shuffled.iloc[n_train:n_train + n_val].fillna(0)
df_test = df_shuffled.iloc[n_train + n_val:].fillna(0)

# Print the sizes of the datasets
print(f"Train size: {len(df_train)}, Validation size: {len(df_val)}, Test size: {len(df_test)}")

# Prepare the combined training and validation dataset
df_combined = pd.concat([df_train, df_val])

X_train_combined = df_combined[['ram', 'storage', 'screen']].values
y_train_combined = df_combined['final_price'].values

# Train model with r=0.001
w_0_combined, w_combined = train_linear_regression_reg(X_train_combined, y_train_combined, r=0.001)

# Prepare the test dataset (df_test was already zero-filled during the split)
X_test = df_test[['ram', 'storage', 'screen']].values
y_test = df_test['final_price'].values

# Make predictions on the test dataset
y_pred_test = w_0_combined + X_test.dot(w_combined)

# Calculate RMSE
rmse_test = np.sqrt(np.mean((y_test - y_pred_test) ** 2))
print(f"RMSE on test dataset: {round(rmse_test, 2)}")

Train size: 1296, Validation size: 432, Test size: 432
RMSE on test dataset: 608.61
