## Machine Learning for Regression

In [1]:
import pandas as pd
import numpy as np

## Data preparation

In [2]:
df = pd.read_csv('laptops.csv')

In [3]:
df.head()

Unnamed: 0,Laptop,Status,Brand,Model,CPU,RAM,Storage,Storage type,GPU,Screen,Touch,Final Price
0,ASUS ExpertBook B1 B1502CBA-EJ0436X Intel Core...,New,Asus,ExpertBook,Intel Core i5,8,512,SSD,,15.6,No,1009.0
1,Alurin Go Start Intel Celeron N4020/8GB/256GB ...,New,Alurin,Go,Intel Celeron,8,256,SSD,,15.6,No,299.0
2,ASUS ExpertBook B1 B1502CBA-EJ0424X Intel Core...,New,Asus,ExpertBook,Intel Core i3,8,256,SSD,,15.6,No,789.0
3,MSI Katana GF66 12UC-082XES Intel Core i7-1270...,New,MSI,Katana,Intel Core i7,16,1000,SSD,RTX 3050,15.6,No,1199.0
4,HP 15S-FQ5085NS Intel Core i5-1235U/16GB/512GB...,New,HP,15S,Intel Core i5,16,512,SSD,,15.6,No,669.01


In [4]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [5]:
df.dtypes

laptop           object
status           object
brand            object
model            object
cpu              object
ram               int64
storage           int64
storage_type     object
gpu              object
screen          float64
touch            object
final_price     float64
dtype: object

In [6]:
df=df[['ram',
'storage',
'screen',
'final_price']]

In [7]:
df.head()

Unnamed: 0,ram,storage,screen,final_price
0,8,512,15.6,1009.0
1,8,256,15.6,299.0
2,8,256,15.6,789.0
3,16,1000,15.6,1199.0
4,16,512,15.6,669.01


In [8]:
df.isnull().sum()

ram            0
storage        0
screen         4
final_price    0
dtype: int64

In [9]:
print(df.ram.median())

16.0


## Setting up the validation framework

In [45]:
n = len(df)

n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

In [46]:
n

2160

In [47]:
n_val, n_test, n_train

(432, 432, 1296)

In [48]:
df_train = df.iloc[:n_train]
df_val = df.iloc[n_train:n_train+n_val]
df_test = df.iloc[n_train+n_val:]

In [49]:
idx = np.arange(n)

In [50]:
np.random.seed(42)
np.random.shuffle(idx)

In [51]:
df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train+n_val]]
df_test = df.iloc[idx[n_train+n_val:]]

In [52]:
df_train.head()

Unnamed: 0,ram,storage,screen,final_price
2079,32,1000,15.6,1123.29
668,4,64,14.1,201.05
2073,32,1000,14.0,997.74
1113,16,512,13.3,1016.0
788,32,1000,16.0,2739.0


In [53]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [54]:
y_train = df_train.final_price.values
y_val = df_val.final_price.values
y_test = df_test.final_price.values

In [55]:
del df_train['final_price']
del df_val['final_price']
del df_test['final_price']

## Linear regression

In [56]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

In [57]:
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [58]:
def prepare_X(df):
    df_num = df.fillna(0)
    X = df_num.values
    return X

In [59]:
X_train = prepare_X(df_train)
w0, w = train_linear_regression(X_train, y_train)

X_val = prepare_X(df_val)
y_pred = w0 + X_val.dot(w)
rmse(y_val, y_pred)

np.float64(597.3635593619622)

In [60]:
def prepare_feature(df):
    df_num = df.fillna(df.mean())
    X = df_num.values
    return X

In [61]:
X_train = prepare_feature(df_train)
w0, w = train_linear_regression(X_train, y_train)

X_val = prepare_feature(df_val)
y_pred = w0 + X_val.dot(w)
rmse(y_val, y_pred)

np.float64(600.2619483181334)

## Regularization

In [62]:
def train_linear_regression_reg(X, y, r=0.001):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])

    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

In [63]:
for r in [0, 0.01, 0.1, 1, 5, 10, 100]:
    X_train = prepare_X(df_train)
    w0, w = train_linear_regression_reg(X_train, y_train, r=r)

    X_val = prepare_X(df_val)
    y_pred = w0 + X_val.dot(w)
    score = rmse(y_val, y_pred)
    
    print(r, w0, score)

0 676.8954853003995 597.3635593619622
0.01 676.2736817205587 597.3616819856013
0.1 670.7284836314063 597.345159296362
1 619.9030834108207 597.2121215589519
5 463.7771697142356 597.0111186297033
10 352.79713367679835 597.0587680661115
100 66.92071440181994 597.9032640603043


## Using different seeds for data splitting 

In [36]:
# Function to split the data into train, validation, and test sets
def split_data(df, idx, n_train, n_val):
    df_train = df.iloc[idx[:n_train]]
    df_val = df.iloc[idx[n_train:n_train+n_val]]
    df_test = df.iloc[idx[n_train+n_val:]]
    return df_train.reset_index(drop=True), df_val.reset_index(drop=True), df_test.reset_index(drop=True)

In [37]:
# Function to prepare the target variable 
def prepare_target(df_train, df_val, df_test):
    y_train = df_train.final_price.values
    y_val = df_val.final_price.values
    y_test = df_test.final_price.values
    return y_train, y_val, y_test

In [38]:
# Function to remove the target variable from the dataframes
def remove_target(df_train, df_val, df_test):
    del df_train['final_price']
    del df_val['final_price']
    del df_test['final_price']
    return df_train, df_val, df_test

In [39]:
# Function to train and evaluate the model for a specific seed
def evaluate_model(df, idx, n_train, n_val, seed):
    np.random.seed(seed)
    np.random.shuffle(idx)
    
    # Split the data
    df_train, df_val, df_test = split_data(df, idx, n_train, n_val)
    
    # Prepare target values
    y_train, y_val, y_test = prepare_target(df_train, df_val, df_test)
    
    # Remove target from the feature datasets
    df_train, df_val, df_test = remove_target(df_train, df_val, df_test)
    
    # Prepare the feature matrices
    X_train = prepare_X(df_train)
    X_val = prepare_X(df_val)
    
    # Train the linear regression model
    w0, w = train_linear_regression(X_train, y_train)
    
    # Make predictions on validation set
    y_pred = w0 + X_val.dot(w)
    
    # Calculate RMSE
    return rmse(y_val, y_pred)

In [40]:
# Main function to run the model evaluation across multiple seeds and calculate statistics
def run_experiment(df, idx, n_train, n_val, seeds):
    rmses = []
    
    for s in seeds:
        rmse_value = evaluate_model(df, idx, n_train, n_val, s)
        rmses.append(rmse_value)
    
    # Compute standard deviation of the RMSE results
    std_rmse = np.std(rmses)
    
    return rmses, std_rmse

In [41]:

seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
rmses, std_rmse = run_experiment(df, idx, n_train, n_val, seeds)

# Output RMSE results and standard deviation
print("RMSE values:", rmses)
print("Standard deviation of RMSE:", std_rmse)


RMSE values: [np.float64(575.9392463666891), np.float64(565.5244895762405), np.float64(526.0005755651209), np.float64(616.313177990602), np.float64(550.9657785624672), np.float64(625.4427205786747), np.float64(615.1707929609928), np.float64(543.6495259068446), np.float64(557.756971029724), np.float64(610.514285789747)]
Standard deviation of RMSE: 33.66997372680491


## Training the model with training set and validation set combined and assessing performance on test set 

In [43]:
# Combine training and validation datasets
def combine_train_val(df_train, df_val, y_train, y_val):
    df_combined = pd.concat([df_train, df_val], axis=0).reset_index(drop=True)
    y_combined = np.concatenate([y_train, y_val], axis=0)
    return df_combined, y_combined

In [44]:



# Main function for this task
def run_experiment_with_combined_data(df, idx, n_train, n_val, seed, r=0.001):
    np.random.seed(seed)
    np.random.shuffle(idx)
    
    # Split data
    df_train, df_val, df_test = split_data(df, idx, n_train, n_val)
    
    # Prepare target values
    y_train, y_val, y_test = prepare_target(df_train, df_val, df_test)
    
    # Remove target from the feature datasets
    df_train, df_val, df_test = remove_target(df_train, df_val, df_test)
    
    # Combine train and validation datasets
    df_combined, y_combined = combine_train_val(df_train, df_val, y_train, y_val)
    
    # Fill missing values with 0
    df_combined = df_combined.fillna(0)
    df_test = df_test.fillna(0)
    
    # Prepare the feature matrices
    X_combined = prepare_X(df_combined)  # Assuming prepare_X is defined to transform features
    X_test = prepare_X(df_test)
    
    # Train the regularized linear regression model
    w0, w = train_linear_regression_reg(X_combined, y_combined, r=r)
    
    # Make predictions on the test set
    y_pred = w0 + X_test.dot(w)
    
    # Calculate RMSE on the test set
    test_rmse = rmse(y_test, y_pred)
    
    return test_rmse


seed = 9       # Use seed 9 for this task

# Call the function to get RMSE on the test set
test_rmse = run_experiment_with_combined_data(df, idx, n_train, n_val, seed, r=0.001)
print("RMSE on the test set:", test_rmse)


RMSE on the test set: 618.4960011442292
