1. Import Libraries

In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns

2. Load Cleaned Data

In [None]:
# [Cell 6]
# Load the *processed* dataset produced by the data_cleaning.ipynb notebook.
# The path is relative, so it is resolved against the kernel's working directory.
DATA_PATH = '../../data/processed/cleaned_laptop_data.csv'

try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as err:
    # Stop here with an actionable message. Printing and carrying on would leave
    # `df` undefined, so the very next line would die with a confusing NameError.
    raise FileNotFoundError(
        f"The file '{DATA_PATH}' was not found. "
        "Please make sure you have run the data_cleaning.ipynb notebook first to create this file."
    ) from err

print("Processed data loaded successfully.")
df.head()

Unnamed: 0,Price,Processor_Name,RAM_Expandable,Ghz,GPU,RAM_GB,SSD_GB,Battery_Life_Hrs,Brand_AVITA,Brand_Acer,...,GPU_Brand_ARM,GPU_Brand_ATI,GPU_Brand_Apple,GPU_Brand_Intel,GPU_Brand_MediaTek,GPU_Brand_Microsoft,GPU_Brand_NIVIDIA,GPU_Brand_NVIDIA,GPU_Brand_Nvidia,GPU_Brand_Qualcomm
0,22990,MediaTek Octa-core,Not Expandable,2.0 Ghz Processor,Integrated Graphics,4,64.0,12.0,False,False,...,False,False,False,False,True,False,False,False,False,False
1,36289,AMD Hexa-Core Ryzen 5,12 GB Expandable,4.0 Ghz Processor,Radeon,8,512.0,11.0,False,False,...,False,False,False,False,False,False,False,False,False,False
2,78500,Intel Core i5 (12th Gen),32 GB Expandable,3.3 Ghz Processor,"GeForce RTX 3050 GPU, 4 GB",16,512.0,10.0,False,False,...,False,False,False,False,False,False,False,True,False,False
3,55490,Intel Core i5 (12th Gen),8 GB Expandable,4.2 Ghz Processor,Iris Xe,8,512.0,7.3,False,False,...,False,False,False,True,False,False,False,False,False,False
4,21990,Intel Core i3 (11th Gen),Not Expandable,1.7 Ghz Processor,UHD,8,512.0,8.0,False,False,...,False,False,False,True,False,False,False,False,False,False


3. Define Feature Matrix (X) and Target (y)

In [None]:

# Feature matrix: every column of `df` except the prediction target 'Price'.
X = df.drop(columns=['Price'])

# Target: log(1 + Price). The log transform is meant to tame the skew of the
# price distribution for the linear models; predictions made on this scale
# are mapped back to prices with np.expm1.
y = df['Price'].pipe(np.log1p)

print(
    f"Features (X) shape: {X.shape}",
    f"Target (y) shape: {y.shape}",
    sep="\n",
)

4. Create Training and Test Sets

In [None]:

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

5. Set Up Preprocessing Pipeline (Scaling)

In [None]:
# [Cell 9]
# 1. Identify columns that need scaling
# These are the numerical and ordinal features.
# NOTE(review): the df.head() preview in this notebook shows text values such as
# 'Not Expandable' for RAM_Expandable, and names like RAM_GB / SSD_GB / Ghz rather
# than the ones listed here -- confirm this list against the current cleaned file.
numerical_ordinal_cols = [
    'RAM_Expandable', 'RAM', 'RAM_TYPE(DDR)', 'Display_type',
    'Processor_Tier', 'Processor_Speed(Ghz)', 'Display_Tier', 'GPU_Tier',
    'SSD(GB)', 'HDD(GB)'
]

# Fail fast if the list has drifted from the loaded data. Without this check the
# mismatch only surfaces later, when the ColumnTransformer is fitted, far from
# the place where the column names are actually defined.
missing_scale_cols = [col for col in numerical_ordinal_cols if col not in X.columns]
if missing_scale_cols:
    raise ValueError(
        f"Columns listed for scaling are missing from X: {missing_scale_cols}. "
        "Update numerical_ordinal_cols to match the cleaned dataset."
    )

# 2. Identify columns to pass through (our one-hot encoded features)
# Kept for inspection only: the transformer below does not use this list, it
# relies on remainder='passthrough' to forward every column that is not scaled.
passthrough_cols = [col for col in X.columns if col not in numerical_ordinal_cols]

# 3. Create the ColumnTransformer
# This applies StandardScaler to the numerical/ordinal columns
# and does nothing ('passthrough') to the remaining columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('scale', StandardScaler(), numerical_ordinal_cols)
    ],
    remainder='passthrough' # Passes the remaining (OHE) columns through without scaling
)