In [510]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
import joblib

In [511]:
#Loading Dataset
df = pd.read_csv("dataset/laptop_price.csv")

In [512]:
df

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.00
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1298,1316,Lenovo,Yoga 500-14ISK,2 in 1 Convertible,14.0,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 6500U 2.5GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows 10,1.8kg,638.00
1299,1317,Lenovo,Yoga 900-13ISK,2 in 1 Convertible,13.3,IPS Panel Quad HD+ / Touchscreen 3200x1800,Intel Core i7 6500U 2.5GHz,16GB,512GB SSD,Intel HD Graphics 520,Windows 10,1.3kg,1499.00
1300,1318,Lenovo,IdeaPad 100S-14IBR,Notebook,14.0,1366x768,Intel Celeron Dual Core N3050 1.6GHz,2GB,64GB Flash Storage,Intel HD Graphics,Windows 10,1.5kg,229.00
1301,1319,HP,15-AC110nv (i7-6500U/6GB/1TB/Radeon,Notebook,15.6,1366x768,Intel Core i7 6500U 2.5GHz,6GB,1TB HDD,AMD Radeon R5 M330,Windows 10,2.19kg,764.00


In [513]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   laptop_ID         1303 non-null   int64  
 1   Company           1303 non-null   object 
 2   Product           1303 non-null   object 
 3   TypeName          1303 non-null   object 
 4   Inches            1303 non-null   float64
 5   ScreenResolution  1303 non-null   object 
 6   Cpu               1303 non-null   object 
 7   Ram               1303 non-null   object 
 8   Memory            1303 non-null   object 
 9   Gpu               1303 non-null   object 
 10  OpSys             1303 non-null   object 
 11  Weight            1303 non-null   object 
 12  Price_euros       1303 non-null   float64
dtypes: float64(2), int64(1), object(10)
memory usage: 132.5+ KB


In [514]:
df['Is_IPS'] = df["ScreenResolution"].str.contains('IPS Panel', case=False)
df['Is_Retina'] = df["ScreenResolution"].str.contains('Retina Display', case=False)
df['Is_Touchscreen'] = df["ScreenResolution"].str.contains('Touchscreen', case=False)
df['Is_4K'] = df["ScreenResolution"].str.contains('4K Ultra HD', case=False)
df['Is_QuadHDPlus'] = df["ScreenResolution"].str.contains('Quad HD+', case=False)
df["ScreenResolution"] = df["ScreenResolution"].str.extract(r"(\d+x\d+)" , expand=False)
df[["ResX" , "ResY"]] = df["ScreenResolution"].str.split("x" ,expand=True).astype(int)

Data cleaning/preprocessing

In [515]:
df.drop("laptop_ID", axis=1 , inplace=True) #This feature is irrelevant
df['Cpu_brand'] = df['Cpu'].apply(lambda x: x.split()[0]) #this extracts the CPU brand e.g intel
df['Cpu_speed'] = df['Cpu'].str.extract(r'(\d+\.\d+)').astype(float) #extracts the speed of the cpu
df['Cpu_Series'] = df['Cpu'].str.extract(r'(i[3579])', expand=False)# extracts cpu series
df.drop("Cpu" , axis=1 , inplace=True)
df.drop("Weight" ,axis=1 , inplace=True)
df["Ram"] = df["Ram"].str.replace("GB", "").astype(int)
df.drop("Product" , axis=1 , inplace=True)
for col in ["Company", "TypeName", "Gpu", "OpSys", "Cpu_brand", "Cpu_Series"]:
    df[col] = df[col].fillna("Unknown")



array(['Intel', 'AMD', 'Samsung'], dtype=object)

In [516]:
def convert_to_gb(mem):
    mem = mem.replace('TB', '000GB')  # Replaces TB with 000GB
    return mem
df['Memory'] = df['Memory'].apply(convert_to_gb)
df['Memory'] = df['Memory'].str.replace('+', ',')
memory_split = df['Memory'].str.split(',', expand=True)
import re


#This extracts the meomry and stores it in a column for each memory type so it can be mpore useful for model interpretation
def extract_memory(mem_str, type_):
    if pd.isnull(mem_str):
        return 0
    if type_ in mem_str:
        size = int(re.search(r'(\d+)GB', mem_str).group(1))
        return size
    return 0

df['SSD'] = memory_split[0].apply(lambda x: extract_memory(x, 'SSD')) + \
            memory_split[1].apply(lambda x: extract_memory(x, 'SSD') if x else 0)

df['HDD'] = memory_split[0].apply(lambda x: extract_memory(x, 'HDD')) + \
            memory_split[1].apply(lambda x: extract_memory(x, 'HDD') if x else 0)
df['Flash_Storage'] = memory_split[0].apply(lambda x: extract_memory(x, 'Flash')) + \
                      memory_split[1].apply(lambda x: extract_memory(x, 'Flash') if x else 0)

df['Hybrid'] = memory_split[0].apply(lambda x: extract_memory(x, 'Hybrid')) + \
               memory_split[1].apply(lambda x: extract_memory(x, 'Hybrid') if x else 0)
df.drop("Memory" , axis=1 , inplace=True)
df.drop("ScreenResolution" , axis=1, inplace=True)

In [517]:
df["Gpu_brand"] = df["Gpu"].apply(lambda x: x.split()[0])
df.drop("Gpu" , axis=1 , inplace=True)

In [518]:
df["Cpu_Series"] = df["Cpu_Series"].fillna("Unknown")
df = df.dropna(subset=["Cpu_speed"])


Data Spliting 

In [519]:

X = df.drop("Price_euros" , axis=1) #Features for training
Y = df["Price_euros"] #target

X_train , X_test , Y_train , Y_test = train_test_split(X, Y, test_size=0.15 , random_state=4)

In [520]:
# grouping features into numeric and categorical type
categorical = ["Company", "TypeName", "OpSys", "Gpu_brand" , "Cpu_brand", "Cpu_Series",]
numeric = ["Inches", "Ram", "Cpu_speed", "SSD", "HDD", "Flash_Storage", "Hybrid", "Is_IPS" , "Is_Retina" , "Is_Touchscreen" , "Is_4K"]
target = ["Price_euros"]

In [521]:
from sklearn.preprocessing import StandardScaler

# Select numeric columns
num_cols = [
    "Inches", "Ram", "Cpu_speed", "SSD", "HDD", "Flash_Storage", "Hybrid", "Is_IPS" , "Is_Retina" , "Is_Touchscreen" , "Is_4K"
]

scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[num_cols] = scaler.fit_transform(df[num_cols])


In [522]:
#Preprocessor
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
    ("num", "passthrough", numeric)
    ])


In [523]:
#pipeline with linear regression
model = Pipeline([
    ("preprocess", preprocessor),
    ("regressor", LinearRegression())
])

In [524]:
#train the model
model.fit(X_train, Y_train)

0,1,2
,steps,"[('preprocess', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [525]:
#testing model
Y_pred = model.predict(X_test)

In [526]:
from sklearn.metrics import mean_absolute_error , mean_squared_error , r2_score
print("MAE:" , mean_absolute_error(Y_test , Y_pred))
print("MSE:" , mean_squared_error(Y_test , Y_pred))
print("RMSE:" , np.sqrt(mean_squared_error(Y_test , Y_pred)))
print("R2 Score:" , r2_score(Y_test, Y_pred))

MAE: 259.5537634517095
MSE: 132596.8380658973
RMSE: 364.1384874823002
R2 Score: 0.7092736221215283


In [527]:
import joblib

# Save model and scaler
joblib.dump(model, "laptop_price_model.pkl")
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']