In [3783]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error


In [3784]:
df = pd.read_csv(r"../../dataset/laptopData.csv")

In [3785]:
df.isnull().sum()

Unnamed: 0          30
Company             30
TypeName            30
Inches              30
ScreenResolution    30
Cpu                 30
Ram                 30
Memory              30
Gpu                 30
OpSys               30
Weight              30
Price               30
dtype: int64

In [3786]:
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)
df_train

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
1158,1158.0,Lenovo,Notebook,15.6,Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,8GB,1.0TB Hybrid,AMD Radeon R5 M330,Windows 10,2.5kg,42010.7472
904,904.0,Lenovo,Notebook,15.6,IPS Panel Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,512GB SSD,Intel HD Graphics 620,Windows 10,1.95kg,104588.1072
439,439.0,Asus,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,4GB,256GB SSD,Intel HD Graphics 620,Linux,2kg,29783.5200
581,581.0,Dell,Notebook,15.6,1366x768,Intel Core i5 7300U 2.6GHz,8GB,500GB HDD,Intel HD Graphics 620,Windows 10,1.9kg,53733.9456
676,676.0,Asus,Ultrabook,14,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,Windows 10,1.1kg,60472.8000
...,...,...,...,...,...,...,...,...,...,...,...,...
1095,1095.0,Acer,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD + 1TB HDD,Nvidia GeForce GTX 950M,Windows 10,2.4kg,42570.7200
1130,1130.0,HP,Notebook,15.6,1366x768,Intel Core i7 7500U 2.7GHz,8GB,2TB HDD,Intel HD Graphics 620,Windows 10,2.04kg,33513.1200
1294,1294.0,HP,Notebook,15.6,Full HD 1920x1080,AMD A9-Series 9410 2.9GHz,6GB,1.0TB Hybrid,AMD Radeon R7 M440,Windows 10,2.04kg,29303.4672
860,860.0,Lenovo,Notebook,15.6,IPS Panel Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,1TB HDD,Intel HD Graphics 620,Windows 10,2.3kg,45323.1648


In [3787]:
df_train.isnull().sum()

Unnamed: 0          23
Company             23
TypeName            23
Inches              23
ScreenResolution    23
Cpu                 23
Ram                 23
Memory              23
Gpu                 23
OpSys               23
Weight              23
Price               23
dtype: int64

In [3788]:
df_train = df_train.drop(columns="Unnamed: 0")

In [3789]:
df_test = df_test.drop(columns="Unnamed: 0")

In [3790]:
df_train.isnull().sum()

Company             23
TypeName            23
Inches              23
ScreenResolution    23
Cpu                 23
Ram                 23
Memory              23
Gpu                 23
OpSys               23
Weight              23
Price               23
dtype: int64

In [3791]:
df_train = df_train.replace("?", np.nan)

In [3792]:
df_test = df_test.replace("?", np.nan)

In [3793]:
df_train = df_train.dropna()

In [3794]:
df_test = df_test.dropna()

In [3795]:
df_train['Ram'] = df_train['Ram'].str.replace("GB", "")
df_train['Ram'] = pd.to_numeric(df_train['Ram'], errors='coerce')

In [3796]:
df_test['Ram'] = df_test['Ram'].str.replace("GB", "")
df_test['Ram'] = pd.to_numeric(df_test['Ram'], errors='coerce')

In [3797]:
df_train['Weight'] = df_train['Weight'].str.replace("kg", "")
df_train['Weight'] = pd.to_numeric(df_train['Weight'], errors='coerce')

In [3798]:
df_test['Weight'] = df_test['Weight'].str.replace("kg", "")
df_test['Weight'] = pd.to_numeric(df_test['Weight'], errors='coerce')

In [3799]:
def split_memory_column(dataframe, column_name):
    # Split the column into two new columns
    dataframe[['Memory', 'Memory_Type']] = dataframe[column_name].str.split(' ', n=1, expand=True)
    return dataframe

In [3800]:
df_train = split_memory_column(df_train, 'Memory')
df_train

Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price,Memory_Type
1158,Lenovo,Notebook,15.6,Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,8,1.0TB,AMD Radeon R5 M330,Windows 10,2.50,42010.7472,Hybrid
904,Lenovo,Notebook,15.6,IPS Panel Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,512GB,Intel HD Graphics 620,Windows 10,1.95,104588.1072,SSD
439,Asus,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,4,256GB,Intel HD Graphics 620,Linux,2.00,29783.5200,SSD
581,Dell,Notebook,15.6,1366x768,Intel Core i5 7300U 2.6GHz,8,500GB,Intel HD Graphics 620,Windows 10,1.90,53733.9456,HDD
676,Asus,Ultrabook,14,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB,Intel HD Graphics 620,Windows 10,1.10,60472.8000,SSD
...,...,...,...,...,...,...,...,...,...,...,...,...
1095,Acer,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB,Nvidia GeForce GTX 950M,Windows 10,2.40,42570.7200,SSD + 1TB HDD
1130,HP,Notebook,15.6,1366x768,Intel Core i7 7500U 2.7GHz,8,2TB,Intel HD Graphics 620,Windows 10,2.04,33513.1200,HDD
1294,HP,Notebook,15.6,Full HD 1920x1080,AMD A9-Series 9410 2.9GHz,6,1.0TB,AMD Radeon R7 M440,Windows 10,2.04,29303.4672,Hybrid
860,Lenovo,Notebook,15.6,IPS Panel Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,1TB,Intel HD Graphics 620,Windows 10,2.30,45323.1648,HDD


In [3801]:
def split_memory_unit(dataframe, column_name):
    dataframe['Memory_Capacity'] = dataframe[column_name].str.extract('(\d+)')
    dataframe['Memory_Unit'] = dataframe[column_name].str.extract('([A-Za-z]+)')
    return dataframe

  dataframe['Memory_Capacity'] = dataframe[column_name].str.extract('(\d+)')


In [3802]:
df_train = split_memory_unit(df_train, 'Memory')
df_train['Memory_Capacity'] = pd.to_numeric(df_train['Memory_Capacity'], errors='coerce')
df_train = df_train.drop(columns="Memory")
df_train

Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Gpu,OpSys,Weight,Price,Memory_Type,Memory_Capacity,Memory_Unit
1158,Lenovo,Notebook,15.6,Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,8,AMD Radeon R5 M330,Windows 10,2.50,42010.7472,Hybrid,1,TB
904,Lenovo,Notebook,15.6,IPS Panel Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,Intel HD Graphics 620,Windows 10,1.95,104588.1072,SSD,512,GB
439,Asus,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,4,Intel HD Graphics 620,Linux,2.00,29783.5200,SSD,256,GB
581,Dell,Notebook,15.6,1366x768,Intel Core i5 7300U 2.6GHz,8,Intel HD Graphics 620,Windows 10,1.90,53733.9456,HDD,500,GB
676,Asus,Ultrabook,14,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,Intel HD Graphics 620,Windows 10,1.10,60472.8000,SSD,256,GB
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,Acer,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,Nvidia GeForce GTX 950M,Windows 10,2.40,42570.7200,SSD + 1TB HDD,256,GB
1130,HP,Notebook,15.6,1366x768,Intel Core i7 7500U 2.7GHz,8,Intel HD Graphics 620,Windows 10,2.04,33513.1200,HDD,2,TB
1294,HP,Notebook,15.6,Full HD 1920x1080,AMD A9-Series 9410 2.9GHz,6,AMD Radeon R7 M440,Windows 10,2.04,29303.4672,Hybrid,1,TB
860,Lenovo,Notebook,15.6,IPS Panel Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,Intel HD Graphics 620,Windows 10,2.30,45323.1648,HDD,1,TB


In [3803]:
df_test = split_memory_column(df_test, 'Memory')
df_test = split_memory_unit(df_test, 'Memory')
df_test['Memory_Capacity'] = pd.to_numeric(df_test['Memory_Capacity'], errors='coerce')
df_test = df_test.drop(columns="Memory")
df_test

Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Gpu,OpSys,Weight,Price,Memory_Type,Memory_Capacity,Memory_Unit
479,Toshiba,Notebook,13.3,IPS Panel Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,Intel HD Graphics 620,Windows 10,1.05,89084.160,SSD,256,GB
1022,HP,Notebook,13.3,IPS Panel Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,Intel HD Graphics 620,Windows 10,1.11,61218.720,SSD,256,GB
298,Lenovo,Notebook,15.6,Full HD 1920x1080,AMD A10-Series 9600P 2.4GHz,6,AMD Radeon R5 430,Windows 10,2.40,26586.720,HDD,1,TB
1265,Lenovo,Notebook,15.6,IPS Panel Full HD 1920x1080,Intel Core i7 6700HQ 2.6GHz,8,Nvidia GeForce GTX 960M,Windows 10,2.60,47898.720,HDD,1,TB
582,HP,Notebook,13.3,Full HD 1920x1080,Intel Core i3 7100U 2.4GHz,4,Intel HD Graphics 620,Windows 10,1.49,38308.320,SSD,128,GB
...,...,...,...,...,...,...,...,...,...,...,...,...,...
672,Lenovo,2 in 1 Convertible,13.3,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i5 7200U 2.5GHz,8,Intel HD Graphics 620,Windows 10,1.30,55091.520,SSD,256,GB
506,Asus,Notebook,15.6,Full HD 1920x1080,Intel Core i7 7500U 2.7GHz,8,Intel HD Graphics 620,Windows 10,2.00,65214.720,SSD + 1TB HDD,256,GB
668,Toshiba,Notebook,15.6,IPS Panel Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,Intel HD Graphics 620,Windows 10,2.00,67026.240,SSD,256,GB
778,Razer,Gaming,14,Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,16,Nvidia GeForce GTX 1060,Windows 10,1.95,154458.720,SSD,512,GB


In [3804]:
def categorize_resolution(dataframe, column_name):
    # Mapping of resolution to descriptive labels based on provided dataset
    resolution_dict = {
        '1366x768': 'HD',
        '1920x1080': 'Full HD',
        '2560x1440': 'Quad HD',
        '3200x1800': 'Quad HD+',
        '3840x2160': '4K Ultra HD'
    }

    # Directly map extracted resolutions to categories
    extracted_resolutions = dataframe[column_name].str.extract(r'(\d+x\d+)')[0]
    dataframe['Resolution_Category'] = extracted_resolutions.map(resolution_dict).fillna('Unknown')

    return dataframe

In [3805]:
df_train = categorize_resolution(df_train, "ScreenResolution")
df_train[['Width', 'Height']] = df_train['ScreenResolution'].str.extract('(\d+)x(\d+)')
df_train['Width'] = pd.to_numeric(df_train['Width'], errors='coerce')
df_train['Height'] = pd.to_numeric(df_train['Height'], errors='coerce')
df_train = df_train.drop(columns="ScreenResolution")
df_train

  df_train[['Width', 'Height']] = df_train['ScreenResolution'].str.extract('(\d+)x(\d+)')


Unnamed: 0,Company,TypeName,Inches,Cpu,Ram,Gpu,OpSys,Weight,Price,Memory_Type,Memory_Capacity,Memory_Unit,Resolution_Category,Width,Height
1158,Lenovo,Notebook,15.6,Intel Core i5 6200U 2.3GHz,8,AMD Radeon R5 M330,Windows 10,2.50,42010.7472,Hybrid,1,TB,Full HD,1920,1080
904,Lenovo,Notebook,15.6,Intel Core i5 7200U 2.5GHz,8,Intel HD Graphics 620,Windows 10,1.95,104588.1072,SSD,512,GB,Full HD,1920,1080
439,Asus,Notebook,15.6,Intel Core i5 7200U 2.5GHz,4,Intel HD Graphics 620,Linux,2.00,29783.5200,SSD,256,GB,Full HD,1920,1080
581,Dell,Notebook,15.6,Intel Core i5 7300U 2.6GHz,8,Intel HD Graphics 620,Windows 10,1.90,53733.9456,HDD,500,GB,HD,1366,768
676,Asus,Ultrabook,14,Intel Core i5 7200U 2.5GHz,8,Intel HD Graphics 620,Windows 10,1.10,60472.8000,SSD,256,GB,Full HD,1920,1080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,Acer,Notebook,15.6,Intel Core i5 7200U 2.5GHz,8,Nvidia GeForce GTX 950M,Windows 10,2.40,42570.7200,SSD + 1TB HDD,256,GB,Full HD,1920,1080
1130,HP,Notebook,15.6,Intel Core i7 7500U 2.7GHz,8,Intel HD Graphics 620,Windows 10,2.04,33513.1200,HDD,2,TB,HD,1366,768
1294,HP,Notebook,15.6,AMD A9-Series 9410 2.9GHz,6,AMD Radeon R7 M440,Windows 10,2.04,29303.4672,Hybrid,1,TB,Full HD,1920,1080
860,Lenovo,Notebook,15.6,Intel Core i5 7200U 2.5GHz,8,Intel HD Graphics 620,Windows 10,2.30,45323.1648,HDD,1,TB,Full HD,1920,1080


In [3806]:
df_test = categorize_resolution(df_test, "ScreenResolution")
df_test[['Width', 'Height']] = df_test['ScreenResolution'].str.extract('(\d+)x(\d+)')
df_test['Width'] = pd.to_numeric(df_test['Width'], errors='coerce')
df_test['Height'] = pd.to_numeric(df_test['Height'], errors='coerce')
df_test = df_test.drop(columns="ScreenResolution")
df_test

  df_test[['Width', 'Height']] = df_test['ScreenResolution'].str.extract('(\d+)x(\d+)')


Unnamed: 0,Company,TypeName,Inches,Cpu,Ram,Gpu,OpSys,Weight,Price,Memory_Type,Memory_Capacity,Memory_Unit,Resolution_Category,Width,Height
479,Toshiba,Notebook,13.3,Intel Core i5 7200U 2.5GHz,8,Intel HD Graphics 620,Windows 10,1.05,89084.160,SSD,256,GB,Full HD,1920,1080
1022,HP,Notebook,13.3,Intel Core i5 7200U 2.5GHz,8,Intel HD Graphics 620,Windows 10,1.11,61218.720,SSD,256,GB,Full HD,1920,1080
298,Lenovo,Notebook,15.6,AMD A10-Series 9600P 2.4GHz,6,AMD Radeon R5 430,Windows 10,2.40,26586.720,HDD,1,TB,Full HD,1920,1080
1265,Lenovo,Notebook,15.6,Intel Core i7 6700HQ 2.6GHz,8,Nvidia GeForce GTX 960M,Windows 10,2.60,47898.720,HDD,1,TB,Full HD,1920,1080
582,HP,Notebook,13.3,Intel Core i3 7100U 2.4GHz,4,Intel HD Graphics 620,Windows 10,1.49,38308.320,SSD,128,GB,Full HD,1920,1080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
672,Lenovo,2 in 1 Convertible,13.3,Intel Core i5 7200U 2.5GHz,8,Intel HD Graphics 620,Windows 10,1.30,55091.520,SSD,256,GB,Full HD,1920,1080
506,Asus,Notebook,15.6,Intel Core i7 7500U 2.7GHz,8,Intel HD Graphics 620,Windows 10,2.00,65214.720,SSD + 1TB HDD,256,GB,Full HD,1920,1080
668,Toshiba,Notebook,15.6,Intel Core i5 7200U 2.5GHz,8,Intel HD Graphics 620,Windows 10,2.00,67026.240,SSD,256,GB,Full HD,1920,1080
778,Razer,Gaming,14,Intel Core i7 7700HQ 2.8GHz,16,Nvidia GeForce GTX 1060,Windows 10,1.95,154458.720,SSD,512,GB,Full HD,1920,1080


In [3807]:
def split_GPU_column(dataframe, column_name):
    # Split the column into two new columns based on the last space
    dataframe[['Gpu', 'GPU_Ghz']] = dataframe[column_name].str.rsplit(' ', n=1, expand=True)
    return dataframe

In [3808]:
df_train = split_GPU_column(df_train, "Gpu")
# df_train['GPU_Ghz'] = df_train['GPU_Ghz'].str.replace(r'[^\d.]', '', regex=True)
# df_train['GPU_Ghz'] = pd.to_numeric(df_train['GPU_Ghz'], errors='coerce')
df_train

Unnamed: 0,Company,TypeName,Inches,Cpu,Ram,Gpu,OpSys,Weight,Price,Memory_Type,Memory_Capacity,Memory_Unit,Resolution_Category,Width,Height,GPU_Ghz
1158,Lenovo,Notebook,15.6,Intel Core i5 6200U 2.3GHz,8,AMD Radeon R5,Windows 10,2.50,42010.7472,Hybrid,1,TB,Full HD,1920,1080,M330
904,Lenovo,Notebook,15.6,Intel Core i5 7200U 2.5GHz,8,Intel HD Graphics,Windows 10,1.95,104588.1072,SSD,512,GB,Full HD,1920,1080,620
439,Asus,Notebook,15.6,Intel Core i5 7200U 2.5GHz,4,Intel HD Graphics,Linux,2.00,29783.5200,SSD,256,GB,Full HD,1920,1080,620
581,Dell,Notebook,15.6,Intel Core i5 7300U 2.6GHz,8,Intel HD Graphics,Windows 10,1.90,53733.9456,HDD,500,GB,HD,1366,768,620
676,Asus,Ultrabook,14,Intel Core i5 7200U 2.5GHz,8,Intel HD Graphics,Windows 10,1.10,60472.8000,SSD,256,GB,Full HD,1920,1080,620
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,Acer,Notebook,15.6,Intel Core i5 7200U 2.5GHz,8,Nvidia GeForce GTX,Windows 10,2.40,42570.7200,SSD + 1TB HDD,256,GB,Full HD,1920,1080,950M
1130,HP,Notebook,15.6,Intel Core i7 7500U 2.7GHz,8,Intel HD Graphics,Windows 10,2.04,33513.1200,HDD,2,TB,HD,1366,768,620
1294,HP,Notebook,15.6,AMD A9-Series 9410 2.9GHz,6,AMD Radeon R7,Windows 10,2.04,29303.4672,Hybrid,1,TB,Full HD,1920,1080,M440
860,Lenovo,Notebook,15.6,Intel Core i5 7200U 2.5GHz,8,Intel HD Graphics,Windows 10,2.30,45323.1648,HDD,1,TB,Full HD,1920,1080,620


In [3809]:
df_test = split_GPU_column(df_test, "Gpu")
# df_test['GPU_Ghz'] = df_test['GPU_Ghz'].str.replace(r'[^\d.]', '', regex=True)
# df_test['GPU_Ghz'] = pd.to_numeric(df_test['GPU_Ghz'], errors='coerce')
df_test

Unnamed: 0,Company,TypeName,Inches,Cpu,Ram,Gpu,OpSys,Weight,Price,Memory_Type,Memory_Capacity,Memory_Unit,Resolution_Category,Width,Height,GPU_Ghz
479,Toshiba,Notebook,13.3,Intel Core i5 7200U 2.5GHz,8,Intel HD Graphics,Windows 10,1.05,89084.160,SSD,256,GB,Full HD,1920,1080,620
1022,HP,Notebook,13.3,Intel Core i5 7200U 2.5GHz,8,Intel HD Graphics,Windows 10,1.11,61218.720,SSD,256,GB,Full HD,1920,1080,620
298,Lenovo,Notebook,15.6,AMD A10-Series 9600P 2.4GHz,6,AMD Radeon R5,Windows 10,2.40,26586.720,HDD,1,TB,Full HD,1920,1080,430
1265,Lenovo,Notebook,15.6,Intel Core i7 6700HQ 2.6GHz,8,Nvidia GeForce GTX,Windows 10,2.60,47898.720,HDD,1,TB,Full HD,1920,1080,960M
582,HP,Notebook,13.3,Intel Core i3 7100U 2.4GHz,4,Intel HD Graphics,Windows 10,1.49,38308.320,SSD,128,GB,Full HD,1920,1080,620
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
672,Lenovo,2 in 1 Convertible,13.3,Intel Core i5 7200U 2.5GHz,8,Intel HD Graphics,Windows 10,1.30,55091.520,SSD,256,GB,Full HD,1920,1080,620
506,Asus,Notebook,15.6,Intel Core i7 7500U 2.7GHz,8,Intel HD Graphics,Windows 10,2.00,65214.720,SSD + 1TB HDD,256,GB,Full HD,1920,1080,620
668,Toshiba,Notebook,15.6,Intel Core i5 7200U 2.5GHz,8,Intel HD Graphics,Windows 10,2.00,67026.240,SSD,256,GB,Full HD,1920,1080,620
778,Razer,Gaming,14,Intel Core i7 7700HQ 2.8GHz,16,Nvidia GeForce GTX,Windows 10,1.95,154458.720,SSD,512,GB,Full HD,1920,1080,1060


In [3810]:
def split_CPU_column(dataframe, column_name):
    # Split the column into two new columns based on the last space
    dataframe[['Cpu', 'Cpu_Ghz']] = dataframe[column_name].str.rsplit(' ', n=1, expand=True)
    return dataframe

In [3811]:
df_train = split_CPU_column(df_train, "Cpu")
df_train

Unnamed: 0,Company,TypeName,Inches,Cpu,Ram,Gpu,OpSys,Weight,Price,Memory_Type,Memory_Capacity,Memory_Unit,Resolution_Category,Width,Height,GPU_Ghz,Cpu_Ghz
1158,Lenovo,Notebook,15.6,Intel Core i5 6200U,8,AMD Radeon R5,Windows 10,2.50,42010.7472,Hybrid,1,TB,Full HD,1920,1080,M330,2.3GHz
904,Lenovo,Notebook,15.6,Intel Core i5 7200U,8,Intel HD Graphics,Windows 10,1.95,104588.1072,SSD,512,GB,Full HD,1920,1080,620,2.5GHz
439,Asus,Notebook,15.6,Intel Core i5 7200U,4,Intel HD Graphics,Linux,2.00,29783.5200,SSD,256,GB,Full HD,1920,1080,620,2.5GHz
581,Dell,Notebook,15.6,Intel Core i5 7300U,8,Intel HD Graphics,Windows 10,1.90,53733.9456,HDD,500,GB,HD,1366,768,620,2.6GHz
676,Asus,Ultrabook,14,Intel Core i5 7200U,8,Intel HD Graphics,Windows 10,1.10,60472.8000,SSD,256,GB,Full HD,1920,1080,620,2.5GHz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,Acer,Notebook,15.6,Intel Core i5 7200U,8,Nvidia GeForce GTX,Windows 10,2.40,42570.7200,SSD + 1TB HDD,256,GB,Full HD,1920,1080,950M,2.5GHz
1130,HP,Notebook,15.6,Intel Core i7 7500U,8,Intel HD Graphics,Windows 10,2.04,33513.1200,HDD,2,TB,HD,1366,768,620,2.7GHz
1294,HP,Notebook,15.6,AMD A9-Series 9410,6,AMD Radeon R7,Windows 10,2.04,29303.4672,Hybrid,1,TB,Full HD,1920,1080,M440,2.9GHz
860,Lenovo,Notebook,15.6,Intel Core i5 7200U,8,Intel HD Graphics,Windows 10,2.30,45323.1648,HDD,1,TB,Full HD,1920,1080,620,2.5GHz


In [3812]:
df_test = split_CPU_column(df_test, "Cpu")
df_test

Unnamed: 0,Company,TypeName,Inches,Cpu,Ram,Gpu,OpSys,Weight,Price,Memory_Type,Memory_Capacity,Memory_Unit,Resolution_Category,Width,Height,GPU_Ghz,Cpu_Ghz
479,Toshiba,Notebook,13.3,Intel Core i5 7200U,8,Intel HD Graphics,Windows 10,1.05,89084.160,SSD,256,GB,Full HD,1920,1080,620,2.5GHz
1022,HP,Notebook,13.3,Intel Core i5 7200U,8,Intel HD Graphics,Windows 10,1.11,61218.720,SSD,256,GB,Full HD,1920,1080,620,2.5GHz
298,Lenovo,Notebook,15.6,AMD A10-Series 9600P,6,AMD Radeon R5,Windows 10,2.40,26586.720,HDD,1,TB,Full HD,1920,1080,430,2.4GHz
1265,Lenovo,Notebook,15.6,Intel Core i7 6700HQ,8,Nvidia GeForce GTX,Windows 10,2.60,47898.720,HDD,1,TB,Full HD,1920,1080,960M,2.6GHz
582,HP,Notebook,13.3,Intel Core i3 7100U,4,Intel HD Graphics,Windows 10,1.49,38308.320,SSD,128,GB,Full HD,1920,1080,620,2.4GHz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
672,Lenovo,2 in 1 Convertible,13.3,Intel Core i5 7200U,8,Intel HD Graphics,Windows 10,1.30,55091.520,SSD,256,GB,Full HD,1920,1080,620,2.5GHz
506,Asus,Notebook,15.6,Intel Core i7 7500U,8,Intel HD Graphics,Windows 10,2.00,65214.720,SSD + 1TB HDD,256,GB,Full HD,1920,1080,620,2.7GHz
668,Toshiba,Notebook,15.6,Intel Core i5 7200U,8,Intel HD Graphics,Windows 10,2.00,67026.240,SSD,256,GB,Full HD,1920,1080,620,2.5GHz
778,Razer,Gaming,14,Intel Core i7 7700HQ,16,Nvidia GeForce GTX,Windows 10,1.95,154458.720,SSD,512,GB,Full HD,1920,1080,1060,2.8GHz


In [3813]:
df_train['Cpu_Ghz'] = df_train['Cpu_Ghz'].str.replace("GHz", "")
df_train['Cpu_Ghz'] = pd.to_numeric(df_train['Cpu_Ghz'], errors='coerce')

In [3814]:
df_test['Cpu_Ghz'] = df_test['Cpu_Ghz'].str.replace("GHz", "")
df_test['Cpu_Ghz'] = pd.to_numeric(df_test['Cpu_Ghz'], errors='coerce')

In [3815]:
# def split_CPU1_column(dataframe, column_name):
#     # Split the column into two new columns based on the last space
#     dataframe[['Cpu', 'Cpu_model']] = dataframe[column_name].str.rsplit(' ', n=1, expand=True)
#     return dataframe

In [3816]:
# df_test = split_CPU1_column(df_test, "Cpu")
# df_test

In [3817]:
# df_train = split_CPU1_column(df_train, "Cpu")
# df_train

In [3818]:
# from sklearn.preprocessing import MinMaxScaler
# # Scaling for columns except the target (Price) column
# numeric_columns = ['Inches', 'Ram', 'Weight', 'Memory_Capacity' , 'Cpu_Ghz']
# for col in numeric_columns:
#     df_train[col] = pd.to_numeric(df_train[col], errors='coerce')

# scaler = MinMaxScaler()
# df_train[numeric_columns] = scaler.fit_transform(df_train[numeric_columns])

In [3819]:
# # Scaling (transform only) for test df except the target (Price) column
# numeric_columns = ['Inches', 'Ram', 'Weight', 'Memory_Capacity' , 'Cpu_Ghz']
# for col in numeric_columns:
#     df_test[col] = pd.to_numeric(df_test[col], errors='coerce')

# df_test[numeric_columns] = scaler.transform(df_test[numeric_columns])

In [3820]:
le = LabelEncoder()
classes = dict()
cat = df_test.select_dtypes(exclude = np.number).columns
for i in cat:
    df_test[i] = le.fit_transform(df_test[i])
    classes[i] = le.classes_
df_test

Unnamed: 0,Company,TypeName,Inches,Cpu,Ram,Gpu,OpSys,Weight,Price,Memory_Type,Memory_Capacity,Memory_Unit,Resolution_Category,Width,Height,GPU_Ghz,Cpu_Ghz
479,15,3,5,31,8,10,4,1.05,89084.160,4,256,0,1,1920,1080,20,2.5
1022,6,3,5,31,8,10,4,1.11,61218.720,4,256,0,1,1920,1080,20,2.5
298,9,3,9,0,6,6,4,2.40,26586.720,2,1,1,1,1920,1080,9,2.4
1265,9,3,9,40,8,19,4,2.60,47898.720,2,1,1,1,1920,1080,30,2.6
582,6,3,5,24,4,10,4,1.49,38308.320,4,128,0,1,1920,1080,20,2.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
672,9,0,5,31,8,10,4,1.30,55091.520,4,256,0,1,1920,1080,20,2.5
506,2,3,9,43,8,10,4,2.00,65214.720,6,256,0,1,1920,1080,20,2.7
668,15,3,9,31,8,10,4,2.00,67026.240,4,256,0,1,1920,1080,20,2.5
778,13,1,8,46,16,19,4,1.95,154458.720,4,512,0,1,1920,1080,3,2.8


In [3821]:
category = df_train.select_dtypes(exclude = np.number).columns
le = LabelEncoder()
classes = dict()
for i in category:
    df_train[i] = le.fit_transform(df_train[i])
    classes[i] = le.classes_
df_train

Unnamed: 0,Company,TypeName,Inches,Cpu,Ram,Gpu,OpSys,Weight,Price,Memory_Type,Memory_Capacity,Memory_Unit,Resolution_Category,Width,Height,GPU_Ghz,Cpu_Ghz
1158,10,3,14,45,8,6,5,2.50,42010.7472,3,1,1,1,1920,1080,54,2.30
904,10,3,14,49,8,13,5,1.95,104588.1072,4,512,0,1,1920,1080,27,2.50
439,2,3,14,49,4,13,2,2.00,29783.5200,4,256,0,1,1920,1080,27,2.50
581,4,3,14,51,8,13,5,1.90,53733.9456,1,500,0,2,1366,768,27,2.60
676,2,4,10,49,8,13,5,1.10,60472.8000,4,256,0,1,1920,1080,27,2.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,0,3,14,49,8,23,5,2.40,42570.7200,5,256,0,1,1920,1080,38,2.50
1130,7,3,14,65,8,13,5,2.04,33513.1200,1,2,1,2,1366,768,27,2.70
1294,7,3,14,8,6,7,5,2.04,29303.4672,3,1,1,1,1920,1080,59,2.90
860,10,3,14,49,8,13,5,2.30,45323.1648,1,1,1,1,1920,1080,27,2.50


In [3822]:
df_train = df_train.dropna()
df_test = df_test.dropna()

In [3823]:
x_train = df_train.drop(columns = 'Price').values
x_train

array([[  10.  ,    3.  ,   14.  , ..., 1080.  ,   54.  ,    2.3 ],
       [  10.  ,    3.  ,   14.  , ..., 1080.  ,   27.  ,    2.5 ],
       [   2.  ,    3.  ,   14.  , ..., 1080.  ,   27.  ,    2.5 ],
       ...,
       [   7.  ,    3.  ,   14.  , ..., 1080.  ,   59.  ,    2.9 ],
       [  10.  ,    3.  ,   14.  , ..., 1080.  ,   27.  ,    2.5 ],
       [  10.  ,    0.  ,    0.  , ..., 1200.  ,   10.  ,    1.44]])

In [3824]:
y_train = df_train['Price'].values
y_train

array([ 42010.7472, 104588.1072,  29783.52  ,  53733.9456,  60472.8   ,
        30310.992 ,  33513.12  ,  61751.52  , 137995.2   ,  31381.92  ,
       130269.6   , 175770.72  ,  65480.5872,  38841.12  ,  46833.12  ,
        13053.0672,  47365.92  ,  58607.4672,  78588.    ,  13445.7408,
        53226.72  ,  63882.72  ,  70809.12  ,  67132.8   ,  15824.16  ,
        47898.72  ,  26533.44  , 135195.336 ,  73473.12  ,  53226.72  ,
        23176.8   ,  12201.12  ,  36496.8   ,  98514.72  ,  29463.84  ,
        34578.72  , 124568.64  ,  14652.    ,  37570.392 ,  25840.8   ,
        62231.04  ,  15877.44  ,  21312.    ,  38308.32  , 149130.72  ,
        15397.92  ,  93932.64  ,  25414.0272,  69210.72  ,  31808.16  ,
        17262.72  , 107257.968 ,  94305.6   ,  45234.72  ,  40972.32  ,
        77788.8   ,  78438.816 ,  14492.16  ,  23373.4032,  79866.72  ,
        12201.12  ,  71874.72  ,  74964.96  , 130001.6016,  88977.6   ,
       103896.    ,  78215.04  ,  42517.44  ,  67399.2   ,  3617

In [3825]:
df_train = df_train.dropna()
df_test = df_test.dropna()

In [3826]:
x_test = df_test.drop(columns = 'Price').values

In [3827]:
y_test = df_test['Price'].values

In [3828]:
reg = LinearRegression() 
reg.fit(x_train, y_train)

In [3829]:
y_pred = reg.predict(x_test)

In [3830]:
y_pred[:10]

array([ 64385.95248868,  61679.66410183,  35978.90909092,  62025.68205315,
        46580.58108834,  37274.31779541,  74956.9610243 ,  26160.25923647,
       130003.62546192,  38132.28688075])

In [3831]:
y_test[:10]

array([ 89084.16,  61218.72,  26586.72,  47898.72,  38308.32,  24455.52,
        63456.48,  18594.72, 101178.72,  38681.28])

In [3832]:
np.mean(np.square(y_test - y_pred))

485990692.9410919

In [3833]:
np.mean(np.absolute(y_test - y_pred))

14768.393100520996

In [3834]:
y_mean = np.mean(y_test)
TSS = np.sum(np.square(y_test - y_mean))
RSS = np.sum(np.square(y_test - y_pred))
1- RSS/ TSS

0.6850853535458628

In [3835]:

r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred), mean_absolute_error(y_test, y_pred)


(0.6850853535458628, 485990692.9410919, 14768.393100520996)

In [3836]:
from sklearn.linear_model import Ridge

ridge_reg = Ridge(alpha=1.0)
ridge_reg.fit(x_train, y_train)
y_pred_ridge = ridge_reg.predict(x_test)

In [3837]:

r2_score(y_test, y_pred_ridge), mean_squared_error(y_test, y_pred_ridge), mean_absolute_error(y_test, y_pred_ridge)


(0.6850052599112422, 486114297.10313296, 14763.153708379323)

In [3838]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(x_train)
X_test_scaled = scaler.transform(x_test)

lasso = Lasso(alpha=0.1)  # You might need to adjust alpha based on your specific dataset
lasso.fit(X_train_scaled, y_train)

In [3840]:
y_pred = lasso.predict(X_test_scaled)

In [3841]:

r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred), mean_absolute_error(y_test, y_pred)


(0.6850797086359554, 485999404.4243727, 14768.312385765372)