In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MultiLabelBinarizer

# Show every column when a wide DataFrame is displayed. Uses the public
# pd.set_option entry point rather than reaching it through pd.pandas.
pd.set_option('display.max_columns', None)

In [2]:
# Load the laptop listings into the working frame. The file is decoded as
# Latin-1 (presumably it contains non-UTF-8 characters — confirm).
df = pd.read_csv('laptop_price.csv',encoding='latin-1')

In [3]:
df.shape


(1303, 13)

In [4]:
df.head(10)

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6
5,6,Acer,Aspire 3,Notebook,15.6,1366x768,AMD A9-Series 9420 3GHz,4GB,500GB HDD,AMD Radeon R5,Windows 10,2.1kg,400.0
6,7,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.2GHz,16GB,256GB Flash Storage,Intel Iris Pro Graphics,Mac OS X,2.04kg,2139.97
7,8,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,256GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,1158.7
8,9,Asus,ZenBook UX430UN,Ultrabook,14.0,Full HD 1920x1080,Intel Core i7 8550U 1.8GHz,16GB,512GB SSD,Nvidia GeForce MX150,Windows 10,1.3kg,1495.0
9,10,Acer,Swift 3,Ultrabook,14.0,IPS Panel Full HD 1920x1080,Intel Core i5 8250U 1.6GHz,8GB,256GB SSD,Intel UHD Graphics 620,Windows 10,1.6kg,770.0


In [5]:
# Remove the laptop_ID and Product columns before feature engineering.
df = df.drop(columns=['laptop_ID', 'Product'])

In [6]:
df.sample(5)

Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
205,Lenovo,Gaming,15.6,Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,16GB,512GB SSD,Nvidia GeForce GTX 1060,No OS,2.4kg,1398.0
608,Lenovo,2 in 1 Convertible,15.6,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 7700HQ 2.8GHz,8GB,256GB SSD,Intel HD Graphics 630,Windows 10,2kg,1299.0
323,HP,Notebook,15.6,IPS Panel Full HD 1366x768,Intel Core i7 8550U 1.8GHz,8GB,1TB HDD,Intel UHD Graphics 620,Windows 10,2.1kg,902.0
679,Acer,Gaming,15.6,Full HD 1920x1080,Intel Core i5 7300HQ 2.5GHz,16GB,256GB SSD,Nvidia GeForce GTX 1050 Ti,Windows 10,2.5kg,1299.0
369,HP,Ultrabook,14.0,IPS Panel Full HD 1920x1080,Intel Core i7 7500U 2.7GHz,8GB,256GB SSD,Intel HD Graphics 620,Windows 10,1.36kg,1750.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Company           1303 non-null   object 
 1   TypeName          1303 non-null   object 
 2   Inches            1303 non-null   float64
 3   ScreenResolution  1303 non-null   object 
 4   Cpu               1303 non-null   object 
 5   Ram               1303 non-null   object 
 6   Memory            1303 non-null   object 
 7   Gpu               1303 non-null   object 
 8   OpSys             1303 non-null   object 
 9   Weight            1303 non-null   object 
 10  Price_euros       1303 non-null   float64
dtypes: float64(2), object(9)
memory usage: 112.1+ KB


In [8]:
df.describe()

Unnamed: 0,Inches,Price_euros
count,1303.0,1303.0
mean,15.017191,1123.686992
std,1.426304,699.009043
min,10.1,174.0
25%,14.0,599.0
50%,15.6,977.0
75%,15.6,1487.88
max,18.4,6099.0


In [9]:
df['Company'].value_counts()


Company
Dell         297
Lenovo       297
HP           274
Asus         158
Acer         103
MSI           54
Toshiba       48
Apple         21
Samsung        9
Mediacom       7
Razer          7
Microsoft      6
Vero           4
Xiaomi         4
Chuwi          3
Fujitsu        3
Google         3
LG             3
Huawei         2
Name: count, dtype: int64

# One-Hot Encoding the Company and TypeName Columns


In [10]:
# Brands with at most `threshold` listings are merged into a single
# 'uncommon' bucket; the column is then one-hot encoded as 0/1 integers.
threshold = 10
counts = df['Company'].value_counts()
repl = counts.loc[counts <= threshold].index
df['Company'] = df['Company'].where(~df['Company'].isin(repl), 'uncommon')
dummies = pd.get_dummies(df['Company'], dtype=int)
df = df.drop(columns='Company').join(dummies)

In [11]:
# Frequency of each laptop type (kept for inspection), followed by a 0/1
# indicator column per type in place of the original text column.
counts1 = df['TypeName'].value_counts()
df = df.join(pd.get_dummies(df['TypeName'], dtype=int)).drop(columns=['TypeName'])

In [12]:
df.sample(5)

Unnamed: 0,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros,Acer,Apple,Asus,Dell,HP,Lenovo,MSI,Toshiba,uncommon,2 in 1 Convertible,Gaming,Netbook,Notebook,Ultrabook,Workstation
190,14.0,Touchscreen 2560x1440,Intel Core i7 7500U 2.7GHz,16GB,1TB SSD,Intel HD Graphics 620,Windows 10,1.42kg,2824.0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0
436,15.6,1366x768,Intel Celeron Dual Core N3060 1.6GHz,4GB,500GB HDD,Intel HD Graphics 400,Windows 10,2.1kg,347.0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
288,15.6,IPS Panel Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,16GB,256GB SSD + 1TB HDD,Nvidia GeForce GTX 1050 Ti,Windows 10,2.5kg,1179.0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
777,14.0,IPS Panel Full HD 1920x1080,Intel Core i3 7100U 2.4GHz,4GB,128GB SSD,Intel HD Graphics 620,Windows 10,1.8kg,636.0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
403,15.6,Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,8GB,256GB SSD,Nvidia GeForce GTX 1050,Linux,2.5kg,879.0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [13]:
# Pull the "<width>x<height>" pair out of the free-text resolution string
# with a single regex. The previous version split the string twice and took
# the last four characters before the 'x' as the width, which only works
# when no other 'x' precedes the resolution and the width has at most four
# digits.
resolution = df['ScreenResolution'].str.extract(r'(\d+)x(\d+)')
df['ScreenWidth'] = resolution[0].astype(int)
df['ScreenHeight'] = resolution[1].astype(int)

# The raw text column is no longer needed once both numbers are extracted.
df = df.drop(columns=['ScreenResolution'])


In [14]:
df.head()

Unnamed: 0,Inches,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros,Acer,Apple,Asus,Dell,HP,Lenovo,MSI,Toshiba,uncommon,2 in 1 Convertible,Gaming,Netbook,Notebook,Ultrabook,Workstation,ScreenWidth,ScreenHeight
0,13.3,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,2560,1600
1,13.3,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1440,900
2,15.6,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1920,1080
3,15.4,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,2880,1800
4,13.3,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,2560,1600


# Extracting CPU Brand and Clock Frequency

In [15]:
# Tokenise the CPU description once: the first token is the brand and the
# last token is the clock speed (e.g. "2.3GHz").
cpu_tokens = df['Cpu'].str.split(" ")
df['CPU Brand'] = cpu_tokens.map(lambda tokens: tokens[0])
df['CPU Frequency'] = cpu_tokens.map(lambda tokens: tokens[-1])


In [16]:
# The raw CPU text has been split into brand and frequency; drop it.
df = df.drop(columns=['Cpu'])

In [17]:
df.sample(5)

Unnamed: 0,Inches,Ram,Memory,Gpu,OpSys,Weight,Price_euros,Acer,Apple,Asus,Dell,HP,Lenovo,MSI,Toshiba,uncommon,2 in 1 Convertible,Gaming,Netbook,Notebook,Ultrabook,Workstation,ScreenWidth,ScreenHeight,CPU Brand,CPU Frequency
1264,15.6,2GB,500GB HDD,Intel HD Graphics,Windows 10,2.20kg,379.0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1366,768,Intel,1.6GHz
1047,17.3,16GB,256GB SSD + 1TB HDD,Nvidia GeForce GTX 980M,Windows 10,3.78kg,1545.64,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1920,1080,Intel,2.7GHz
581,15.6,8GB,500GB HDD,Intel HD Graphics 620,Windows 10,1.9kg,1008.52,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1366,768,Intel,2.6GHz
107,14.0,8GB,256GB SSD,Nvidia GeForce 940MX,Windows 10,1.3kg,1193.0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1920,1080,Intel,2.7GHz
555,15.6,4GB,500GB HDD,Intel HD Graphics 500,Linux,2kg,224.0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1366,768,Intel,1.1GHz


In [18]:
# Cut the trailing three characters (the "GHz" unit) off each frequency string.
df['CPU Frequency'] = df['CPU Frequency'].str.slice(stop=-3)

In [19]:
# "8GB" -> 8: remove the literal unit and store RAM as an integer.
ram_digits = df['Ram'].str.replace('GB', '', regex=False)
df['Ram'] = ram_digits.astype(int)
df.sample(5)

Unnamed: 0,Inches,Ram,Memory,Gpu,OpSys,Weight,Price_euros,Acer,Apple,Asus,Dell,HP,Lenovo,MSI,Toshiba,uncommon,2 in 1 Convertible,Gaming,Netbook,Notebook,Ultrabook,Workstation,ScreenWidth,ScreenHeight,CPU Brand,CPU Frequency
483,15.6,4,64GB Flash Storage,Intel HD Graphics,Windows 10,1.89kg,248.9,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1920,1080,Intel,1.44
188,13.3,8,256GB SSD,Intel HD Graphics 615,Windows 10,1.12kg,989.0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1920,1080,Intel,1.2
1242,15.6,4,500GB HDD,Intel HD Graphics 520,Linux,2.4kg,361.8,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1366,768,Intel,2.0
536,15.6,8,256GB SSD,AMD Radeon 530,Windows 10,2.2kg,797.41,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1920,1080,Intel,1.6
1247,15.6,16,256GB SSD + 1TB HDD,Nvidia GeForce GTX 1070,Windows 10,2.34kg,2325.0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1920,1080,Intel,2.6


In [20]:
# The frequency strings now hold only digits and a decimal point; cast to float.
df = df.astype({'CPU Frequency': float})
df.sample(5)

Unnamed: 0,Inches,Ram,Memory,Gpu,OpSys,Weight,Price_euros,Acer,Apple,Asus,Dell,HP,Lenovo,MSI,Toshiba,uncommon,2 in 1 Convertible,Gaming,Netbook,Notebook,Ultrabook,Workstation,ScreenWidth,ScreenHeight,CPU Brand,CPU Frequency
955,17.3,16,512GB SSD + 1TB HDD,Nvidia GeForce GTX 1070,Windows 10,4.36kg,3154.0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,3840,2160,Intel,2.8
348,11.6,4,32GB Flash Storage,Intel HD Graphics 500,Windows 10,1.5kg,375.0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1366,768,Intel,1.1
1014,13.3,4,500GB HDD,Intel HD Graphics 620,Windows 10,1.49kg,800.0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1366,768,Intel,2.4
135,14.0,8,256GB SSD,Intel UHD Graphics 620,Windows 10,1.63kg,988.0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1920,1080,Intel,1.8
435,17.3,8,256GB SSD + 1TB HDD,AMD Radeon RX 580,Windows 10,3.2kg,1695.0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1920,1080,AMD,3.2


In [21]:
# Tokenise the storage description once. Token 0 is the capacity ("256GB")
# and token 1 the drive type ("SSD", "HDD", "Flash", "Hybrid").
# NOTE: for multi-drive entries such as "256GB SSD + 1TB HDD" only the first
# drive is captured; the second one is ignored.
memory_tokens = df['Memory'].str.split(" ")
df['Memory Amount'] = memory_tokens.map(lambda tokens: tokens[0])
df['Memory Type'] = memory_tokens.map(lambda tokens: tokens[1])
df['Memory Type'].value_counts()



Memory Type
SSD       843
HDD       375
Flash      75
Hybrid     10
Name: count, dtype: int64

In [22]:
def turn_memory_into_MB(value):
    """Convert a capacity string such as ``"256GB"`` or ``"1TB"`` to megabytes.

    Decimal units are used: 1 GB = 1,000 MB and 1 TB = 1,000,000 MB.

    Parameters
    ----------
    value : str
        Capacity text; the number must precede the unit. Surrounding
        whitespace and letter case are ignored.

    Returns
    -------
    float
        Capacity in megabytes, or NaN when neither "GB" nor "TB" is present
        (previously the function fell through and returned None implicitly).

    Raises
    ------
    ValueError
        If the text before the unit is not a valid number.
    """
    text = str(value).strip().upper()
    # "GB" is checked before "TB", matching the original precedence.
    for unit, factor in (("GB", 1000), ("TB", 1000000)):
        position = text.find(unit)
        if position != -1:
            return float(text[:position]) * factor
    return float("nan")

In [23]:
# Convert every capacity string to a number of megabytes.
df["Memory Amount"] = df["Memory Amount"].map(turn_memory_into_MB)

In [24]:
# The raw storage text has been split into amount and type; drop it.
df = df.drop(columns=['Memory'])

# Fixing Weight Column

In [25]:
# "1.37kg" -> 1.37. Values that are empty once the unit is removed become
# NaN so the final float cast does not fail on them.
df['Weight'] = (
    df['Weight']
    .astype(str)
    .str.replace('kg', '', regex=False)
    .replace('', np.nan)
    .astype(float)
)
df.sample(5)

Unnamed: 0,Inches,Ram,Gpu,OpSys,Weight,Price_euros,Acer,Apple,Asus,Dell,HP,Lenovo,MSI,Toshiba,uncommon,2 in 1 Convertible,Gaming,Netbook,Notebook,Ultrabook,Workstation,ScreenWidth,ScreenHeight,CPU Brand,CPU Frequency,Memory Amount,Memory Type
482,13.3,4,Intel UHD Graphics 620,Windows 10,1.49,726.0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1920,1080,Intel,1.6,128000.0,SSD
1129,17.3,8,Nvidia GeForce GTX 1060,Windows 10,3.35,1129.0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1920,1080,Intel,2.3,128000.0,SSD
1075,17.3,8,AMD Radeon R5 M330,Windows 10,3.0,659.0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1600,900,Intel,2.3,1000000.0,HDD
350,15.6,16,Nvidia GeForce GTX 1050,Windows 10,2.06,2027.42,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,3840,2160,Intel,2.8,512000.0,SSD
706,13.3,8,Intel UHD Graphics 620,Windows 10,1.6,999.0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1920,1080,Intel,1.6,256000.0,SSD


# One Hot Encoding OpSys

In [26]:
# Swap the operating-system text column for one 0/1 indicator per OS.
df = df.join(pd.get_dummies(df['OpSys'], dtype=int)).drop(columns=['OpSys'])

# Fixing GPU Column

In [27]:
# Keep only the first word of the GPU description (the brand), then drop
# the raw text column.
df['GPU Brand'] = df['Gpu'].str.split(" ").map(lambda tokens: tokens[0])
df = df.drop(columns=['Gpu'])

In [28]:
df.sample(5)

Unnamed: 0,Inches,Ram,Weight,Price_euros,Acer,Apple,Asus,Dell,HP,Lenovo,MSI,Toshiba,uncommon,2 in 1 Convertible,Gaming,Netbook,Notebook,Ultrabook,Workstation,ScreenWidth,ScreenHeight,CPU Brand,CPU Frequency,Memory Amount,Memory Type,Android,Chrome OS,Linux,Mac OS X,No OS,Windows 10,Windows 10 S,Windows 7,macOS,GPU Brand
731,15.6,12,2.25,649.0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1366,768,Intel,2.5,1000000.0,HDD,0,0,0,0,0,1,0,0,0,Intel
880,13.3,4,1.28,1700.0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1920,1080,Intel,2.5,256000.0,SSD,0,0,0,0,0,1,0,0,0,Intel
405,14.0,8,1.1,1873.0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1920,1080,Intel,2.7,512000.0,SSD,0,0,0,0,0,1,0,0,0,Intel
303,15.6,8,2.2,549.0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1366,768,Intel,2.5,2000000.0,HDD,0,0,0,0,1,0,0,0,0,Nvidia
632,15.6,4,1.8,829.0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1920,1080,Intel,1.6,256000.0,SSD,0,0,0,0,0,1,0,0,0,Intel


In [29]:
# One 0/1 column per CPU brand. The "_CPU" suffix keeps these columns from
# colliding with the GPU brand indicators, which share brand names.
cpu_categories = pd.get_dummies(df["CPU Brand"], dtype=int).add_suffix("_CPU")
df = df.join(cpu_categories).drop(columns="CPU Brand")

In [30]:
# One 0/1 column per GPU brand, suffixed "_GPU" to distinguish them from the
# CPU brand indicators.
gpu_categories = pd.get_dummies(df["GPU Brand"], dtype=int).add_suffix("_GPU")
df = df.join(gpu_categories).drop(columns="GPU Brand")
df.sample(5)


Unnamed: 0,Inches,Ram,Weight,Price_euros,Acer,Apple,Asus,Dell,HP,Lenovo,MSI,Toshiba,uncommon,2 in 1 Convertible,Gaming,Netbook,Notebook,Ultrabook,Workstation,ScreenWidth,ScreenHeight,CPU Frequency,Memory Amount,Memory Type,Android,Chrome OS,Linux,Mac OS X,No OS,Windows 10,Windows 10 S,Windows 7,macOS,AMD_CPU,Intel_CPU,Samsung_CPU,AMD_GPU,ARM_GPU,Intel_GPU,Nvidia_GPU
638,13.3,8,1.37,1757.42,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1920,1080,2.7,256000.0,SSD,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0
90,15.6,8,2.45,699.0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1920,1080,3.0,1000000.0,HDD,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0
463,14.0,8,1.5,799.0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1920,1080,2.7,512000.0,SSD,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0
118,17.3,4,2.0,564.0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1920,1080,2.0,1000000.0,HDD,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1
854,15.6,8,2.3,787.0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1920,1080,2.7,256000.0,SSD,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1


In [31]:
# Append one 0/1 indicator column per storage type.
df = df.join(pd.get_dummies(df['Memory Type'], dtype=int))

In [32]:
# The storage type is now represented by its indicator columns; drop the text.
df = df.drop(columns=['Memory Type'])

In [33]:
df.sample(5)


Unnamed: 0,Inches,Ram,Weight,Price_euros,Acer,Apple,Asus,Dell,HP,Lenovo,MSI,Toshiba,uncommon,2 in 1 Convertible,Gaming,Netbook,Notebook,Ultrabook,Workstation,ScreenWidth,ScreenHeight,CPU Frequency,Memory Amount,Android,Chrome OS,Linux,Mac OS X,No OS,Windows 10,Windows 10 S,Windows 7,macOS,AMD_CPU,Intel_CPU,Samsung_CPU,AMD_GPU,ARM_GPU,Intel_GPU,Nvidia_GPU,Flash,HDD,Hybrid,SSD
1288,15.6,4,2.2,369.0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1366,768,1.6,500000.0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0
783,15.6,8,2.2,329.0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1366,768,1.6,1000000.0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0
30,15.6,4,1.89,244.99,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1920,1080,1.44,64000.0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0
982,15.6,6,2.2,549.0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1920,1080,3.6,256000.0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1
1206,15.6,8,1.91,579.0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1366,768,2.5,256000.0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1


# Feature Selection

In [34]:
# Build a ydata-profiling report for the engineered frame and write it to
# "Output.html" (path is relative to the current working directory).
from ydata_profiling import ProfileReport
PR = ProfileReport(df)
PR.to_file("Output.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|                                                                                           | 0/43 [00:00<?, ?it/s][A
 16%|█████████████▌                                                                     | 7/43 [00:00<00:00, 40.79it/s][A
100%|██████████████████████████████████████████████████████████████████████████████████| 43/43 [00:00<00:00, 51.21it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [35]:
import matplotlib.pyplot as plt
import seaborn as sns

# Restrict the frame to numeric columns before computing correlations.
numeric_df = df.select_dtypes(include=['number'])

# Annotated heatmap of the pairwise correlations on a large canvas.
fig, ax = plt.subplots(figsize=(18, 15))
sns.heatmap(numeric_df.corr(), annot=True, cmap="YlGnBu", ax=ax)
plt.show()


  plt.show()


In [36]:
# Absolute correlation of every numeric column with the price, weakest first.
target_correlations = numeric_df.corr()['Price_euros'].abs().sort_values()

In [37]:
target_correlations

Asus                  0.010376
Mac OS X              0.015656
Windows 10 S          0.018312
Samsung_CPU           0.018431
ARM_GPU               0.018431
Lenovo                0.029007
Android               0.038700
Hybrid                0.039386
Toshiba               0.040339
HP                    0.041291
Dell                  0.048509
Inches                0.068197
2 in 1 Convertible    0.072675
uncommon              0.075249
Apple                 0.080688
macOS                 0.089928
Netbook               0.097572
Chrome OS             0.118684
Memory Amount         0.122365
Windows 10            0.137048
Windows 7             0.152381
Linux                 0.162060
No OS                 0.177094
MSI                   0.180100
AMD_CPU               0.180111
Intel_CPU             0.181127
Intel_GPU             0.184205
AMD_GPU               0.199415
Acer                  0.208349
Weight                0.210370
Flash                 0.210823
Workstation           0.249752
Ultraboo

In [38]:
# Positional slice over the sorted correlations. Starting at 0 keeps every
# column, including the target itself; raise the start index to discard the
# weakest predictors.
selected_features = target_correlations.iloc[0:].index

In [39]:
# Materialise the selected labels as a plain Python list.
selected_features = [name for name in selected_features]

In [40]:
selected_features

['Asus',
 'Mac OS X',
 'Windows 10 S',
 'Samsung_CPU',
 'ARM_GPU',
 'Lenovo',
 'Android',
 'Hybrid',
 'Toshiba',
 'HP',
 'Dell',
 'Inches',
 '2 in 1 Convertible',
 'uncommon',
 'Apple',
 'macOS',
 'Netbook',
 'Chrome OS',
 'Memory Amount',
 'Windows 10',
 'Windows 7',
 'Linux',
 'No OS',
 'MSI',
 'AMD_CPU',
 'Intel_CPU',
 'Intel_GPU',
 'AMD_GPU',
 'Acer',
 'Weight',
 'Flash',
 'Workstation',
 'Ultrabook',
 'Nvidia_GPU',
 'Gaming',
 'HDD',
 'CPU Frequency',
 'SSD',
 'Notebook',
 'ScreenHeight',
 'ScreenWidth',
 'Ram',
 'Price_euros']

In [41]:
# Keep only the selected columns (all rows) for modelling.
limited_df = df.loc[:, selected_features]

In [42]:
# Correlation heatmap restricted to the selected columns.
fig, ax = plt.subplots(figsize=(18, 15))
sns.heatmap(limited_df.corr(), annot=True, cmap="YlGnBu", ax=ax)

<Axes: >

# Model Training

In [43]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Predictors are every selected column except the price, which is the target.
X, y = limited_df.drop("Price_euros", axis=1), limited_df["Price_euros"]

# Hold out 30% of the rows for testing. A fixed random_state makes the split,
# and every score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit the scaler on the training rows only and reuse its statistics for the
# test rows, so no information from the test set leaks into preprocessing.
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [44]:
# Seed the forest so bootstrap sampling and feature sub-sampling, and hence
# the reported scores, are repeatable from run to run.
forest = RandomForestRegressor(random_state=42)

forest.fit(X_train_scaled, y_train)

In [45]:
forest.score(X_test_scaled, y_test)

0.7812349913742012

In [46]:
# Random-forest price predictions for the held-out (scaled) test rows.
y_pred = forest.predict(X_test_scaled)

In [47]:
# Predicted vs. actual price; points on the red diagonal are perfect predictions.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(y_pred, y_test)

# Span the reference diagonal over the plotted values instead of a
# hard-coded 0-6000 range (the dataset's maximum price is 6099).
upper = float(max(np.max(y_pred), np.max(y_test)))
ax.plot([0, upper], [0, upper], c="red")

ax.set_xlabel("Predicted price (EUR)")
ax.set_ylabel("Actual price (EUR)")
ax.set_title("Random forest: predicted vs. actual price")
plt.show()

[<matplotlib.lines.Line2D at 0x1f486390340>]

In [48]:
# 5-fold cross-validation of the forest on the full, unscaled feature matrix,
# using the estimator's default score.
scores = cross_val_score(estimator=forest, X=X, y=y, cv=5)
print("Cross-validation scores:", scores)
print("Average score:", scores.mean())

Cross-validation scores: [0.83467807 0.82186115 0.81288786 0.76321465 0.76801556]
Average score: 0.8001314557580061


# Comparing With XGBoost

In [49]:

from xgboost import XGBRegressor

# Hyper-parameters collected in one place so they are easy to tune.
xgb_params = dict(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=5,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
    random_state=42,
)
xgb_model = XGBRegressor(**xgb_params)

# Train on the unscaled training split and predict the held-out rows.
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)


In [1]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error
import numpy as np


def rmse(y_true, y_pred):
    """Return the root-mean-squared error between true and predicted values."""
    return np.sqrt(mean_squared_error(y_true, y_pred))

# greater_is_better=False makes the scorer report negated RMSE, which is why
# the cross_val_score results below are negated back to positive values.
rmse_scorer = make_scorer(rmse, greater_is_better=False)


# Pass the estimator instance itself. `xgb_model` is an already-constructed
# regressor, so calling it as `xgb_model()` raises a TypeError.
# This cell needs xgb_model, forest, X_train and y_train from the cells
# above; run the notebook top to bottom after a kernel restart.
xgb_rmse_scores = -cross_val_score(xgb_model, X_train, y_train, cv=5, scoring=rmse_scorer)
xgb_r2_scores = cross_val_score(xgb_model, X_train, y_train, cv=5, scoring='r2')

print("🔷 XGBoost CV RMSE Scores:", xgb_rmse_scores)
print("🔷 XGBoost Average CV RMSE:", xgb_rmse_scores.mean())
print("🔷 XGBoost Average CV R²:", xgb_r2_scores.mean())


# Same folds and metrics for the random forest, for a like-for-like comparison.
rf_rmse_scores = -cross_val_score(forest, X_train, y_train, cv=5, scoring=rmse_scorer)
rf_r2_scores = cross_val_score(forest, X_train, y_train, cv=5, scoring='r2')

print("🟢 Random Forest CV RMSE Scores:", rf_rmse_scores)
print("🟢 Random Forest Average CV RMSE:", rf_rmse_scores.mean())
print("🟢 Random Forest Average CV R²:", rf_r2_scores.mean())


NameError: name 'xgb_model' is not defined

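# Conclusion: XGBoost Is Expected to Be More Accurate — confirm with the CV RMSE and R² above (the saved run of that cell ended in an error, so no metrics were produced)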