In [183]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, minmax_scale
import numpy as np

In [184]:
df = pd.read_csv('laptopData.csv')

In [217]:
print(df.isnull().sum())


Company                 0
TypeName                0
Inches                  0
OpSys                   0
Price                   0
ScreenType              0
Resolution              0
CpuCompanyGeneration    0
CpuHz                   0
GpuBrandModel           0
GpuSize                 0
MemorySize              1
MemoryUnit              1
MemoryType              0
WeightValue             1
WeightUnit              1
RamSize                 0
RamUnit                 0
dtype: int64


In [218]:
for col in df.select_dtypes(include=['float64', 'int64']).columns:
    df[col].fillna(df[col].mean(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


In [186]:
df.drop('Unnamed: 0', axis=1, inplace=True)
df.head()

Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0
3,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.336
4,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.808


In [221]:
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].fillna(df[col].mode()[0])

In [220]:
print(df.isnull().sum())


Company                 0
TypeName                0
Inches                  0
OpSys                   0
Price                   0
ScreenType              0
Resolution              0
CpuCompanyGeneration    0
CpuHz                   0
GpuBrandModel           0
GpuSize                 0
MemorySize              0
MemoryUnit              0
MemoryType              0
WeightValue             0
WeightUnit              0
RamSize                 0
RamUnit                 0
dtype: int64


In [187]:
def extract_screen_resolution(res):
    parts = res.split()
    screen_type = ' '.join(parts[:-1])  # Assumes the format is like "IPS Panel 1920x1080"
    resolution = parts[-1]
    return screen_type, resolution

In [188]:
screen_resolution_df = df['ScreenResolution'].apply(lambda x: pd.Series(extract_screen_resolution(x)))
screen_resolution_df.columns = ['ScreenType', 'Resolution']

In [189]:
df = pd.concat([df, screen_resolution_df], axis=1)


In [190]:
df.drop('ScreenResolution', axis=1, inplace=True)


In [191]:
df

Unnamed: 0,Company,TypeName,Inches,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price,ScreenType,Resolution
0,Apple,Ultrabook,13.3,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832,IPS Panel Retina Display,2560x1600
1,Apple,Ultrabook,13.3,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232,,1440x900
2,HP,Notebook,15.6,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0000,Full HD,1920x1080
3,Apple,Ultrabook,15.4,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.3360,IPS Panel Retina Display,2880x1800
4,Apple,Ultrabook,13.3,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.8080,IPS Panel Retina Display,2560x1600
...,...,...,...,...,...,...,...,...,...,...,...,...
1298,Lenovo,2 in 1 Convertible,14,Intel Core i7 6500U 2.5GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows 10,1.8kg,33992.6400,IPS Panel Full HD / Touchscreen,1920x1080
1299,Lenovo,2 in 1 Convertible,13.3,Intel Core i7 6500U 2.5GHz,16GB,512GB SSD,Intel HD Graphics 520,Windows 10,1.3kg,79866.7200,IPS Panel Quad HD+ / Touchscreen,3200x1800
1300,Lenovo,Notebook,14,Intel Celeron Dual Core N3050 1.6GHz,2GB,64GB Flash Storage,Intel HD Graphics,Windows 10,1.5kg,12201.1200,,1366x768
1301,HP,Notebook,15.6,Intel Core i7 6500U 2.5GHz,6GB,1TB HDD,AMD Radeon R5 M330,Windows 10,2.19kg,40705.9200,,1366x768


In [192]:
def extract_cpu(cpu):
    parts = cpu.split()
    company_generation = ' '.join(parts[:3])  # Assumes the format is like "Intel Core i5 ..."
    hz = parts[-1].replace('GHz', '').strip()
    return company_generation, float(hz)

In [193]:
cpu_df = df['Cpu'].apply(lambda x: pd.Series(extract_cpu(x)))
cpu_df.columns = ['CpuCompanyGeneration', 'CpuHz']

In [194]:
df = pd.concat([df, cpu_df], axis=1)


In [195]:
df.drop('Cpu', axis=1, inplace=True)


In [196]:
df.head()

Unnamed: 0,Company,TypeName,Inches,Ram,Memory,Gpu,OpSys,Weight,Price,ScreenType,Resolution,CpuCompanyGeneration,CpuHz
0,Apple,Ultrabook,13.3,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832,IPS Panel Retina Display,2560x1600,Intel Core i5,2.3
1,Apple,Ultrabook,13.3,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232,,1440x900,Intel Core i5,1.8
2,HP,Notebook,15.6,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0,Full HD,1920x1080,Intel Core i5,2.5
3,Apple,Ultrabook,15.4,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.336,IPS Panel Retina Display,2880x1800,Intel Core i7,2.7
4,Apple,Ultrabook,13.3,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.808,IPS Panel Retina Display,2560x1600,Intel Core i5,3.1


In [197]:
def extract_gpu(gpu):
    parts = gpu.split()
    gpu_brand_model = ' '.join(parts[:-1])  # Assumes the format is like "Intel Iris Plus Graphics 640"
    gpu_size = parts[-1]
    return gpu_brand_model, gpu_size

In [198]:
gpu_df = df['Gpu'].apply(lambda x: pd.Series(extract_gpu(x)))
gpu_df.columns = ['GpuBrandModel', 'GpuSize']

In [199]:
df = pd.concat([df, gpu_df], axis=1)


In [200]:
df.drop('Gpu', axis=1, inplace=True)


In [201]:
df.head()

Unnamed: 0,Company,TypeName,Inches,Ram,Memory,OpSys,Weight,Price,ScreenType,Resolution,CpuCompanyGeneration,CpuHz,GpuBrandModel,GpuSize
0,Apple,Ultrabook,13.3,8GB,128GB SSD,macOS,1.37kg,71378.6832,IPS Panel Retina Display,2560x1600,Intel Core i5,2.3,Intel Iris Plus Graphics,640
1,Apple,Ultrabook,13.3,8GB,128GB Flash Storage,macOS,1.34kg,47895.5232,,1440x900,Intel Core i5,1.8,Intel HD Graphics,6000
2,HP,Notebook,15.6,8GB,256GB SSD,No OS,1.86kg,30636.0,Full HD,1920x1080,Intel Core i5,2.5,Intel HD Graphics,620
3,Apple,Ultrabook,15.4,16GB,512GB SSD,macOS,1.83kg,135195.336,IPS Panel Retina Display,2880x1800,Intel Core i7,2.7,AMD Radeon Pro,455
4,Apple,Ultrabook,13.3,8GB,256GB SSD,macOS,1.37kg,96095.808,IPS Panel Retina Display,2560x1600,Intel Core i5,3.1,Intel Iris Plus Graphics,650


In [202]:
import re

def split_memory(mem):
    size = None
    unit = None
    mem_type = None
    
    # Find the numeric part and convert to float
    size_match = re.search(r'\d+(\.\d+)?', mem)
    if size_match:
        size = float(size_match.group())

    # Find the unit (GB or TB)
    if 'TB' in mem:
        unit = 'TB'
    elif 'GB' in mem:
        unit = 'GB'
    
    # Find the memory type (SSD, HDD, Hybrid)
    if 'SSD' in mem:
        mem_type = 'SSD'
    elif 'HDD' in mem:
        mem_type = 'HDD'
    elif 'Hybrid' in mem:
        mem_type = 'Hybrid'
    else:
        mem_type = 'Unknown'

    return size, unit, mem_type

In [203]:
memory_df = df['Memory'].apply(lambda x: pd.Series(split_memory(x)))
memory_df.columns = ['MemorySize', 'MemoryUnit', 'MemoryType']

In [204]:
df = pd.concat([df, memory_df], axis=1)


In [205]:
df.drop('Memory', axis=1, inplace=True)


In [206]:
df.head()

Unnamed: 0,Company,TypeName,Inches,Ram,OpSys,Weight,Price,ScreenType,Resolution,CpuCompanyGeneration,CpuHz,GpuBrandModel,GpuSize,MemorySize,MemoryUnit,MemoryType
0,Apple,Ultrabook,13.3,8GB,macOS,1.37kg,71378.6832,IPS Panel Retina Display,2560x1600,Intel Core i5,2.3,Intel Iris Plus Graphics,640,128.0,GB,SSD
1,Apple,Ultrabook,13.3,8GB,macOS,1.34kg,47895.5232,,1440x900,Intel Core i5,1.8,Intel HD Graphics,6000,128.0,GB,Unknown
2,HP,Notebook,15.6,8GB,No OS,1.86kg,30636.0,Full HD,1920x1080,Intel Core i5,2.5,Intel HD Graphics,620,256.0,GB,SSD
3,Apple,Ultrabook,15.4,16GB,macOS,1.83kg,135195.336,IPS Panel Retina Display,2880x1800,Intel Core i7,2.7,AMD Radeon Pro,455,512.0,GB,SSD
4,Apple,Ultrabook,13.3,8GB,macOS,1.37kg,96095.808,IPS Panel Retina Display,2560x1600,Intel Core i5,3.1,Intel Iris Plus Graphics,650,256.0,GB,SSD


In [207]:
def split_weight(weight):
    size_match = re.search(r'\d+(\.\d+)?', weight)
    size = float(size_match.group()) if size_match else None
    unit = weight.replace(size_match.group(), '').strip() if size_match else None
    return size, unit


In [208]:
weight_df = df['Weight'].apply(lambda x: pd.Series(split_weight(x)))
weight_df.columns = ['WeightValue', 'WeightUnit']

In [209]:
df = pd.concat([df, weight_df], axis=1)


In [210]:
df.drop('Weight', axis=1, inplace=True)


In [211]:
df.head()

Unnamed: 0,Company,TypeName,Inches,Ram,OpSys,Price,ScreenType,Resolution,CpuCompanyGeneration,CpuHz,GpuBrandModel,GpuSize,MemorySize,MemoryUnit,MemoryType,WeightValue,WeightUnit
0,Apple,Ultrabook,13.3,8GB,macOS,71378.6832,IPS Panel Retina Display,2560x1600,Intel Core i5,2.3,Intel Iris Plus Graphics,640,128.0,GB,SSD,1.37,kg
1,Apple,Ultrabook,13.3,8GB,macOS,47895.5232,,1440x900,Intel Core i5,1.8,Intel HD Graphics,6000,128.0,GB,Unknown,1.34,kg
2,HP,Notebook,15.6,8GB,No OS,30636.0,Full HD,1920x1080,Intel Core i5,2.5,Intel HD Graphics,620,256.0,GB,SSD,1.86,kg
3,Apple,Ultrabook,15.4,16GB,macOS,135195.336,IPS Panel Retina Display,2880x1800,Intel Core i7,2.7,AMD Radeon Pro,455,512.0,GB,SSD,1.83,kg
4,Apple,Ultrabook,13.3,8GB,macOS,96095.808,IPS Panel Retina Display,2560x1600,Intel Core i5,3.1,Intel Iris Plus Graphics,650,256.0,GB,SSD,1.37,kg


In [212]:
def split_ram(ram):
    size_match = re.search(r'\d+(\.\d+)?', ram)
    size = float(size_match.group()) if size_match else None
    unit = ram.replace(size_match.group(), '').strip() if size_match else None
    return size, unit

In [213]:
ram_df = df['Ram'].apply(lambda x: pd.Series(split_ram(x)))
ram_df.columns = ['RamSize', 'RamUnit']

In [214]:
df = pd.concat([df, ram_df], axis=1)


In [215]:
df.drop('Ram', axis=1, inplace=True)


In [216]:
df.head()

Unnamed: 0,Company,TypeName,Inches,OpSys,Price,ScreenType,Resolution,CpuCompanyGeneration,CpuHz,GpuBrandModel,GpuSize,MemorySize,MemoryUnit,MemoryType,WeightValue,WeightUnit,RamSize,RamUnit
0,Apple,Ultrabook,13.3,macOS,71378.6832,IPS Panel Retina Display,2560x1600,Intel Core i5,2.3,Intel Iris Plus Graphics,640,128.0,GB,SSD,1.37,kg,8.0,GB
1,Apple,Ultrabook,13.3,macOS,47895.5232,,1440x900,Intel Core i5,1.8,Intel HD Graphics,6000,128.0,GB,Unknown,1.34,kg,8.0,GB
2,HP,Notebook,15.6,No OS,30636.0,Full HD,1920x1080,Intel Core i5,2.5,Intel HD Graphics,620,256.0,GB,SSD,1.86,kg,8.0,GB
3,Apple,Ultrabook,15.4,macOS,135195.336,IPS Panel Retina Display,2880x1800,Intel Core i7,2.7,AMD Radeon Pro,455,512.0,GB,SSD,1.83,kg,16.0,GB
4,Apple,Ultrabook,13.3,macOS,96095.808,IPS Panel Retina Display,2560x1600,Intel Core i5,3.1,Intel Iris Plus Graphics,650,256.0,GB,SSD,1.37,kg,8.0,GB


In [222]:
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns


In [223]:
df[numeric_cols] = minmax_scale(df[numeric_cols])


In [224]:
label_encoder = LabelEncoder()


In [225]:
for col in df.select_dtypes(include=['object']).columns:
    df[col] = label_encoder.fit_transform(df[col])

In [227]:
df.head()


Unnamed: 0,Company,TypeName,Inches,OpSys,Price,ScreenType,Resolution,CpuCompanyGeneration,CpuHz,GpuBrandModel,GpuSize,MemorySize,MemoryUnit,MemoryType,WeightValue,WeightUnit,RamSize,RamUnit
0,1,4,7,8,0.196741,12,10,33,0.518519,16,29,0.248532,0,2,0.123408,0,0.111111,0
1,1,4,7,8,0.122353,0,1,33,0.333333,14,25,0.248532,0,3,0.120705,0,0.111111,0
2,7,3,14,4,0.067679,3,3,33,0.592593,14,27,0.499022,0,2,0.167553,0,0.111111,0
3,1,4,13,8,0.398895,12,12,34,0.666667,4,12,1.0,0,2,0.16485,0,0.238095,0
4,1,4,7,8,0.275038,12,10,33,0.814815,16,30,0.499022,0,2,0.123408,0,0.111111,0
