In [24]:
import numpy as np
import matplotlib.pyplot as plt

import pandas as pd
import seaborn as sns   
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# The above codes is for me to import libraries that I would be needing for data analysis, preprocessing, visualization, ML modeling, and evaluation

In [25]:
df = pd.read_csv('laptop_Price.csv', encoding='latin1')

# This helps me load the dataframe, df

In [26]:
# # To convert categorical features to numerical representations;

# #  Label Encoding for ordinal categorical features;
label_encoder = LabelEncoder()
ordinal_features = ['Ram']  # Add other ordinal features if there is any

for feature in ordinal_features:
     if feature in df.columns:
         df[feature] = label_encoder.fit_transform(df[feature])
         print(f"Label encoded {feature}: {dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))}")


df.head()

# This helps me get the mapping dictionary for Ram, and also
# To dislay the first 5 rows of the dataset with encoded RAM.

Label encoded Ram: {'12GB': np.int64(0), '16GB': np.int64(1), '24GB': np.int64(2), '2GB': np.int64(3), '32GB': np.int64(4), '4GB': np.int64(5), '64GB': np.int64(6), '6GB': np.int64(7), '8GB': np.int64(8)}


Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,1,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6


In [27]:
#To convert all columns names to lower case for easy cleaning and future processing;
df.columns = df.columns.str.lower()
df.columns

#  This converts all column names to lowercase. 

Index(['laptop_id', 'company', 'product', 'typename', 'inches',
       'screenresolution', 'cpu', 'ram', 'memory', 'gpu', 'opsys', 'weight',
       'price_euros'],
      dtype='object')

In [28]:
# 1. CPU-Related Features;
df['cpu_brand'] = df['cpu'].str.extract(r'(Intel|AMD)')
df['cpu_speed_ghz'] = df['cpu'].str.extract(r'(\d+\.\d+)GHz').astype(float)
df['cpu_generation'] = df['cpu'].str.extract(r'(\d+)th').fillna(df['cpu'].str.extract(r'(\d+)\w+U'))
df['cpu_type'] = df['cpu'].str.extract(r'(i3|i5|i7|i9|Ryzen|Celeron|Pentium|Xeon)')

df.head()

# This extracts CPU-related features, giving us the first five rows with new CPU columns 

Unnamed: 0,laptop_id,company,product,typename,inches,screenresolution,cpu,ram,memory,gpu,opsys,weight,price_euros,cpu_brand,cpu_speed_ghz,cpu_generation,cpu_type
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69,Intel,2.3,,i5
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94,Intel,1.8,,i5
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0,Intel,2.5,720.0,i5
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,1,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45,Intel,2.7,,i7
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6,Intel,3.1,,i5


In [29]:
#  GPU-Related Features;
df['gpu_brand'] = df['gpu'].str.extract(r'(Intel|AMD|NVIDIA)')
df['gpu_type'] = df['gpu'].str.extract(r'(GTX|RTX|Radeon|Iris|HD|UHD|MX)')
df['gpu_is_dedicated'] = df['gpu'].str.contains('GTX|RTX|Radeon|GeForce').astype(int)

df.head()

# This extracts GPU-related features, and gives the first fuve rows with new GPU columns

Unnamed: 0,laptop_id,company,product,typename,inches,screenresolution,cpu,ram,memory,gpu,opsys,weight,price_euros,cpu_brand,cpu_speed_ghz,cpu_generation,cpu_type,gpu_brand,gpu_type,gpu_is_dedicated
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69,Intel,2.3,,i5,Intel,Iris,0
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94,Intel,1.8,,i5,Intel,HD,0
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0,Intel,2.5,720.0,i5,Intel,HD,0
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,1,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45,Intel,2.7,,i7,AMD,Radeon,1
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6,Intel,3.1,,i5,Intel,Iris,0


In [30]:
#  Screen Resolution Features;
df['resolution_width'] = df['screenresolution'].str.extract(r'(\d+)x').astype(float)
df['resolution_height'] = df['screenresolution'].str.extract(r'x(\d+)').astype(float)
df['resolution_pixels'] = df['resolution_width'] * df['resolution_height']
df['screen_type'] = df['screenresolution'].str.extract(r'(IPS|Retina|Touchscreen|Full HD|4K)')
df['is_touchscreen'] = df['screenresolution'].str.contains('Touchscreen').astype(int)

df.head()

# It extracts screen features, and displays the first 5 rows with new screen column

Unnamed: 0,laptop_id,company,product,typename,inches,screenresolution,cpu,ram,memory,gpu,...,cpu_generation,cpu_type,gpu_brand,gpu_type,gpu_is_dedicated,resolution_width,resolution_height,resolution_pixels,screen_type,is_touchscreen
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,...,,i5,Intel,Iris,0,2560.0,1600.0,4096000.0,IPS,0
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,...,,i5,Intel,HD,0,1440.0,900.0,1296000.0,,0
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,...,720.0,i5,Intel,HD,0,1920.0,1080.0,2073600.0,Full HD,0
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,1,512GB SSD,AMD Radeon Pro 455,...,,i7,AMD,Radeon,1,2880.0,1800.0,5184000.0,IPS,0
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,...,,i5,Intel,Iris,0,2560.0,1600.0,4096000.0,IPS,0


In [31]:

print(f"Memory column dtype: {df['memory'].dtype}")
print(f"Sample memory values: {df['memory'].head()}")

memory_str = df['memory'].astype(str)  

# To work on memory Features, using the string representation;
df['has_ssd'] = memory_str.str.contains('SSD').astype(int)
df['has_hdd'] = memory_str.str.contains('HDD').astype(int)
df['has_flash_storage'] = memory_str.str.contains('Flash').astype(int)
df['memory_type'] = memory_str.str.extract(r'(SSD|HDD|Flash|Hybrid)')

df.head()

# This extracts memory/storage features, prints memory column datatype and some sample values, and displays the first 5 rows with new memory columns.

Memory column dtype: object
Sample memory values: 0              128GB SSD
1    128GB Flash Storage
2              256GB SSD
3              512GB SSD
4              256GB SSD
Name: memory, dtype: object


Unnamed: 0,laptop_id,company,product,typename,inches,screenresolution,cpu,ram,memory,gpu,...,gpu_is_dedicated,resolution_width,resolution_height,resolution_pixels,screen_type,is_touchscreen,has_ssd,has_hdd,has_flash_storage,memory_type
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,...,0,2560.0,1600.0,4096000.0,IPS,0,1,0,0,SSD
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,...,0,1440.0,900.0,1296000.0,,0,0,0,1,Flash
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,...,0,1920.0,1080.0,2073600.0,Full HD,0,1,0,0,SSD
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,1,512GB SSD,AMD Radeon Pro 455,...,1,2880.0,1800.0,5184000.0,IPS,0,1,0,0,SSD
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,...,0,2560.0,1600.0,4096000.0,IPS,0,1,0,0,SSD


In [32]:
#  Operating System Features;
df['os_type'] = df['opsys'].str.extract(r'(Windows|macOS|Linux|No OS|Chrome)')
df['is_windows'] = df['opsys'].str.contains('Windows').astype(int)
df['is_macos'] = df['opsys'].str.contains('macOS').astype(int)

df.head()

# this extracts OS features, and displays the first 5 rows of the new OS column

Unnamed: 0,laptop_id,company,product,typename,inches,screenresolution,cpu,ram,memory,gpu,...,resolution_pixels,screen_type,is_touchscreen,has_ssd,has_hdd,has_flash_storage,memory_type,os_type,is_windows,is_macos
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,...,4096000.0,IPS,0,1,0,0,SSD,macOS,0,1
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,...,1296000.0,,0,0,0,1,Flash,macOS,0,1
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,...,2073600.0,Full HD,0,1,0,0,SSD,No OS,0,0
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,1,512GB SSD,AMD Radeon Pro 455,...,5184000.0,IPS,0,1,0,0,SSD,macOS,0,1
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,...,4096000.0,IPS,0,1,0,0,SSD,macOS,0,1


In [33]:
# First, I convert the weight column to numeric, coercing errors to NaN;
df['weight'] = pd.to_numeric(df['weight'], errors='coerce')

# Now, I introduced use pd.cut();
df['weight_category'] = pd.cut(df['weight'], 
                              bins=[0, 1.5, 2, 2.5, 3, 4, 10],
                              labels=['Ultra-light', 'Light', 'Medium', 'Heavy', 'Very Heavy', 'Extreme'],
                              include_lowest=True)

df.head()

# This converts weight to numeric and categorizes into groups, and gives us the first 5 rows of the weight_category

Unnamed: 0,laptop_id,company,product,typename,inches,screenresolution,cpu,ram,memory,gpu,...,screen_type,is_touchscreen,has_ssd,has_hdd,has_flash_storage,memory_type,os_type,is_windows,is_macos,weight_category
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,...,IPS,0,1,0,0,SSD,macOS,0,1,
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,...,,0,0,0,1,Flash,macOS,0,1,
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,...,Full HD,0,1,0,0,SSD,No OS,0,0,
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,1,512GB SSD,AMD Radeon Pro 455,...,IPS,0,1,0,0,SSD,macOS,0,1,
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,...,IPS,0,1,0,0,SSD,macOS,0,1,


In [34]:
#  Brand and Product Features;
df['is_premium_brand'] = df['company'].isin(['Apple', 'Dell', 'HP', 'Lenovo']).astype(int)
df['product_line'] = df['product'].str.extract(r'(\w+)').fillna('Unknown')

df.head()

# This creates brand/product features, and displays the first 5 rows with new Brand/Product column

Unnamed: 0,laptop_id,company,product,typename,inches,screenresolution,cpu,ram,memory,gpu,...,has_ssd,has_hdd,has_flash_storage,memory_type,os_type,is_windows,is_macos,weight_category,is_premium_brand,product_line
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,...,1,0,0,SSD,macOS,0,1,,1,MacBook
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,...,0,0,1,Flash,macOS,0,1,,1,Macbook
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,...,1,0,0,SSD,No OS,0,0,,1,250
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,1,512GB SSD,AMD Radeon Pro 455,...,1,0,0,SSD,macOS,0,1,,1,MacBook
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,...,1,0,0,SSD,macOS,0,1,,1,MacBook


In [35]:
# Performance Score Features;

# First, I want to ensure all columns are numeric;
df['cpu_speed_ghz'] = pd.to_numeric(df['cpu_speed_ghz'], errors='coerce')
df['ram'] = pd.to_numeric(df['ram'], errors='coerce')
df['cpu_generation'] = pd.to_numeric(df['cpu_generation'], errors='coerce')

# I'm filling NaN values with appropriate defaults;
df['cpu_speed_ghz'] = df['cpu_speed_ghz'].fillna(df['cpu_speed_ghz'].median())
df['ram'] = df['ram'].fillna(df['ram'].median())
df['cpu_generation'] = df['cpu_generation'].fillna(1)  # Change to 1 if generation is missing

# To create the performance score;
df['cpu_performance_score'] = df['cpu_speed_ghz'] * df['ram'] * df['cpu_generation']


print("Performance score created successfully!")
print(df[['cpu_speed_ghz', 'ram', 'cpu_generation', 'cpu_performance_score']].head())

# This helpus us to calculate a CPU performance score, which is given as; speed × RAM × generation
# It first gives a message, confirming the score was created successfully.
# Also it displays the first 5 rows showing CPU speed, RAM, generation, and performance score.


Performance score created successfully!
   cpu_speed_ghz  ram  cpu_generation  cpu_performance_score
0            2.3    8             1.0                   18.4
1            1.8    8             1.0                   14.4
2            2.5    8           720.0                14400.0
3            2.7    1             1.0                    2.7
4            3.1    8             1.0                   24.8


In [36]:
# Interaction Features;

# First, I want to check what we have in these columns;
print("RAM column sample:", df['ram'].head().tolist())
print("Memory column sample:", df['memory'].head().tolist())
print("Data types:")
print(f"RAM dtype: {df['ram'].dtype}")
print(f"Memory dtype: {df['memory'].dtype}")

# To clean and convert both columns to numeric values;
def clean_storage_value(value):
    """
    Convert storage values like '512GB', '1TB', etc. to numeric GB values
    """
    if pd.isna(value):
        return np.nan
    
    # If any is already numeric, to return as is;
    if isinstance(value, (int, float)):
        return value
    
    # To onvert to string and clean;
    value_str = str(value).upper().strip()
    
    # To remove non-alphanumeric characters (keep numbers and TB/GB);
    value_str = ''.join(c for c in value_str if c.isalnum() or c in ['.', ' '])
    
    # For TB conversion (1TB = 1024GB);
    if 'TB' in value_str:
        num = value_str.replace('TB', '').strip()
        try:
            return float(num) * 1024
        except:
            return np.nan
    
    # For GB conversion;
    elif 'GB' in value_str:
        num = value_str.replace('GB', '').strip()
        try:
            return float(num)
        except:
            return np.nan
    
    # If there is no specified unit, I assume it is in GB;
    else:
        try:
            return float(value_str)
        except:
            return np.nan

# To apply cleaning to both columns;
df['ram_gb'] = df['ram'].apply(clean_storage_value)
df['memory_gb'] = df['memory'].apply(clean_storage_value)

# To fill missing values with median;
df['ram_gb'] = df['ram_gb'].fillna(df['ram_gb'].median())
df['memory_gb'] = df['memory_gb'].fillna(df['memory_gb'].median())

# Now, to create the interaction feature;
df['ram_memory_interaction'] = df['ram_gb'] * df['memory_gb']

print("Interaction feature created successfully!")
print(df[['ram', 'memory', 'ram_gb', 'memory_gb', 'ram_memory_interaction']].head())

# It prints sample values from df['ram'] and df['memory'] and their dtypes so you can see what formats you’re dealing with before cleaning:
# This helps convert storage-like strings into numeric GB. Even though the docstring mentions examples like "512GB" and "1TB", the function is written to be generic and used for both the ram and memory columns.
# It returns NaN if the input is missing;
# It normalizes the input to a string (e.g., value_str = str(value) internally);
# It parses numeric values and units (e.g., GB vs TB) and converts TB → GB (×1024);
# It handles fallback cases by trying float(value_str) if no explicit unit is found;
# It returns NaN if nothing sensible can be parsed.
# It applied a cleaner to the RAM and memory columns
# It also helped to fill the missing values in both columns
# It created an interaction feature, which captures how total memory capacity scales with RAM in a single number.
# Lastly, it displays a confirmation, and a preview.


RAM column sample: [8, 8, 8, 1, 8]
Memory column sample: ['128GB SSD', '128GB Flash Storage', '256GB SSD', '512GB SSD', '256GB SSD']
Data types:
RAM dtype: int64
Memory dtype: object
Interaction feature created successfully!
   ram               memory  ram_gb  memory_gb  ram_memory_interaction
0    8            128GB SSD       8        NaN                     NaN
1    8  128GB Flash Storage       8        NaN                     NaN
2    8            256GB SSD       8        NaN                     NaN
3    1            512GB SSD       1        NaN                     NaN
4    8            256GB SSD       8        NaN                     NaN


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [37]:
# To fill missing values for the new features;
df['cpu_brand'].fillna('Unknown', inplace=True)
df['cpu_type'].fillna('Unknown', inplace=True)
df['gpu_brand'].fillna('Unknown', inplace=True)
df['gpu_type'].fillna('Unknown', inplace=True)
df['screen_type'].fillna('Standard', inplace=True)
df['memory_type'].fillna('Unknown', inplace=True)
df['os_type'].fillna('Unknown', inplace=True)
df['cpu_generation'].fillna(0, inplace=True)
df['cpu_speed_ghz'].fillna(df['cpu_speed_ghz'].median(), inplace=True)
df['resolution_width'].fillna(df['resolution_width'].median(), inplace=True)
df['resolution_height'].fillna(df['resolution_height'].median(), inplace=True)
df['resolution_pixels'].fillna(df['resolution_pixels'].median(), inplace=True)

df.head()

# This is for handling missing values for newly engineered features, amd also displaying first 5 rwos after filling the missing values

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['cpu_brand'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['cpu_type'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting value

Unnamed: 0,laptop_id,company,product,typename,inches,screenresolution,cpu,ram,memory,gpu,...,os_type,is_windows,is_macos,weight_category,is_premium_brand,product_line,cpu_performance_score,ram_gb,memory_gb,ram_memory_interaction
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,...,macOS,0,1,,1,MacBook,18.4,8,,
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,...,macOS,0,1,,1,Macbook,14.4,8,,
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,...,No OS,0,0,,1,250,14400.0,8,,
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,1,512GB SSD,AMD Radeon Pro 455,...,macOS,0,1,,1,MacBook,2.7,1,,
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,...,macOS,0,1,,1,MacBook,24.8,8,,


In [38]:
# T convert categorical features to numerical using Label Encoding;
from sklearn.preprocessing import LabelEncoder

categorical_features = ['cpu_brand', 'cpu_type', 'gpu_brand', 'gpu_type', 
                       'screen_type', 'memory_type', 'os_type', 
                       'screen_size_category', 'weight_category', 'product_line']

label_encoders = {}
for feature in categorical_features:
    if feature in df.columns:
        le = LabelEncoder()
        df[feature] = le.fit_transform(df[feature].astype(str))
        label_encoders[feature] = le

print("New features created successfully!")
print(f"Total features after engineering: {len(df.columns)}")
print("New columns:", [col for col in df.columns if col not in ['laptop_id', 'company', 'product', 'typename', 
                                                              'screenresolution', 'cpu', 'ram', 'memory', 
                                                              'gpu', 'opsys', 'weight', 'price_euros']])

df.head()

# The above codes encodes categorical features numerically.
# It prints a success message for confirmation message.
# It prints the total number of features after feature engineering.
# It prints the list of new feature columns.






New features created successfully!
Total features after engineering: 39
New columns: ['inches', 'cpu_brand', 'cpu_speed_ghz', 'cpu_generation', 'cpu_type', 'gpu_brand', 'gpu_type', 'gpu_is_dedicated', 'resolution_width', 'resolution_height', 'resolution_pixels', 'screen_type', 'is_touchscreen', 'has_ssd', 'has_hdd', 'has_flash_storage', 'memory_type', 'os_type', 'is_windows', 'is_macos', 'weight_category', 'is_premium_brand', 'product_line', 'cpu_performance_score', 'ram_gb', 'memory_gb', 'ram_memory_interaction']


Unnamed: 0,laptop_id,company,product,typename,inches,screenresolution,cpu,ram,memory,gpu,...,os_type,is_windows,is_macos,weight_category,is_premium_brand,product_line,cpu_performance_score,ram_gb,memory_gb,ram_memory_interaction
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,...,5,0,1,0,1,86,18.4,8,,
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,...,5,0,1,0,1,87,14.4,8,,
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,...,2,0,0,0,1,4,14400.0,8,,
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,1,512GB SSD,AMD Radeon Pro 455,...,5,0,1,0,1,86,2.7,1,,
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,...,5,0,1,0,1,86,24.8,8,,


In Summary:
The notebook performs feature engineering on a laptop dataset:
It helped in cleaning column names.
It helped in extracting CPU, GPU, screen, memory, OS, brand, and product features.
It created an interaction/performance features.
It also helped in handling of missing values.
It helped in encoding the categorical features.