# Imports

In [165]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [166]:
import os
import sys
import shutil
import kaggle

In [167]:
import re

In [168]:
from kaggle.api.kaggle_api_extended import KaggleApi

# Functions

In [169]:
def download_dataset(d_identifier, download_path='Data/', unzip=True):
    """
    Download all files of a Kaggle dataset through an authenticated KaggleApi client.

    Parameters
    ----------
    d_identifier : str
        Dataset reference handed to ``dataset_download_files``
        (presumably in "owner/dataset-slug" form - confirm against the Kaggle docs).
    download_path : str
        Forwarded as the ``path`` argument; defaults to 'Data/'.
    unzip : bool
        Forwarded as the ``unzip`` argument; defaults to True.
    """
    client = KaggleApi()
    client.authenticate()
    client.dataset_download_files(
        d_identifier,
        path=download_path,
        unzip=unzip,
    )

In [170]:
def clean_ram_column(series):
    """
    Convert RAM size strings such as '8GB' into integer gigabyte counts.

    The 'GB' suffix is removed case-insensitively, surrounding whitespace is
    stripped and the remainder is cast to int. A series that is already
    numeric is only cast to int, so executing the cleaning cell a second time
    no longer fails on the missing ``.str`` accessor.

    Parameters
    ----------
    series : pandas.Series
        RAM sizes, either strings like '8GB' / '16 gb' or already numeric.

    Returns
    -------
    pandas.Series
        Integer RAM sizes in GB.

    Raises
    ------
    ValueError
        If a value is not an integer literal once the suffix is removed.
    """
    if pd.api.types.is_numeric_dtype(series):
        # Already converted (e.g. the cell was run twice): nothing to strip.
        return series.astype(int)
    return (series
            .str.replace('GB', '', case=False, regex=False)
            .str.strip()
            .astype(int))


In [171]:
def remove_kg(series):
    """
    Convert weight strings such as '1.37kg' into float kilograms.

    The 'kg' suffix is removed case-insensitively, surrounding whitespace is
    stripped and the remainder is cast to float. A series that is already
    numeric is only cast to float, so executing the cleaning cell a second
    time no longer fails on the missing ``.str`` accessor.

    Parameters
    ----------
    series : pandas.Series
        Weights, either strings like '1.37kg' or already numeric.

    Returns
    -------
    pandas.Series
        Weights as floats.

    Raises
    ------
    ValueError
        If a value is not a float literal once the suffix is removed.
    """
    if pd.api.types.is_numeric_dtype(series):
        # Already converted (e.g. the cell was run twice): nothing to strip.
        return series.astype(float)
    return (series
            .str.replace('kg', '', case=False, regex=False)
            .str.strip()
            .astype(float))

# Data

## Using kaggle dataset

In [172]:
# kaggle_dir = os.path.expanduser("~/.kaggle")


In [173]:
# kapi = '../../kaggle.json'
# destpath = os.path.join(kaggle_dir, 'kaggle.json')
# m = shutil.copy(kapi, destpath)

In [174]:
# !kaggle datasets list -s "Uncleaned Laptop Price dataset"

In [175]:
# download_dataset('ehtishamsadiq/uncleaned-laptop-price-dataset')

## Loading Data

In [176]:
# Read the raw laptop listing from the local Data/ folder (presumably the file
# fetched by the commented-out Kaggle download above - confirm) and preview it.
data = pd.read_csv('Data/laptopData.csv', index_col=None)
data.head()

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,0.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,1.0,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,2.0,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0
3,3.0,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.336
4,4.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.808


In [177]:
data.tail()

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
1298,1298.0,Lenovo,2 in 1 Convertible,14.0,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 6500U 2.5GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows 10,1.8kg,33992.64
1299,1299.0,Lenovo,2 in 1 Convertible,13.3,IPS Panel Quad HD+ / Touchscreen 3200x1800,Intel Core i7 6500U 2.5GHz,16GB,512GB SSD,Intel HD Graphics 520,Windows 10,1.3kg,79866.72
1300,1300.0,Lenovo,Notebook,14.0,1366x768,Intel Celeron Dual Core N3050 1.6GHz,2GB,64GB Flash Storage,Intel HD Graphics,Windows 10,1.5kg,12201.12
1301,1301.0,HP,Notebook,15.6,1366x768,Intel Core i7 6500U 2.5GHz,6GB,1TB HDD,AMD Radeon R5 M330,Windows 10,2.19kg,40705.92
1302,1302.0,Asus,Notebook,15.6,1366x768,Intel Celeron Dual Core N3050 1.6GHz,4GB,500GB HDD,Intel HD Graphics,Windows 10,2.2kg,19660.32


# Data Cleaning

In [178]:
# Remove the 'Unnamed: 0' column (if it exists), then discard every row that
# contains a missing value.
data = data.drop(columns='Unnamed: 0', errors='ignore').dropna()

In [179]:
# First rows after removing 'Unnamed: 0' and the rows with missing values.
data.head()

Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0
3,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.336
4,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.808


In [180]:
# Last rows after removing 'Unnamed: 0' and the rows with missing values.
data.tail()

Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
1298,Lenovo,2 in 1 Convertible,14.0,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 6500U 2.5GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows 10,1.8kg,33992.64
1299,Lenovo,2 in 1 Convertible,13.3,IPS Panel Quad HD+ / Touchscreen 3200x1800,Intel Core i7 6500U 2.5GHz,16GB,512GB SSD,Intel HD Graphics 520,Windows 10,1.3kg,79866.72
1300,Lenovo,Notebook,14.0,1366x768,Intel Celeron Dual Core N3050 1.6GHz,2GB,64GB Flash Storage,Intel HD Graphics,Windows 10,1.5kg,12201.12
1301,HP,Notebook,15.6,1366x768,Intel Core i7 6500U 2.5GHz,6GB,1TB HDD,AMD Radeon R5 M330,Windows 10,2.19kg,40705.92
1302,Asus,Notebook,15.6,1366x768,Intel Celeron Dual Core N3050 1.6GHz,4GB,500GB HDD,Intel HD Graphics,Windows 10,2.2kg,19660.32


## Column Changes

In [181]:
# Column dtypes and non-null counts before any renaming or type conversion.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1273 entries, 0 to 1302
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Company           1273 non-null   object 
 1   TypeName          1273 non-null   object 
 2   Inches            1273 non-null   object 
 3   ScreenResolution  1273 non-null   object 
 4   Cpu               1273 non-null   object 
 5   Ram               1273 non-null   object 
 6   Memory            1273 non-null   object 
 7   Gpu               1273 non-null   object 
 8   OpSys             1273 non-null   object 
 9   Weight            1273 non-null   object 
 10  Price             1273 non-null   float64
dtypes: float64(1), object(10)
memory usage: 119.3+ KB


Note: the prices in this dataset are given in Indian rupees.

In [182]:
# Use consistently-cased, unit-bearing column names. The result is re-bound
# rather than renamed with inplace=True, so the cell is a plain function of
# its input frame.
data = data.rename(columns={
    'Cpu': 'CPU',
    'Gpu': 'GPU',
    'Ram': 'RAM_GB',
    'Weight': 'Weight_KG',
    'Price': 'Price_Rupee',
})

In [183]:
# Confirm the new column labels.
data.head()

Unnamed: 0,Company,TypeName,Inches,ScreenResolution,CPU,RAM_GB,Memory,GPU,OpSys,Weight_KG,Price_Rupee
0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0
3,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.336
4,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.808


## Changing data types

In [184]:
# Reference view of the raw string formats ('8GB', '1.37kg', ...) that the
# following cells convert.
data.head()

Unnamed: 0,Company,TypeName,Inches,ScreenResolution,CPU,RAM_GB,Memory,GPU,OpSys,Weight_KG,Price_Rupee
0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0
3,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.336
4,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.808


### Inches

In [185]:
# Treat the '?' placeholder as a missing screen size.
data = data.assign(Inches=data['Inches'].replace('?', np.nan))

In [186]:
# Cast the screen size to float and discard rows left with a missing value.
data = data.astype({'Inches': float}).dropna()

### Ram

In [187]:
# Strip the 'GB' suffix and store RAM as an integer number of gigabytes.
data['RAM_GB'] = data['RAM_GB'].pipe(clean_ram_column)

### Weight

In [188]:
# '?' stands in for an unknown weight: turn it into NaN and drop those rows
# before stripping the 'kg' suffix and converting to float.
data = data.assign(Weight_KG=data['Weight_KG'].replace('?', np.nan)).dropna()
data['Weight_KG'] = data['Weight_KG'].pipe(remove_kg)

### CPU

In [189]:
# Split strings like 'Intel Core i5 2.3GHz' into the model text (first capture
# group) and the clock speed (second capture group, cast to float right away).
data[['CPU_model', 'CPU_GHz']] = (
    data['CPU']
    .str.extract(r'(.+)\s(\d+(?:\.\d+)?)GHz')
    .astype({1: float})
)


In [190]:
# Spot-check the new CPU_model / CPU_GHz columns on the first row.
data.head(1)

Unnamed: 0,Company,TypeName,Inches,ScreenResolution,CPU,RAM_GB,Memory,GPU,OpSys,Weight_KG,Price_Rupee,CPU_model,CPU_GHz
0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37,71378.6832,Intel Core i5,2.3


### ScreenResolution

In [191]:
# Number of distinct ScreenResolution strings.
len(set(data.ScreenResolution))

40

In [192]:
# The distinct ScreenResolution strings themselves, to see which patterns
# the parsing step has to handle.
set(data.ScreenResolution)

{'1366x768',
 '1440x900',
 '1600x900',
 '1920x1080',
 '2560x1440',
 '4K Ultra HD / Touchscreen 3840x2160',
 '4K Ultra HD 3840x2160',
 'Full HD / Touchscreen 1920x1080',
 'Full HD 1920x1080',
 'IPS Panel 1366x768',
 'IPS Panel 2560x1440',
 'IPS Panel 4K Ultra HD / Touchscreen 3840x2160',
 'IPS Panel 4K Ultra HD 3840x2160',
 'IPS Panel Full HD / Touchscreen 1920x1080',
 'IPS Panel Full HD 1366x768',
 'IPS Panel Full HD 1920x1080',
 'IPS Panel Full HD 1920x1200',
 'IPS Panel Full HD 2160x1440',
 'IPS Panel Full HD 2560x1440',
 'IPS Panel Quad HD+ / Touchscreen 3200x1800',
 'IPS Panel Quad HD+ 2560x1440',
 'IPS Panel Quad HD+ 3200x1800',
 'IPS Panel Retina Display 2304x1440',
 'IPS Panel Retina Display 2560x1600',
 'IPS Panel Retina Display 2736x1824',
 'IPS Panel Retina Display 2880x1800',
 'IPS Panel Touchscreen / 4K Ultra HD 3840x2160',
 'IPS Panel Touchscreen 1366x768',
 'IPS Panel Touchscreen 1920x1200',
 'IPS Panel Touchscreen 2400x1600',
 'IPS Panel Touchscreen 2560x1440',
 'Quad HD

In [193]:
# Pull the '<width>x<height>' pixel dimensions out of the free-text string.
resolution_px = data['ScreenResolution'].str.extract(r'(\d+)x(\d+)')
data[['ScreenRes_width', 'ScreenRes_height']] = resolution_px.astype(int)

# Flag rows whose description mentions a touchscreen, then retire the raw text column.
has_touch = data['ScreenResolution'].str.contains('Touchscreen', case=False)
data['TouchScreen'] = has_touch.map({True: 'Yes', False: 'No'})
data = data.drop('ScreenResolution', axis=1)

### OpSys

In [194]:
# Distinct operating-system labels present in the data.
opsys = data['OpSys'].unique()
opsys

array(['macOS', 'No OS', 'Windows 10', 'Mac OS X', 'Linux',
       'Windows 10 S', 'Chrome OS', 'Windows 7', 'Android'], dtype=object)

### Memory

In [195]:
# Distinct storage descriptions, to see which formats the parser must handle.
mem = data['Memory'].unique()
mem

array(['128GB SSD', '128GB Flash Storage', '256GB SSD', '512GB SSD',
       '500GB HDD', '256GB Flash Storage', '1TB HDD',
       '128GB SSD +  1TB HDD', '256GB SSD +  256GB SSD',
       '64GB Flash Storage', '32GB Flash Storage', '256GB SSD +  1TB HDD',
       '256GB SSD +  2TB HDD', '32GB SSD', '2TB HDD', '64GB SSD',
       '1.0TB Hybrid', '512GB SSD +  1TB HDD', '1TB SSD',
       '256GB SSD +  500GB HDD', '128GB SSD +  2TB HDD',
       '512GB SSD +  512GB SSD', '16GB SSD', '16GB Flash Storage',
       '512GB SSD +  256GB SSD', '512GB SSD +  2TB HDD',
       '64GB Flash Storage +  1TB HDD', '180GB SSD', '1TB HDD +  1TB HDD',
       '32GB HDD', '1TB SSD +  1TB HDD', '?', '512GB Flash Storage',
       '128GB HDD', '240GB SSD', '8GB SSD', '508GB Hybrid', '1.0TB HDD',
       '512GB SSD +  1.0TB Hybrid', '256GB SSD +  1.0TB Hybrid'],
      dtype=object)

New columns created from `Memory`: Memory_Size1_GB, Memory_Type1, Memory_Size2_GB, Memory_Type2

In [196]:
def convert_to_gb(value):
    """
    Turn a size string such as '256GB' or '1.0TB' into a whole number of GB.

    Leading and trailing whitespace is ignored. A 'TB' figure is multiplied by
    1024; the result is truncated to int. Strings that do not start with a
    number followed by 'TB' or 'GB' yield 0.
    """
    parsed_size = re.match(r'([\d\.]+)(TB|GB)', value.strip())
    if parsed_size is None:
        return 0
    amount = float(parsed_size.group(1))
    if parsed_size.group(2) == 'TB':
        amount = amount * 1024
    return int(amount)

def parse_memory(mem_string):
    """
    Break a storage description into size/type pairs for up to two drives.

    The description is split on '+'; each part is expected to look like
    '<number><TB|GB> <type>' (e.g. '128GB SSD +  1TB HDD').

    Returns
    -------
    list
        ``[size1_gb, type1, size2_gb, type2]``. A part that does not match
        the expected shape contributes ``0`` and ``'Unknown'``; a missing
        second part contributes ``0`` and ``'No second memory'``. A '?' or
        missing input yields ``[0, 'Unknown', 0, 'No second memory']``.
    """
    if mem_string == '?' or pd.isna(mem_string):
        return [0, 'Unknown', 0, 'No second memory']

    drive_pattern = re.compile(r'([\d\.]+(?:TB|GB))\s+(.*)')

    def describe(segment):
        # One drive description -> (size in GB, type text).
        found = drive_pattern.match(segment)
        if found is None:
            return 0, 'Unknown'
        return convert_to_gb(found.group(1)), found.group(2)

    segments = [piece.strip() for piece in mem_string.split('+')]

    size1, type1 = describe(segments[0])
    if len(segments) > 1:
        size2, type2 = describe(segments[1])
    else:
        size2, type2 = 0, 'No second memory'

    return [size1, type1.strip(), size2, type2.strip()]

# Parse every Memory string, then spread the four parsed fields into columns.
parsed = data['Memory'].apply(parse_memory)
memory_fields = pd.DataFrame(parsed.tolist(), index=data.index)
data[['Memory_Size1_GB', 'Memory_Type1', 'Memory_Size2_GB', 'Memory_Type2']] = memory_fields

# The raw description is no longer needed.
data = data.drop('Memory', axis=1)

In [199]:
# The raw CPU string was split into CPU_model / CPU_GHz earlier; drop it.
data = data.drop('CPU', axis=1)

In [201]:
# Dtypes and non-null counts after the conversions and column splits.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1271 entries, 0 to 1302
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Company           1271 non-null   object 
 1   TypeName          1271 non-null   object 
 2   Inches            1271 non-null   float64
 3   RAM_GB            1271 non-null   int64  
 4   GPU               1271 non-null   object 
 5   OpSys             1271 non-null   object 
 6   Weight_KG         1271 non-null   float64
 7   Price_Rupee       1271 non-null   float64
 8   CPU_model         1271 non-null   object 
 9   CPU_GHz           1271 non-null   float64
 10  ScreenRes_width   1271 non-null   int64  
 11  ScreenRes_height  1271 non-null   int64  
 12  TouchScreen       1271 non-null   object 
 13  Memory_Size1_GB   1271 non-null   int64  
 14  Memory_Type1      1271 non-null   object 
 15  Memory_Size2_GB   1271 non-null   int64  
 16  Memory_Type2      1271 non-null   object 
dtype

In [200]:
# Preview of the cleaned frame.
data.head()

Unnamed: 0,Company,TypeName,Inches,RAM_GB,GPU,OpSys,Weight_KG,Price_Rupee,CPU_model,CPU_GHz,ScreenRes_width,ScreenRes_height,TouchScreen,Memory_Size1_GB,Memory_Type1,Memory_Size2_GB,Memory_Type2
0,Apple,Ultrabook,13.3,8,Intel Iris Plus Graphics 640,macOS,1.37,71378.6832,Intel Core i5,2.3,2560,1600,No,128,SSD,0,No second memory
1,Apple,Ultrabook,13.3,8,Intel HD Graphics 6000,macOS,1.34,47895.5232,Intel Core i5,1.8,1440,900,No,128,Flash Storage,0,No second memory
2,HP,Notebook,15.6,8,Intel HD Graphics 620,No OS,1.86,30636.0,Intel Core i5 7200U,2.5,1920,1080,No,256,SSD,0,No second memory
3,Apple,Ultrabook,15.4,16,AMD Radeon Pro 455,macOS,1.83,135195.336,Intel Core i7,2.7,2880,1800,No,512,SSD,0,No second memory
4,Apple,Ultrabook,13.3,8,Intel Iris Plus Graphics 650,macOS,1.37,96095.808,Intel Core i5,3.1,2560,1600,No,256,SSD,0,No second memory


In [203]:
# Final safety net: discard any row that still holds a missing value
# (for example a CPU string that did not match the extraction pattern).
data = data.dropna(axis=0, how='any')

In [204]:
# Write the cleaned frame to disk; index=False keeps the row index out of the CSV.
data.to_csv('Data/laptopData_cleaned.csv', index=False)