In [1]:
import numpy as np
import matplotlib.pyplot as plt

import pandas as pd
import seaborn as sns   
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
df = pd.read_csv('laptop_Price.csv', encoding='latin1')

In [3]:
df.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   laptop_ID         1303 non-null   int64  
 1   Company           1303 non-null   object 
 2   Product           1303 non-null   object 
 3   TypeName          1303 non-null   object 
 4   Inches            1303 non-null   float64
 5   ScreenResolution  1303 non-null   object 
 6   Cpu               1303 non-null   object 
 7   Ram               1303 non-null   object 
 8   Memory            1303 non-null   object 
 9   Gpu               1303 non-null   object 
 10  OpSys             1303 non-null   object 
 11  Weight            1303 non-null   object 
 12  Price_euros       1303 non-null   float64
dtypes: float64(2), int64(1), object(10)
memory usage: 132.5+ KB


In [5]:
df.describe()

Unnamed: 0,laptop_ID,Inches,Price_euros
count,1303.0,1303.0,1303.0
mean,660.155794,15.017191,1123.686992
std,381.172104,1.426304,699.009043
min,1.0,10.1,174.0
25%,331.5,14.0,599.0
50%,659.0,15.6,977.0
75%,990.5,15.6,1487.88
max,1320.0,18.4,6099.0


In [6]:
df.isnull().sum()

laptop_ID           0
Company             0
Product             0
TypeName            0
Inches              0
ScreenResolution    0
Cpu                 0
Ram                 0
Memory              0
Gpu                 0
OpSys               0
Weight              0
Price_euros         0
dtype: int64

In [7]:
#Converted all columns names to lower case for easy cleaning and future processing
df.columns = df.columns.str.lower()
df.columns


Index(['laptop_id', 'company', 'product', 'typename', 'inches',
       'screenresolution', 'cpu', 'ram', 'memory', 'gpu', 'opsys', 'weight',
       'price_euros'],
      dtype='object')

In [8]:
# Remove 'GB' from RAM and convert to integer
df['ram'] = df['ram'].str.replace("GB","").astype(int)

# Remove 'kg' from Weight and convert to float
df['weight'] = df['weight'].str.replace("kg","").astype(float)

# Clean up 'Memory' column (split HDD/SSD/etc.)
df['memory'] = df['memory'].str.replace("GB","").str.replace("TB","000")
df['memory'] = df['memory'].str.replace(r'\D', '', regex=True).replace('', '0').astype(int)

# Convert Inches to float
df['inches'] = df['inches'].astype(float)

In [9]:
# Identify categorical columns with missing values
categorical_cols = df.select_dtypes(include=['object']).columns
missing_categorical = df[categorical_cols].isnull().sum()
missing_categorical = missing_categorical[missing_categorical > 0]

print("Categorical columns with missing values:")
print(missing_categorical)

# Handle each categorical column based on number of unique values
for col in missing_categorical.index:
    unique_count = df[col].nunique()
    
    if unique_count <= 15: 
        mode_value = df[col].mode()[0]
        print(f"Filling {missing_categorical[col]} missing values in '{col}' with mode: '{mode_value}'")
        df[col].fillna(mode_value, inplace=True)
    else:
        print(f"Filling {missing_categorical[col]} missing values in '{col}' with 'Unknown'")
        df[col].fillna('Unknown', inplace=True)

print("\nMissing values after handling categorical columns:")
print(df[missing_categorical.index].isnull().sum())

Categorical columns with missing values:
Series([], dtype: int64)

Missing values after handling categorical columns:
Series([], dtype: float64)


In [10]:
# Identify missing values
missing_values = df.isnull().sum()
print("Missing values per column:\n", missing_values[missing_values > 0])

# Handle missing values
for col in df.columns:
    if df[col].isnull().sum() > 0:
        if col == 'Price_euros':
            df.dropna(subset=[col], inplace=True)
        elif df[col].dtype in ['int64', 'float64']:
            df[col].fillna(df[col].median(), inplace=True)
        else:
            if df[col].nunique() < 20:
                df[col].fillna(df[col].mode()[0], inplace=True)
            else:
                df[col].fillna('Unknown', inplace=True)

print("\nMissing values after handling:\n", df.isnull().sum().sum())


Missing values per column:
 Series([], dtype: int64)

Missing values after handling:
 0


In [11]:
# Address outliers in numerical features
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

for col in numerical_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[col] = np.clip(df[col], lower_bound, upper_bound)

In [12]:
# # Convert categorical features to numerical representations

# # 1. Label Encoding for ordinal categorical features
label_encoder = LabelEncoder()
ordinal_features = ['Ram']  # Add other ordinal features if any

for feature in ordinal_features:
     if feature in df.columns:
         df[feature] = label_encoder.fit_transform(df[feature])
         print(f"Label encoded {feature}: {dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))}")


df.head()

Unnamed: 0,laptop_id,company,product,typename,inches,screenresolution,cpu,ram,memory,gpu,opsys,weight,price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128,Intel Iris Plus Graphics 640,macOS,1.37,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128,Intel HD Graphics 6000,macOS,1.34,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256,Intel HD Graphics 620,No OS,1.86,575.0
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,14,512,AMD Radeon Pro 455,macOS,1.83,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256,Intel Iris Plus Graphics 650,macOS,1.37,1803.6
