Laptop Prices Dataset Preprocessing

In [742]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import skew
import re

In [743]:
# Load the CSV file
file_path = 'laptopData.csv'
data = pd.read_csv(file_path)
data

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,0.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,1.0,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,2.0,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0000
3,3.0,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.3360
4,4.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.8080
...,...,...,...,...,...,...,...,...,...,...,...,...
1298,1298.0,Lenovo,2 in 1 Convertible,14,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 6500U 2.5GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows 10,1.8kg,33992.6400
1299,1299.0,Lenovo,2 in 1 Convertible,13.3,IPS Panel Quad HD+ / Touchscreen 3200x1800,Intel Core i7 6500U 2.5GHz,16GB,512GB SSD,Intel HD Graphics 520,Windows 10,1.3kg,79866.7200
1300,1300.0,Lenovo,Notebook,14,1366x768,Intel Celeron Dual Core N3050 1.6GHz,2GB,64GB Flash Storage,Intel HD Graphics,Windows 10,1.5kg,12201.1200
1301,1301.0,HP,Notebook,15.6,1366x768,Intel Core i7 6500U 2.5GHz,6GB,1TB HDD,AMD Radeon R5 M330,Windows 10,2.19kg,40705.9200


In [744]:
# Remove the 'Unnamed: 0' column
data = data.drop(columns=['Unnamed: 0'])


In [745]:
# Check for duplicate rows and remove them
data = data.drop_duplicates()

In [746]:
# Replace invalid values (e.g., '?') with NaN
data.replace('?', np.nan, inplace=True)

In [747]:
data_train, data_test = train_test_split(data, test_size=0.2, random_state=42)
data_train.shape

(996, 11)

In [748]:
data_train

Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
639,Dell,Notebook,15.6,4K Ultra HD / Touchscreen 3840x2160,Intel Core i7 7700HQ 2.8GHz,16GB,1TB SSD,Nvidia GeForce GTX 1050,Windows 10,2.06kg,127818.720
400,Lenovo,Notebook,15.6,1366x768,Intel Celeron Dual Core N3350 1.1GHz,4GB,1TB HDD,Intel HD Graphics 500,Windows 10,2.2kg,16303.680
261,Lenovo,Notebook,15.6,Full HD 1920x1080,Intel Core i3 6006U 2GHz,4GB,256GB SSD,Intel HD Graphics 520,No OS,2.2kg,23656.320
1081,Lenovo,Gaming,17.3,IPS Panel Full HD 1920x1080,Intel Core i7 6820HK 2.7GHz,32GB,512GB SSD + 1.0TB Hybrid,Nvidia GeForce GTX 980M,Windows 10,4.6kg,172627.200
1055,HP,Notebook,15.6,1366x768,Intel Core i3 6100U 2.3GHz,4GB,500GB HDD,Intel HD Graphics 520,Windows 10,2.31kg,37570.392
...,...,...,...,...,...,...,...,...,...,...,...
1071,HP,Notebook,15.6,1366x768,Intel Core i5 6200U 2.3GHz,4GB,500GB HDD,Intel HD Graphics 520,Windows 10,2.31kg,63882.720
1122,HP,Notebook,14,Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,8GB,256GB SSD,Intel HD Graphics 520,Windows 7,1.43kg,80612.640
1157,Lenovo,Ultrabook,14,Full HD 1920x1080,Intel Core i7 6600U 2.6GHz,12GB,512GB SSD,Intel HD Graphics 520,Windows 7,1.4kg,122490.720
881,HP,2 in 1 Convertible,15.6,Touchscreen 1366x768,Intel Core i5 7200U 2.5GHz,4GB,500GB HDD,Intel HD Graphics 620,Windows 10,2.3kg,36486.144


In [749]:
data_train = data.drop_duplicates()


In [750]:
# Feature extraction functions
def extract_cpu_info(cpu):
    if pd.isna(cpu):
        return None
    cpu_info = re.findall(r'(\d+\.\d+GHz)', cpu)
    return float(cpu_info[0].replace('GHz', '')) if cpu_info else None

def extract_memory_info(memory):
    if pd.isna(memory):
        return None
    memory_info = re.findall(r'(\d+GB)', memory)
    return int(memory_info[0].replace('GB', '')) if memory_info else None

def extract_resolution_info(screen_resolution):
    if pd.isna(screen_resolution):
        return None, None
    resolution_info = re.findall(r'(\d+)x(\d+)', screen_resolution)
    return int(resolution_info[0][0]), int(resolution_info[0][1]) if resolution_info else (None, None)

In [751]:
data_test['Ram'] = data_test['Ram'].str.replace('GB', '').astype(float)
data_train['Ram'] = data_train['Ram'].str.replace('GB', '').astype(float)

In [752]:
# Apply feature extraction to training data
data_train['Cpu_Speed_GHz'] = data_train['Cpu'].apply(extract_cpu_info)
data_train['Memory_GB'] = data_train['Memory'].apply(extract_memory_info)
data_train['ScreenWidth'], data_train['ScreenHeight'] = zip(*data_train['ScreenResolution'].apply(extract_resolution_info))

In [753]:
# Apply feature extraction to testing data
data_test['Cpu_Speed_GHz'] = data_test['Cpu'].apply(extract_cpu_info)
data_test['Memory_GB'] = data_test['Memory'].apply(extract_memory_info)

data_test['ScreenWidth'], data_test['ScreenHeight'] = zip(*data_test['ScreenResolution'].apply(extract_resolution_info))

In [754]:
data_test

Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price,Cpu_Speed_GHz,Memory_GB,ScreenWidth,ScreenHeight
755,Lenovo,2 in 1 Convertible,13.3,IPS Panel Quad HD+ / Touchscreen 3200x1800,Intel Core i5 6260U 1.8GHz,8.0,256GB SSD,Intel HD Graphics 540,Windows 10,1.3kg,63882.72,1.8,256.0,3200,1800
452,HP,Workstation,15.6,IPS Panel Full HD 1920x1080,Intel Core i7 7820HQ 2.9GHz,8.0,512GB SSD,Intel HD Graphics 620,Windows 10,2.31kg,81731.52,2.9,512.0,1920,1080
1144,HP,2 in 1 Convertible,13.3,Touchscreen 2560x1440,Intel Core i7 6600U 2.6GHz,8.0,256GB SSD,Intel HD Graphics 520,Windows 10,1.48kg,95850.72,2.6,256.0,2560,1440
342,HP,Notebook,15.6,IPS Panel Full HD 1920x1080,Intel Core i3 7100U 2.4GHz,8.0,1TB HDD,Nvidia GeForce 930MX,Windows 10,2.1kg,38148.48,2.4,,1920,1080
335,HP,Notebook,14,Full HD 1920x1080,Intel Core i5 7300U 2.6GHz,8.0,256GB SSD,Intel HD Graphics 620,Windows 10,1.48kg,67559.04,2.6,256.0,1920,1080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
947,Lenovo,Notebook,15.6,Full HD 1920x1080,Intel Core i7 7500U 2.7GHz,8.0,1TB HDD,Nvidia GeForce 920MX,No OS,2.2kg,36496.80,2.7,,1920,1080
1171,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i7 8550U 1.8GHz,16.0,512GB SSD,Intel UHD Graphics 620,Windows 10,2.1kg,61751.52,1.8,512.0,1920,1080
1087,Lenovo,Notebook,14,Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,8.0,180GB SSD,Intel HD Graphics 520,Windows 7,1.7kg,63882.72,2.3,180.0,1920,1080
1218,Lenovo,Gaming,15.6,IPS Panel Full HD 1920x1080,Intel Core i7 6700HQ 2.6GHz,8.0,128GB SSD + 1TB HDD,Nvidia GeForce GTX 960<U+039C>,Windows 10,2.6kg,67772.16,2.6,128.0,1920,1080


In [755]:
# Remove original complex columns after extraction
data_train = data_train.drop(columns=['Cpu', 'Memory', 'ScreenResolution'])
data_test = data_test.drop(columns=['Cpu', 'Memory', 'ScreenResolution'])

In [756]:
# Initialize label encoders for each categorical feature
label_encoders = {feature: LabelEncoder() for feature in categorical_features}

In [757]:
# Apply label encoding to the training set
for feature in categorical_features:
    X_train[feature] = label_encoders[feature].fit_transform(X_train[feature])


In [758]:
# Handle unseen labels in the test set
for feature in categorical_features:
    unseen_labels = set(X_test[feature]) - set(label_encoders[feature].classes_)
    if unseen_labels:
        for label in unseen_labels:
            X_test = X_test[X_test[feature] != label]
            y_test = y_test[X_test.index]

In [759]:
categorical_train = data_train.select_dtypes(exclude = np.number).columns

In [760]:
categorical_test = data_test.select_dtypes(exclude = np.number).columns

In [761]:
# Apply label encoding to the testing set
data1_train= data_train.copy()
classes = dict()
le = LabelEncoder()
for col in categorical_train:
    data1_train[col]= le.fit_transform(data1_train[col])
    classes[col] = le.classes_



In [762]:
data1_test= data_test.copy()
classes = dict()
le = LabelEncoder()
for col in categorical_test:
    data1_test[col]= le.fit_transform(data1_test[col])
    classes[col] = le.classes_

In [763]:
data1_test=data1_test.dropna()
data1_train=data1_train.dropna()

In [764]:
# Define features and target
X_train = data1_train.drop(columns=['Price'])
y_train = data1_train['Price']
X_test = data1_test.drop(columns=['Price'])
y_test = data1_test['Price']

In [765]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Lasso, Ridge

In [766]:
# Define the models and their parameter grids
models = {
    'LinearRegression': {
        'model': LinearRegression(),
        'params': {
            'fit_intercept': [True, False],
            'copy_X': [True, False]
        }
    },
    'Lasso': {
        'model': Lasso(),
        'params': {
            'alpha': [0.1, 1.0, 10.0],
            'fit_intercept': [True, False]
        }
    },
    'Ridge': {
        'model': Ridge(),
        'params': {
            'alpha': [0.1, 1.0, 10.0],
            'fit_intercept': [True, False]
        }
    }
}

In [767]:
# Train and find the best parameters for each model using GridSearchCV
best_models = {}
for name, model_params in models.items():
    grid_search = GridSearchCV(model_params['model'], model_params['params'], cv=5, scoring='r2', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_models[name] = grid_search.best_estimator_
    print(f"Best {name}: {grid_search.best_params_}")

Best LinearRegression: {'copy_X': True, 'fit_intercept': True}
Best Lasso: {'alpha': 0.1, 'fit_intercept': True}
Best Ridge: {'alpha': 0.1, 'fit_intercept': True}


  model = cd_fast.enet_coordinate_descent(


In [768]:
# Evaluate the best models on the test set
results = {}
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    results[name] = {'R2 Score': r2, 'Mean Squared Error': mse}
    print(f"{name} - R2 Score: {r2}, Mean Squared Error: {mse}")

# Display a summary of results
print("\nSummary of model performance on test data:")
for name, metrics in results.items():
    print(f"{name}:")
    print(f"  R2 Score: {metrics['R2 Score']}")
    print(f"  Mean Squared Error: {metrics['Mean Squared Error']}")

LinearRegression - R2 Score: 0.6525311860357391, Mean Squared Error: 530922705.9481854
Lasso - R2 Score: 0.6525292266440156, Mean Squared Error: 530925699.8443685
Ridge - R2 Score: 0.6524957885646494, Mean Squared Error: 530976792.30177855

Summary of model performance on test data:
LinearRegression:
  R2 Score: 0.6525311860357391
  Mean Squared Error: 530922705.9481854
Lasso:
  R2 Score: 0.6525292266440156
  Mean Squared Error: 530925699.8443685
Ridge:
  R2 Score: 0.6524957885646494
  Mean Squared Error: 530976792.30177855
