In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.feature_selection import mutual_info_regression
import lightgbm as lgbm
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
import pickle
import re
import os

In [13]:
# Load the laptop price listing from ../data relative to the notebook's
# working directory. `encoding_errors="ignore"` makes the reader skip bytes
# it cannot decode instead of raising.
df = pd.read_csv(
    os.path.join(os.pardir, "data", "laptop_price.csv"),
    encoding_errors="ignore",
)
df.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6


In [14]:
# Remove the `Inches` and `laptop_ID` columns before any feature engineering.
# NOTE: the frame is modified in place, so running this cell a second time
# without reloading the CSV raises KeyError (the columns are already gone).
df.drop(columns=["Inches", "laptop_ID"], inplace=True)

In [15]:
# `y` is the name of the target column; `X` is the Index of every other
# column name currently present in `df` (names only, not the data).
# NOTE(review): `X` is captured before the cleaning cell adds engineered
# columns and drops the raw text columns -- confirm it is recomputed before
# it is used to select features.
y = "Price_euros"
X = df.columns.drop(y)

In [23]:
def handle_storage_space(x):
    """Return the total storage capacity, in GB, described by a memory string.

    Every "<number>GB" or "<number>TB" quantity found in ``x`` is added up,
    counting 1 TB as 1000 GB. For example ``"128GB SSD +  1TB HDD"`` gives
    ``1128``.

    Parameters
    ----------
    x : str
        Free-text storage description such as ``"256GB SSD"``.

    Returns
    -------
    int
        Sum of all capacities in GB (each quantity rounded to the nearest
        whole GB); ``0`` when the string contains no capacity.
    """
    total_gb = 0
    # Raw string: "\d" in a normal string literal is an invalid escape.
    # The optional fractional part keeps "1.5TB" from being truncated to 1 TB,
    # and "\s*" accepts a space between the number and its unit ("1 TB"),
    # which previously raised IndexError on the bare "TB" token.
    for amount, unit in re.findall(r"(\d+(?:\.\d+)?)\s*(GB|TB)", x):
        gigabytes = float(amount) * (1000 if unit == "TB" else 1)
        total_gb += int(round(gigabytes))
    return total_gb

In [None]:
# Data cleaning & feature engineering: turn the raw text columns into numeric
# and categorical features, then drop the raw columns they were derived from.
# NOTE: this cell mutates `df` in place and is not idempotent -- re-running it
# without reloading the CSV fails because the raw columns are already gone.


def _strip_unit(series):
    """Remove a trailing alphabetic unit, e.g. "8GB" -> "8", "1.37kg" -> "1.37"."""
    # Matching the unit by pattern instead of slicing a fixed number of
    # characters keeps a suffix of unexpected length from leaking a stray
    # letter into the numeric conversion.
    return series.str.replace(r"\s*[A-Za-z]+$", "", regex=True)


# --- Ram / Weight: numeric text followed by a unit --------------------------
df["Ram"] = _strip_unit(df["Ram"]).astype("int8")
df["Weight"] = _strip_unit(df["Weight"]).astype("float16")

# --- Cpu: first token = manufacturer, last token = frequency, rest = model --
_cpu_tokens = df["Cpu"].str.split()
df["CPU_manufacturer"] = _cpu_tokens.str[0]
df["CPU_frequency"] = _strip_unit(_cpu_tokens.str[-1]).astype("float16")
# Middle tokens joined with dashes, e.g. "Intel Core i5 7200U 2.5GHz" -> "Core-i5-7200U".
df["CPU_model"] = _cpu_tokens.apply(lambda tokens: "-".join(tokens[1:-1]))

# --- ScreenResolution: the last token is "<width>x<height>" -----------------
# 0.0264583333 equals 2.54 / 96, i.e. it presumes 96 pixels per inch.
# NOTE(review): the resulting "area" is therefore derived from the pixel
# count only, not from the physical panel size -- confirm this is intended.
_PX_TO_CM = 0.0264583333
_resolution = df["ScreenResolution"].str.split().str[-1].str.split("x", expand=True)
width_lst = (_resolution[0].astype("int64") * _PX_TO_CM).values
height_lst = (_resolution[1].astype("int64") * _PX_TO_CM).values
df["screen_area_cm2"] = width_lst * height_lst

# One 0/1 indicator per marketing keyword found in the resolution text.
# regex=False matters: "HD+" must be matched literally, not as a pattern.
_screen_flags = {
    "is_4K": "4K Ultra HD",
    "is_touchscreen": "Touchscreen",
    "is_full_HD": "Full HD",
    "is_Quad": "Quad",
    "is_HD+": "HD+",
    "is_ips_panel": "IPS Panel",
    "is_retina_display": "Retina Display",
}
for _column, _marker in _screen_flags.items():
    df[_column] = df["ScreenResolution"].str.contains(_marker, regex=False).astype("int64")

# --- Memory: one 0/1 indicator per storage technology, plus totals ----------
_storage_flags = {
    "is_ssd": "SSD",
    "is_hdd": "HDD",
    "is_hybrid_storage": "Hybrid",
    "is_flash_storage": "Flash",
}
for _column, _marker in _storage_flags.items():
    df[_column] = df["Memory"].str.contains(_marker, regex=False).astype("int64")

# Number of distinct storage technologies mentioned for the laptop.
df["unique_storage_types"] = df[list(_storage_flags)].sum(axis=1)
df["total_storage"] = df["Memory"].apply(handle_storage_space)

# --- Gpu: first token is the manufacturer -----------------------------------
df["GPU_manufacturer"] = df["Gpu"].str.split().str[0]

# Drop the raw text columns now that their information has been extracted.
df.drop(columns=["ScreenResolution", "Cpu", "Memory", "Gpu"], inplace=True)