In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation and chained-assignment warnings.
warnings.filterwarnings('ignore')
# Lift pandas' display limits so that all columns and all rows are rendered.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
# Load both CSV splits and stack them into one frame so they share the same preprocessing.
train=pd.read_csv('laptops_train.csv')
test=pd.read_csv('laptops_test.csv')
# ignore_index=True gives the combined frame a unique 0..n-1 index. Resetting each
# part separately and then concatenating would leave every label from the second
# file duplicated, which makes later index-aligned assignments fragile.
df=pd.concat([train,test],ignore_index=True)
df.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,11912523.48
1,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,7993374.48
2,HP,250 G6,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,5112900.0
3,Apple,MacBook Pro,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,22563005.4
4,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37kg,16037611.2


In [4]:
# Replace missing OS versions with the column's most frequent value.
df['Operating System Version'] = df['Operating System Version'].fillna(
    df['Operating System Version'].mode().iloc[0]
)

In [5]:
def order(x,num):
    """Return the index of the first `num` entries of `df[x].value_counts()`.

    Reads the notebook-level frame `df`; `x` is a column name and `num` is how
    many of the leading value-count entries to keep.
    """
    counts = df[x].value_counts()
    leading = counts.head(num)
    return leading.index

In [6]:
# Restrict the data to the ten manufacturers with the most rows.
# The selection is named for what it holds (manufacturers, not screen sizes).
top_manufacturers=df['Manufacturer'].value_counts().index[:10]
df=df[df['Manufacturer'].isin(top_manufacturers)]

In [7]:
# The model name is not used as a feature; remove the column.
df = df.drop(columns='Model Name')

In [8]:
# Keep only the six most frequent screen sizes (the slice [0:6] selects six entries).
# NOTE(review): the earlier comment said "top 5" - confirm whether 5 or 6 was intended.
screen_list=list(df['Screen Size'].value_counts().index[0:6])
df=df[df['Screen Size'].isin(screen_list)]

In [9]:
# Strip the trailing inch mark (") from the size strings and store them as floats.
df['Screen Size'] = (
    df['Screen Size']
    .str.replace('"', '')
    .astype('float')
)

In [10]:

# 1 when the screen description mentions "Touchscreen" (any letter case), otherwise 0.
df['Touchscreen'] = (
    df['Screen']
    .str.contains('Touchscreen', case=False)
    .astype('int')
)

In [11]:
# 1 when the screen description mentions "Ips" (any letter case), otherwise 0.
df['Ips'] = (
    df['Screen']
    .str.contains('Ips', case=False)
    .astype('int')
)

In [12]:
# The screen description contains a "<digits>x<digits>" resolution.
# xres is the run of digits before the "x"; yres is the run of digits after it.
# In the patterns, \d+ means "one or more digits" and the parentheses mark the part to capture.
df['xres'] = df['Screen'].str.extract(r'(\d+)x\d+', expand=False).astype('int')
df['yres'] = df['Screen'].str.extract(r'\d+x(\d+)', expand=False).astype('int')
# The raw description has now been mined for every feature taken from it.
df = df.drop(columns='Screen')

In [13]:
# PPI: length of the pixel diagonal, sqrt(xres^2 + yres^2), divided by the screen size.
df['PPI'] = ((df['xres']**2 + df['yres']**2) ** 0.5) / df['Screen Size']

In [14]:
# xres and yres were only needed to derive PPI; remove them.
df = df.drop(columns=['xres', 'yres'])

In [15]:
# Keep only the first three whitespace-separated words of the CPU description
# (e.g. "Intel Core i5 7200U 2.5GHz" -> "Intel Core i5").
# A plain callable is passed to apply so the result is a Series; wrapping the
# function in a list makes pandas return a one-column DataFrame instead.
df['Pro_Name']=df['CPU'].apply(lambda x:' '.join(x.split()[0:3]))
df['Pro_Name'].head()

0         Intel Core i5
1         Intel Core i5
2         Intel Core i5
4         Intel Core i5
5    AMD A9-Series 9420
Name: Pro_Name, dtype: object

In [16]:
def extractPro(x):
    """Collapse a processor name into one of four buckets.

    Returns `x` unchanged when it is exactly 'Intel Core i5', 'Intel Core i7'
    or 'Intel Core i3'; every other value is mapped to 'Others'.
    """
    kept_names = ('Intel Core i5', 'Intel Core i7', 'Intel Core i3')
    return x if x in kept_names else 'Others'
    
# Overwrite CPU with its bucketed processor name, then discard the helper column.
df['CPU'] = df['Pro_Name'].map(extractPro)
df = df.drop(columns='Pro_Name')

In [17]:
# Remove the "GB" suffix and store RAM as an integer.
df['RAM'] = (
    df['RAM']
    .str.replace('GB', '')
    .astype('int')
)

In [18]:

# Keep only rows whose storage description is one of the 12 most common ones.
top=df[' Storage'].value_counts().head(12).index
# .copy() makes the filtered frame independent, so the column assignments below
# write to this frame rather than to a view of the previous one.
df=df[df[' Storage'].isin(top)].copy()
# Normalise capacities to plain numbers: drop "GB" and turn "TB" into three zeros
# (so "1TB" becomes "1000", the same scale as the GB values).
df[' Storage']=df[' Storage'].str.replace('GB','')
df[' Storage']=df[' Storage'].str.replace('TB','000')
# Split at the first "+" into a first and an optional second storage device.
first = df[" Storage"].str.split("+", n = 1, expand = True)

df['part1']=first[0].str.strip()
# Rows without a second device have no value here; use '0' for them.
# Assigning the filled result (instead of calling fillna(inplace=True) on the
# selected column) guarantees the frame itself is updated.
df['part2']=first[1].fillna('0')

In [19]:
# For each storage part (1 and 2) and each storage type, add a 0/1 column
# "Storage<part>_<type>" that is 1 when the part's text contains the type name.
df = df.assign(**{
    f"Storage{slot}_{kind.replace(' ', '_')}": df[f"part{slot}"].apply(
        lambda x, kind=kind: 1 if kind in x else 0
    )
    for slot in (1, 2)
    for kind in ("SSD", "HDD", "Flash Storage")
})

In [20]:
# Strip every non-digit character so only the capacity number remains, then cast to int.
# regex=True is stated explicitly: from pandas 2.0 on, str.replace treats the pattern
# as a literal string by default, so r'\D' would match nothing and the int cast would fail.
df['part1']=df['part1'].str.replace(r'\D','',regex=True).astype(int)
df['part2'] = df['part2'].str.replace(r'\D','',regex=True).astype(int)
df['part1'].dtype

dtype('int32')

In [21]:
# Capacity per storage type: each part's number multiplied by its 0/1 type flag,
# summed over the two parts.
df["SSD"] = df["part1"] * df["Storage1_SSD"] + df["part2"] * df["Storage2_SSD"]
df["HDD"] = df["part1"] * df["Storage1_HDD"] + df["part2"] * df["Storage2_HDD"]
df["Flash_Storage"] = (
    df["part1"] * df["Storage1_Flash_Storage"]
    + df["part2"] * df["Storage2_Flash_Storage"]
)

# Remove the raw storage text and every intermediate helper column.
df = df.drop(columns=[
    ' Storage',
    'part1', 'part2',
    'Storage1_SSD', 'Storage1_HDD', 'Storage1_Flash_Storage',
    'Storage2_SSD', 'Storage2_HDD', 'Storage2_Flash_Storage',
])


In [22]:
# Reduce the GPU description to its first word (the text before the first space).
df['GPU'] = df['GPU'].str.split(' ', n=1).str[0]
df['GPU'].value_counts()

Intel     627
Nvidia    381
AMD       166
Name: GPU, dtype: int64

In [23]:
# Operating System Version is not used as a feature.
# Author's rationale: most laptops ship with the OS pre-installed, and the data is
# dominated by Windows / Windows 10.
df = df.drop(columns='Operating System Version')

In [24]:
# Take the first number (integer or decimal) from the weight text and store it as a float.
df['Weight'] = (
    df['Weight']
    .str.extract(r'(\d+\.?\d*)', expand=False)
    .astype('float')
)

In [25]:
# Preview the first rows of the processed frame.
df.head()

Unnamed: 0,Manufacturer,Category,Screen Size,CPU,RAM,GPU,Operating System,Weight,Price,Touchscreen,Ips,PPI,SSD,HDD,Flash_Storage
0,Apple,Ultrabook,13.3,Intel Core i5,8,Intel,macOS,1.37,11912523.48,0,1,226.983005,128,0,0
2,HP,Notebook,15.6,Intel Core i5,8,Intel,No OS,1.86,5112900.0,0,0,141.211998,256,0,0
4,Apple,Ultrabook,13.3,Intel Core i5,8,Intel,macOS,1.37,16037611.2,0,1,226.983005,256,0,0
5,Acer,Notebook,15.6,Others,4,AMD,Windows,2.1,3556800.0,0,0,100.45467,0,500,0
8,Asus,Ultrabook,14.0,Intel Core i7,16,Nvidia,Windows,1.3,13293540.0,0,0,157.350512,512,0,0


In [26]:
# List the columns that remain after preprocessing.
df.columns

Index(['Manufacturer', 'Category', 'Screen Size', 'CPU', 'RAM', 'GPU',
       'Operating System', 'Weight', 'Price', 'Touchscreen', 'Ips', 'PPI',
       'SSD', 'HDD', 'Flash_Storage'],
      dtype='object')

In [27]:
from sklearn.model_selection import train_test_split
# Features are every column except the target; the target is the natural log of Price.
x=df.drop('Price',axis=1)
y=np.log(df['Price'])
# Hold out 20% of the rows for evaluation. A fixed random_state makes the split,
# the reported metrics and positional look-ups into x_test / y_test repeatable.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

# Model Building

In [28]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score,mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

In [29]:
# Dense one-hot encoder that drops the first level of each categorical column.
# The dense-output keyword is `sparse_output` in scikit-learn >= 1.2 and `sparse`
# in older releases (removed in 1.4), so try the current spelling first.
try:
    ohe = OneHotEncoder(sparse_output=False,drop='first')
except TypeError:
    ohe = OneHotEncoder(sparse=False,drop='first')

# Positions 0, 1, 3, 5, 6 of the feature frame are the text columns
# (Manufacturer, Category, CPU, GPU, Operating System); all other columns
# are passed through unchanged.
step1 = ColumnTransformer(transformers=[
    ('Ohe',ohe,[0,1,3,5,6])
],remainder='passthrough')

step2 = XGBRegressor()

# Encoding followed by the regressor, fitted and applied as one estimator.
pipe = Pipeline([
    ('step1',step1),
    ('step2',step2)
])

pipe.fit(x_train,y_train)

y_pred = pipe.predict(x_test)

# Both metrics are on the log-price scale, because the target is log(Price).
print('R2 score',r2_score(y_test,y_pred))
print('MAE',mean_absolute_error(y_test,y_pred))

R2 score 0.9061022882398017
MAE 0.1401250733234124


In [30]:
import pickle
# Persist the fitted pipeline. The context manager closes the file handle
# (flushing the data to disk) even if pickling raises.
with open('pipe.pkl','wb') as model_file:
    pickle.dump(pipe,model_file)

In [31]:
# Take the feature row at position 8 of the test split as a dict (column name -> value).
r1=dict(x_test.iloc[8])
r1

{'Manufacturer': 'MSI',
 'Category': 'Gaming',
 'Screen Size': 17.3,
 'CPU': 'Intel Core i7',
 'RAM': 16,
 'GPU': 'Nvidia',
 'Operating System': 'Windows',
 'Weight': 4.5,
 'Touchscreen': 0,
 'Ips': 0,
 'PPI': 127.33567457123111,
 'SSD': 128,
 'HDD': 1000,
 'Flash_Storage': 0}

In [32]:
# Show the row's values wrapped in a one-element list.
[r1.values()]

[dict_values(['MSI', 'Gaming', 17.3, 'Intel Core i7', 16, 'Nvidia', 'Windows', 4.5, 0, 0, 127.33567457123111, 128, 1000, 0])]

In [33]:
# pd.DataFrame([r1.values()],columns=r1.keys())

In [34]:
# Show the target (log of Price) at position 2 of the test split.
# NOTE(review): r1 was taken from position 8 of x_test - confirm whether the same row was intended here.
y_test.iloc[2]

15.293416449126777