In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import re

sns.set()


In [2]:
df_lap = pd.read_csv('laptop_price.csv', encoding='latin-1')
df_lap.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6


## Data cleaning

In [3]:
df_lap.isna().sum()

laptop_ID           0
Company             0
Product             0
TypeName            0
Inches              0
ScreenResolution    0
Cpu                 0
Ram                 0
Memory              0
Gpu                 0
OpSys               0
Weight              0
Price_euros         0
dtype: int64

In [4]:
df_lap.dtypes

laptop_ID             int64
Company              object
Product              object
TypeName             object
Inches              float64
ScreenResolution     object
Cpu                  object
Ram                  object
Memory               object
Gpu                  object
OpSys                object
Weight               object
Price_euros         float64
dtype: object

In [5]:

df_lap['Weight'] = df_lap['Weight'].apply(lambda x: re.search('[+-]?([0-9]*[.])?[0-9]+', x).group())
df_lap['Weight'] = df_lap.Weight.astype('float')

df_lap.drop('laptop_ID', inplace=True, axis=1)

## Data Processing

In [6]:
#split train and target data

train, train_target = df_lap.iloc[:, :-1], df_lap.iloc[:, -1]

In [7]:
num_col = [x for x in train.columns if train[x].dtype != 'object']
str_col = [x for x in train.columns if train[x].dtype == 'object']

In [8]:
#standardize numerical columns

for i in num_col:
    train[i] = (train[i] - train[i].mean()) / train[i].std()

In [9]:
train_target = np.log(train_target)

In [10]:
train = pd.get_dummies(train)

In [11]:
train

Unnamed: 0,Inches,Weight,Company_Acer,Company_Apple,Company_Asus,Company_Chuwi,Company_Dell,Company_Fujitsu,Company_Google,Company_HP,...,Gpu_Nvidia Quadro M620M,OpSys_Android,OpSys_Chrome OS,OpSys_Linux,OpSys_Mac OS X,OpSys_No OS,OpSys_Windows 10,OpSys_Windows 10 S,OpSys_Windows 7,OpSys_macOS
0,-1.203945,-1.004897,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,-1.203945,-1.049978,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0.408615,-0.268581,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
3,0.268392,-0.313661,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,-1.203945,-1.004897,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1298,-0.713166,-0.358742,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1299,-1.203945,-1.110085,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1300,-0.713166,-0.809548,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1301,0.408615,0.227306,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0


## train a Deep neuaral network

In [12]:
# split train and validation

train_inputs, train_targets = train[:int(.8 * len(train))], train_target[:int(.8 * len(train_target))]
validation_inputs, validation_targets = train[int(.8 * len(train)):], train_target[int(.8 * len(train_target)):]

In [13]:
#train a dnn

input_size = train_inputs.shape[1]
hidden_size = 100
output_size = 1

model = tf.keras.Sequential([
    tf.keras.layers.Dense(input_size, kernel_initializer='he_normal'),
    tf.keras.layers.Dropout(.1),
    tf.keras.layers.Dense(hidden_size, activation='relu'),
    tf.keras.layers.Dropout(.1),
    tf.keras.layers.Dense(hidden_size, activation='relu'),
    tf.keras.layers.Dropout(.1),
    tf.keras.layers.Dense(output_size, activation='relu')
])

opt = tf.keras.optimizers.Adam(
        learning_rate=.0001
    )

model.compile(optimizer=opt, loss='mean_absolute_percentage_error')

batch_size = 32
max_epochs = 50

model.fit(
    train_inputs,
    train_targets,
    epochs = max_epochs,
    batch_size = batch_size,
    validation_data=(validation_inputs, validation_targets),
    verbose=2
)



User settings:

   KMP_AFFINITY=granularity=fine,verbose,compact,1,0
   KMP_BLOCKTIME=0
   KMP_SETTINGS=1

Effective settings:

   KMP_ABORT_DELAY=0
   KMP_ADAPTIVE_LOCK_PROPS='1,1024'
   KMP_ALIGN_ALLOC=64
   KMP_ALL_THREADPRIVATE=128
   KMP_ATOMIC_MODE=2
   KMP_BLOCKTIME=0
   KMP_CPUINFO_FILE: value is not defined
   KMP_DETERMINISTIC_REDUCTION=false
   KMP_DEVICE_THREAD_LIMIT=2147483647
   KMP_DISP_NUM_BUFFERS=7
   KMP_DUPLICATE_LIB_OK=false
   KMP_ENABLE_TASK_THROTTLING=true
   KMP_FORCE_REDUCTION: value is not defined
   KMP_FOREIGN_THREADS_THREADPRIVATE=true
   KMP_FORKJOIN_BARRIER='2,2'
   KMP_FORKJOIN_BARRIER_PATTERN='hyper,hyper'
   KMP_GTID_MODE=3
   KMP_HANDLE_SIGNALS=false
   KMP_HOT_TEAMS_MAX_LEVEL=1
   KMP_HOT_TEAMS_MODE=0
   KMP_INIT_AT_FORK=true
   KMP_LIBRARY=throughput
   KMP_LOCK_KIND=queuing
   KMP_MALLOC_POOL_INCR=1M
   KMP_NUM_LOCKS_IN_BLOCK=1
   KMP_PLAIN_BARRIER='2,2'
   KMP_PLAIN_BARRIER_PATTERN='hyper,hyper'
   KMP_REDUCTION_BARRIER='1,1'
   KMP_REDUCTION_BAR

Epoch 1/50
33/33 - 1s - loss: 92.2015 - val_loss: 82.3825
Epoch 2/50
33/33 - 0s - loss: 62.2388 - val_loss: 37.6786
Epoch 3/50
33/33 - 0s - loss: 18.2206 - val_loss: 12.4026
Epoch 4/50
33/33 - 0s - loss: 11.2583 - val_loss: 8.6360
Epoch 5/50
33/33 - 0s - loss: 9.1112 - val_loss: 7.2313
Epoch 6/50
33/33 - 0s - loss: 8.6753 - val_loss: 6.4872
Epoch 7/50
33/33 - 0s - loss: 8.3030 - val_loss: 5.7362
Epoch 8/50
33/33 - 0s - loss: 7.7995 - val_loss: 5.6435
Epoch 9/50
33/33 - 0s - loss: 7.4865 - val_loss: 5.3838
Epoch 10/50
33/33 - 0s - loss: 7.6796 - val_loss: 5.7402
Epoch 11/50
33/33 - 0s - loss: 7.3338 - val_loss: 4.9780
Epoch 12/50
33/33 - 0s - loss: 7.1372 - val_loss: 5.8553
Epoch 13/50
33/33 - 0s - loss: 7.0183 - val_loss: 4.9777
Epoch 14/50
33/33 - 0s - loss: 7.1855 - val_loss: 5.3446
Epoch 15/50
33/33 - 0s - loss: 7.1613 - val_loss: 5.0513
Epoch 16/50
33/33 - 0s - loss: 7.0202 - val_loss: 4.5495
Epoch 17/50
33/33 - 0s - loss: 6.8739 - val_loss: 6.7246
Epoch 18/50
33/33 - 0s - loss: 6.

<keras.callbacks.History at 0x7f705b898450>

### we got a 96% on the validation set

## lets try some other regression algorithms

In [14]:
from sklearn.ensemble import RandomForestRegressor

r_reg = RandomForestRegressor()
r_reg.fit(train_inputs, train_targets)
r_reg.score(validation_inputs, validation_targets)

0.8228506453266159

In [15]:
from sklearn.tree import DecisionTreeRegressor 

d_reg = DecisionTreeRegressor()
d_reg.fit(train_inputs, train_targets)
d_reg.score(validation_inputs, validation_targets)

0.7647301335886396

### we get on the first 81% and on the second 76% which is way lower than our deep netwok