In [25]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from random import random
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import matplotlib
import matplotlib.pyplot as plt

In [2]:
# Load data
df_laptops_original = pd.read_csv("../data/train.csv", sep=';')
df_laptops = df_laptops_original.copy()

### Cleaning data and solving missing values

In [3]:
# Get all touchscreen values to lower case letters (f.ex: from 'Glossy' --> 'glossy')
df_laptops['screen_surface'].replace({'Glossy': 'glossy', 'Matte': 'matte'}, inplace=True)

In [4]:
# Detect missing values
df_laptops.fillna(value=np.nan,inplace=True)
null_data = df_laptops[df_laptops.isnull().any(axis=1)]
df_laptops.columns[df_laptops.isnull().any()]  # ['screen_surface', 'cpu_details', 'detachable_keyboard', 'gpu', 'os', ...
       # 'os_details', 'weight']

Index(['screen_surface', 'cpu_details', 'detachable_keyboard', 'gpu', 'os',
       'os_details', 'weight'],
      dtype='object')

In [5]:
# Replace NaN screen_surface values with 'glossy' or 'matte' at random
for i in range(0,len(df_laptops)):
    if type(df_laptops.screen_surface[i]) == float:
        if random() >= 0.5:
            df_laptops.at[i,'screen_surface'] = 'glossy'
        else:
            df_laptops.at[i,'screen_surface'] = 'matte'

In [6]:
# Replace weight missing values by the median of all the weights
df_laptops['weight'] = df_laptops['weight'].fillna(value=df_laptops['weight'].median());

In [7]:
# Replace missing OS and OS_details based on brand
for i in range(0,len(df_laptops)):
    if type(df_laptops.os[i]) == float:
        if 'apple' in df_laptops.brand[i].lower():
            df_laptops.at[i,'os_details'] = 'macOS Catalina'
            df_laptops.at[i,'os'] = 'macOS'
        else:
            df_laptops.at[i,'os_details'] = 'Windows'
            df_laptops.at[i,'os'] = 'Windows 10'

In [8]:
# df_laptops.describe()
# Make index and column arrays to then use in the creation of the new df
index_array = np.array(range(0,len(df_laptops)))
column_names_array = np.array(list(df_laptops.columns), dtype=object)

In [9]:
# replace rest of missing values with most frequent simple imputer
imp = SimpleImputer(strategy="most_frequent")
temp_array = imp.fit_transform(df_laptops)

no_nulls_df = pd.DataFrame(data=temp_array[0:,0:],index=index_array, columns=column_names_array)

In [55]:
# Last missing values check
null_data = no_nulls_df[no_nulls_df.isnull().any(axis=1)]

### Feature engineering to-do's
- Partition screen sizes to big-medium-small
    - Small	if pixels 640px or less	and screen size: 4" to 6"; 20" to 65"
    - Medium if pixels 641px to 1007px and screen size: 7" to 12"
    - Large if pixels 1008px or greater and screen size: 13" and larger
- Divide 'cpu' column in cpu_brand (=AMD/Intel) and cpu_spec (=i7/i5/... for Intel or Pentium/Celeron/Ryzen/A8... for AMD)
- Divide GPU column in Intel/NVIDIA/AMD and for NVIDIA and AMD levels according to their series 
    - f.ex: (NVIDIA --> 20/16/10/900M/..., RADEON --> RX 5000/VII/RX VEGA/...)
- Find a suitable brand ranking
- Divide weight up in high/medium/low

In [61]:
# Remove unimportant columns
most_important_features_df = no_nulls_df.copy()
most_important_features_df = most_important_features_df.drop(columns = ['id',
                                                                        'name',         # Name information is to be found in other columns
                                                                        'base_name',    # Base name is partially in brand
                                                                        'os',           # OS_details is more important
                                                                        'discrete_gpu', # Information contained in GPU information
                                                                        'cpu_details']) # Only CPU column is good
most_important_features_df.head()

Unnamed: 0,brand,screen_size,pixels_x,pixels_y,screen_surface,touchscreen,cpu,detachable_keyboard,gpu,os_details,ram,ssd,storage,weight,min_price,max_price
0,Lenovo,15.6,1920,1080,glossy,1,Intel Core i7,0,Intel HD,Windows 10,8,0,1000,4.6,899.0,899.0
1,Razer,15.6,1920,1080,matte,0,Intel Core i7,0,NVIDIA GeForce RTX 2070 Max-Q,Windows 10 Home,16,512,512,4.63,2099.99,2099.99
2,HP,15.6,1366,768,glossy,0,AMD A6,0,AMD Radeon R4,Windows 10,8,0,500,4.63,439.0,449.0
3,Acer,15.6,1920,1080,matte,0,Intel Core i3,0,Intel UHD 620,Windows 10 Home,6,0,1000,5.3,375.0,449.0
4,HP,17.3,1600,900,glossy,0,Intel Core i5,0,Intel HD 620,Windows 10,8,0,1000,5.8,559.0,559.0


In [72]:
# One-hot encoding
temp_df = most_important_features_df
# df_brand = pd.DataFrame({'brand':list(temp_df.brand.unique())})

# pd.get_dummies(df_brand,prefix=['brand'])
pd.get_dummies(temp_df['brand'],prefix='brand')

Unnamed: 0,brand_Acer,brand_Alienware,brand_Apple,brand_Asus,brand_Dell,brand_Google,brand_HP,brand_Huawei,brand_Jumper,brand_LG,brand_Lenovo,brand_MSI,brand_Microsoft,brand_Other,brand_RCA,brand_Razer,brand_Samsung,brand_Toshiba
0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
506,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
507,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
508,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [75]:
most_important_features_df

Unnamed: 0,brand,screen_size,pixels_x,pixels_y,screen_surface,touchscreen,cpu,detachable_keyboard,gpu,os_details,ram,ssd,storage,weight,min_price,max_price
0,Lenovo,15.6,1920,1080,glossy,1,Intel Core i7,0,Intel HD,Windows 10,8,0,1000,4.6,899,899
1,Razer,15.6,1920,1080,matte,0,Intel Core i7,0,NVIDIA GeForce RTX 2070 Max-Q,Windows 10 Home,16,512,512,4.63,2099.99,2099.99
2,HP,15.6,1366,768,glossy,0,AMD A6,0,AMD Radeon R4,Windows 10,8,0,500,4.63,439,449
3,Acer,15.6,1920,1080,matte,0,Intel Core i3,0,Intel UHD 620,Windows 10 Home,6,0,1000,5.3,375,449
4,HP,17.3,1600,900,glossy,0,Intel Core i5,0,Intel HD 620,Windows 10,8,0,1000,5.8,559,559
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,Dell,15.6,1366,768,glossy,1,Intel Core i3,0,Intel HD 4400,Windows 8.1,4,0,500,6,383.51,383.51
506,Asus,17.3,1920,1080,matte,0,Intel Core i7,0,NVIDIA GeForce GTX 1070,Windows 10 Home,12,128,1128,6.39,1899.99,1899.99
507,Asus,15.6,1920,1080,matte,0,Intel Core i5,0,Intel UHD 620,Windows 10 Home,8,128,1128,3.7,499.99,669.97
508,HP,17.3,1600,900,glossy,0,Intel Core i5,0,Intel HD 620,Windows 10,8,0,1000,5.84,548.29,548.29


In [49]:
# Divide training set into inputs and targets
input_features = most_important_features_df[list(most_important_features_df.columns)[:-2]]
targets = most_important_features_df[list(most_important_features_df.columns)[-2:]]

# Divide dataset into training and test set
X_train, X_test, y_train, y_test = train_test_split(input_features, targets, test_size=0.20, random_state=42)

## Modeling

In [63]:
# Create the model with 100 trees
model = RandomForestClassifier(n_estimators=100, 
                               max_features = 'sqrt',
                               n_jobs=-1, verbose = 1)

In [64]:
# Fit on training data
model.fit(X_train, y_train)

ValueError: could not convert string to float: 'Dell'