#------------------------------------------------------------------------------
### STEP 0: INITIALIZE LIBRARIES
#------------------------------------------------------------------------------

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from random import random
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.metrics import mean_absolute_error
import matplotlib
import matplotlib.pyplot as plt

In [2]:
# Load data
df_laptops_original = pd.read_csv("../data/train.csv", sep=';')
df_laptops = df_laptops_original.copy()

### Cleaning data and solving missing values

In [3]:
# Get all touchscreen values to lower case letters (f.ex: from 'Glossy' --> 'glossy')
df_laptops['screen_surface'].replace({'Glossy': 'glossy', 'Matte': 'matte'}, inplace=True)

In [4]:
# Detect missing values
df_laptops.fillna(value=np.nan,inplace=True)
null_data = df_laptops[df_laptops.isnull().any(axis=1)]
df_laptops.columns[df_laptops.isnull().any()]  # ['screen_surface', 'cpu_details', 'detachable_keyboard', 'gpu', 'os', ...
       # 'os_details', 'weight']

Index(['screen_surface', 'cpu_details', 'detachable_keyboard', 'gpu', 'os',
       'os_details', 'weight'],
      dtype='object')

In [5]:
# Replace NaN screen_surface values with 'glossy' or 'matte' at random
for i in range(0,len(df_laptops)):
    if type(df_laptops.screen_surface[i]) == float:
        if random() >= 0.5:
            df_laptops.at[i,'screen_surface'] = 'glossy'
        else:
            df_laptops.at[i,'screen_surface'] = 'matte'

In [6]:
# Replace weight missing values by the median of all the weights
df_laptops['weight'] = df_laptops['weight'].fillna(value=df_laptops['weight'].median());

In [7]:
# Replace missing OS and OS_details based on brand
for i in range(0,len(df_laptops)):
    if type(df_laptops.os[i]) == float:        # When the value is missing
        if 'apple' in df_laptops.brand[i].lower():
            df_laptops.at[i,'os_details'] = 'macOS Catalina'
            df_laptops.at[i,'os'] = 'macOS'
        else:
            df_laptops.at[i,'os_details'] = 'Windows'
            df_laptops.at[i,'os'] = 'Windows 10'

In [8]:
# df_laptops.describe()
# Make index and column arrays to then use in the creation of the new df
index_array = np.array(range(0,len(df_laptops)))
column_names_array = np.array(list(df_laptops.columns), dtype=object)

In [9]:
# replace rest of missing values with most frequent simple imputer
imp = SimpleImputer(strategy="most_frequent")
temp_array = imp.fit_transform(df_laptops)

no_nulls_df = pd.DataFrame(data=temp_array[0:,0:],index=index_array, columns=column_names_array)

In [10]:
# Last missing values check
null_data = no_nulls_df[no_nulls_df.isnull().any(axis=1)]

### Feature engineering to-do's (just ideas)
- Partition screen sizes to big-medium-small
    - Small	if pixels 640px or less	and screen size: 4" to 6"; 20" to 65"
    - Medium if pixels 641px to 1007px and screen size: 7" to 12"
    - Large if pixels 1008px or greater and screen size: 13" and larger
- Divide 'cpu' column in cpu_brand (=AMD/Intel) and cpu_spec (=i7/i5/... for Intel or Pentium/Celeron/Ryzen/A8... for AMD)
- Divide GPU column in Intel/NVIDIA/AMD and for NVIDIA and AMD levels according to their series 
    - f.ex: (NVIDIA --> 20/16/10/900M/..., RADEON --> RX 5000/VII/RX VEGA/...)
- Find a suitable brand ranking
- Divide weight up in high/medium/low

In [11]:
# Remove unimportant columns
most_important_features_df = no_nulls_df.copy()
most_important_features_df = most_important_features_df.drop(columns = ['id',
                                                                        'name',         # Name information is to be found in other columns
                                                                        'base_name',    # Base name is partially in brand
                                                                        'os',           # OS_details is more important
                                                                        'discrete_gpu', # Information contained in GPU information
                                                                        'cpu_details']) # Only CPU column is good
most_important_features_df.head()

Unnamed: 0,brand,screen_size,pixels_x,pixels_y,screen_surface,touchscreen,cpu,detachable_keyboard,gpu,os_details,ram,ssd,storage,weight,min_price,max_price
0,Lenovo,15.6,1920,1080,glossy,1,Intel Core i7,0,Intel HD,Windows 10,8,0,1000,4.6,899.0,899.0
1,Razer,15.6,1920,1080,matte,0,Intel Core i7,0,NVIDIA GeForce RTX 2070 Max-Q,Windows 10 Home,16,512,512,4.63,2099.99,2099.99
2,HP,15.6,1366,768,matte,0,AMD A6,0,AMD Radeon R4,Windows 10,8,0,500,4.63,439.0,449.0
3,Acer,15.6,1920,1080,matte,0,Intel Core i3,0,Intel UHD 620,Windows 10 Home,6,0,1000,5.3,375.0,449.0
4,HP,17.3,1600,900,glossy,0,Intel Core i5,0,Intel HD 620,Windows 10,8,0,1000,5.8,559.0,559.0


In [12]:
# One-hot encoding
temp_df = most_important_features_df
# df_brand = pd.DataFrame({'brand':list(temp_df.brand.unique())})

# Encode glossy and matte as 1 and 0 respectively
for i in range(0,len(temp_df)):
    if temp_df.screen_surface[i] == 'glossy':
        temp_df.at[i,'screen_surface'] = 1
    else:
        temp_df.at[i,'screen_surface'] = 0

# pd.get_dummies(df_brand,prefix=['brand'])
brand_df = pd.get_dummies(temp_df['brand'],prefix='brand')
cpu_df = pd.get_dummies(temp_df['cpu'],prefix='cpu')
gpu_df = pd.get_dummies(temp_df['gpu'],prefix='gpu')
os_df = pd.get_dummies(temp_df['os_details'],prefix='os_details')

In [13]:
final_df = pd.concat([brand_df, cpu_df, gpu_df, os_df, temp_df], axis=1)

In [14]:
# Drop unnesecary columns
final_df = final_df.drop(['brand', 'cpu', 'gpu', 'os_details'], axis=1)

In [15]:
# Divide training set into inputs and targets
input_features = final_df[list(final_df.columns)[:-2]]
targets = final_df[list(final_df.columns)[-2:]]

# Divide dataset into training and test set
X_train, X_test, y_train, y_test = train_test_split(input_features, targets, test_size=0.20, random_state=42)

## Modeling & training

In [16]:
# Create the model with 100 trees
model = RandomForestClassifier(n_estimators=4500, 
                               max_features = 'sqrt',
                               n_jobs=-1, verbose = 1)

In [17]:
y_train=y_train.astype('int')

In [18]:
# Fit on training data
model.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:   14.5s
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed:   26.1s
[Parallel(n_jobs=-1)]: Done 4500 out of 4500 | elapsed:   28.8s finished


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=4500,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=1,
                       warm_start=False)

In [19]:
n_nodes = []
max_depths = []

for ind_tree in model.estimators_:
    n_nodes.append(ind_tree.tree_.node_count)
    max_depths.append(ind_tree.tree_.max_depth)
    
print(f'Average number of nodes {int(np.mean(n_nodes))}')
print(f'Average maximum depth {int(np.mean(max_depths))}')

Average number of nodes 505
Average maximum depth 29


## Testing the model

In [20]:
train_rf_predictions = model.predict(X_train)
train_rf_probs = model.predict_proba(X_train)

rf_predictions = model.predict(X_test)
rf_probs = model.predict_proba(X_test)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    1.8s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    4.8s
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:    8.2s
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:   11.6s
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed:   15.4s
[Parallel(n_jobs=4)]: Done 3192 tasks      | elapsed:   20.9s
[Parallel(n_jobs=4)]: Done 4042 tasks      | elapsed:   24.6s
[Parallel(n_jobs=4)]: Done 4500 out of 4500 | elapsed:   26.0s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    2.5s
[Parallel(n_jobs=4)]: Done 792 

In [48]:
 def evaluate(train_rf_predictions,y_train,rf_predictions,y_test): # Pass raw predictions to function
    pred_list_train = train_rf_predictions.tolist()
    target_list_train = y_train.values.tolist()

    pred_list_test = rf_predictions.tolist()
    target_list_test = y_test.values.tolist()

    print(f'Absolute mean square error on training set: {mean_absolute_error(target_list_train, pred_list_train)*2}')
    print(f'Absolute mean square error on test set: {mean_absolute_error(pred_list_test, target_list_test)*2}')
    return '-------------- Evaluated --------------'

# Extra caution:
sum_errors_max = 0
sum_errors_min = 0
for i in range(0,len(pred_list_test)):
    sum_errors_min += abs(pred_list_test[i][0]-target_list_test[i][0])
    sum_errors_max += abs(pred_list_test[i][1]-target_list_test[i][1])

test_result = (sum_errors_min + sum_errors_max) / len(pred_list_test)
test_result

357.5257843137254

In [22]:
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(X_train,y_train)

In [29]:
rf_predictions = dt.predict(X_test)
train_rf_predictions = dt.predict(X_train)

print(evaluate(train_rf_predictions,y_train,rf_predictions,y_test))

Absolute mean square error on training set: 4.877450980392156
Absolute mean square error on test set: 455.26715686274497
Evaluated


In [46]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=4500, 
                               max_features = 'sqrt',
                               n_jobs=-1, verbose = 1)
rf.fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done 4500 out of 4500 | elapsed:   12.8s finished


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=4500, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=1, warm_start=False)

In [47]:
rf_predictions = rf.predict(X_test)
train_rf_predictions = rf.predict(X_train)

print(evaluate(train_rf_predictions,y_train,rf_predictions,y_test))

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 2442 tasks      | elapsed:    0.7s
[Parallel(n_jobs=4)]: Done 3192 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 4042 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done 4500 out of 4500 | elapsed:    1.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 792 

Absolute mean square error on training set: 104.8806179421235
Absolute mean square error on test set: 350.520734526403
-------------- 
  Evaluated 
--------------
