# Results
This notebook is to compare the results of predicting min price and max price of the different techniques: random forest, gradient boosting, random forest with log transforming target variable, and gradient boosting with log transforming target variable. Random forest with log transforming target variable turns out to be the best.

To better understand this notebook take a look at the other notebooks to understand how EDA, feature engineering, missing data and hyperparameter tuning is done. Other notebook contain plots and explanation.
## Load data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_absolute_error
df =pd.read_csv('train.csv')




## Feature engineering
Do all manipulations on the data.

In [2]:
# Drop rows where ''screen_surface', 'weight' and 'detachable_keyboard' has a missing value
df.dropna(subset=['screen_surface'], how='any', inplace=True)
df.dropna(subset=['detachable_keyboard'], how='any', inplace=True)
df.dropna(subset=['weight'], how='any', inplace=True)


df=df.drop(columns=['pixels_y'])
df=df.drop(columns=['name'])
df=df.drop(columns=[ 'os_details'])
df=df.drop(columns=[ 'cpu_details'])
df=df.drop(columns=[ 'base_name'])
df=df.drop(columns=[ 'gpu'])
df=df.drop(columns=[ 'id'])


countries = df['brand']
country_counts = countries.value_counts()
mask = df['brand'].isin(country_counts[country_counts < 9].index)
# Label all other categories as Other
df['brand'][mask] = 'Other1'

countries = df['cpu']
country_counts = countries.value_counts()
mask = df['cpu'].isin(country_counts[country_counts < 9].index.drop('Intel Core i9'))
# Label all other categories as Other1
df['cpu'][mask] = 'Other1'
mask=df['cpu'].isin(['Intel Pentium','AMD A6'])
df['cpu'][mask] = 'Other1'


df['screen_surface']=df['screen_surface'].str.lower()
df=pd.get_dummies(df, columns=['screen_surface'], drop_first=True, prefix='DM')
df=pd.get_dummies(df, columns=['os'], drop_first=True, prefix='DM')
df=pd.get_dummies(df, columns=['brand'], drop_first=True, prefix='DM')
df=pd.get_dummies(df, columns=['cpu'], drop_first=True, prefix='DM')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['brand'][mask] = 'Other1'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cpu'][mask] = 'Other1'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cpu'][mask] = 'Other1'


## Max price

In [22]:
X=df.drop(columns=['min_price', 'max_price'])
y_log=df['max_price'].apply(np.log) ##take logtransformation of max price
y=df['max_price']
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
n=5
score=np.zeros((8, n))
for i in range(n):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=37+i)
    
    
    ##Random forest
    
    # Setup the hyperparameter grid
    grid = {'n_estimators': [200], 'max_depth': [3,5,9], 'max_features': [4,6,8,12], 'random_state': [42]} 
    # Instantiate rfr
    rfr = RandomForestRegressor()
    # Instantiate the RandomizedSearchCV object: gbr_cv
    rfr_cv = RandomizedSearchCV(rfr, grid,n_iter=10, cv=5, scoring='neg_mean_absolute_error')
    # Fit it to the data
    rfr_cv.fit(X_train,y_train)

    rfr.set_params(**rfr_cv.best_params_)
    rfr.fit(X_train,y_train)
    train_predictions = rfr.predict(X_train)
    test_predictions = rfr.predict(X_test)
    
    score[0][i]=mean_absolute_error(y_train,  train_predictions)
    score[1][i]=mean_absolute_error(y_test,  test_predictions)
    
    ## Gradient boost

    # Setup the hyperparameter grid
    grid = {'n_estimators': [200], 'learning_rate': [0.01,0.05,0.1], 'max_features': [4,6,8,12], 'subsample':[0.3,0.6,0.8], 'random_state': [42]}
    # Instantiate gbr
    gbr = GradientBoostingRegressor()
    # Instantiate the RandomizedSearchCV object: gbr_cv
    gbr_cv = RandomizedSearchCV(gbr, grid,n_iter=25, cv=5, scoring='neg_mean_absolute_error')
    # Fit it to the data
    gbr_cv.fit(X_train,y_train)

    gbr.set_params(**gbr_cv.best_params_)
    gbr.fit(X_train,y_train)
    train_predictions = gbr.predict(X_train)
    test_predictions = gbr.predict(X_test)

    score[2][i]=mean_absolute_error(y_train,  train_predictions)
    score[3][i]=mean_absolute_error(y_test,  test_predictions)
    
    
    
    
    #with log(target)------------------------------------------------------------------------------------
    
    X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.20, random_state=37+i)
    ##Random forrest
    
    # Setup the hyperparameter grid
    grid = {'n_estimators': [200], 'max_depth': [3,5,9], 'max_features': [4,6,8,12], 'random_state': [42]} 
    # Instantiate rfr
    rfr = RandomForestRegressor()
    # Instantiate the RandomizedSearchCV object: gbr_cv
    rfr_cv = RandomizedSearchCV(rfr, grid, n_iter=10, cv=5, scoring='neg_mean_absolute_error')
    # Fit it to the data
    rfr_cv.fit(X_train,y_train)

    rfr.set_params(**rfr_cv.best_params_)
    rfr.fit(X_train,y_train)
    train_predictions = np.exp(rfr.predict(X_train))
    test_predictions = np.exp(rfr.predict(X_test))
    
    score[4][i]=mean_absolute_error(np.exp(y_train),  train_predictions)
    score[5][i]=mean_absolute_error(np.exp(y_test),  test_predictions)
    
    ## Gradient boost

    # Setup the hyperparameter grid
    grid = {'n_estimators': [200], 'learning_rate': [0.01,0.05,0.1], 'max_features': [12,6,8,4], 'subsample':[0.3,0.6,0.8], 'random_state': [42]}
    # Instantiate gbr
    gbr = GradientBoostingRegressor()
    # Instantiate the RandomizedSearchCV object: gbr_cv
    gbr_cv = RandomizedSearchCV(gbr, grid, n_iter=25, cv=5, scoring='neg_mean_absolute_error')
    # Fit it to the data
    gbr_cv.fit(X_train,y_train)

    gbr.set_params(**gbr_cv.best_params_)
    gbr.fit(X_train,y_train)
    train_predictions = np.exp(gbr.predict(X_train))
    test_predictions = np.exp(gbr.predict(X_test))

    score[6][i]=mean_absolute_error(np.exp(y_train),  train_predictions)
    score[7][i]=mean_absolute_error(np.exp(y_test),  test_predictions)




In [15]:
score

array([[ 96.21530715,  96.54981008,  99.35355761,  96.67432904,
         97.19023736],
       [159.41551309, 155.15677492, 128.39322473, 160.77959469,
        156.10788101],
       [135.73071167, 134.03144477, 136.71586033, 135.1800637 ,
        117.24037706],
       [171.66411048, 172.55832311, 149.7940902 , 172.06260003,
        163.63791514],
       [ 99.50115818, 100.3992163 , 100.98304484,  97.48573268,
         97.45134849],
       [161.58549722, 147.79263493, 124.50559616, 153.85525307,
        154.60177412],
       [114.22098295, 118.83154195, 122.13275252, 102.36168284,
        116.26530484],
       [163.61769414, 159.47443002, 135.70153059, 165.06251604,
        159.55901825]])

In [23]:
scoresdf=pd.DataFrame(data=score, index=['rfr_train', 'rfr_test',"gbr_train", "gbr_test",'rfr_train_log',\
                                'rfr_test_log',"gbr_train_log", "gbr_test_log"], columns=["rs37", "rs38","rs39","rs40","rs41"])

In [25]:
scoresdf.mean(axis=1)

rfr_train         98.961388
rfr_test         152.893670
gbr_train        124.878027
gbr_test         163.494866
rfr_train_log     95.925332
rfr_test_log     149.321338
gbr_train_log    112.395985
gbr_test_log     158.718170
dtype: float64

We only have to look on how well the regressors perform on the test set. So Random forest with first log transform max price performs best: the mean MAE over 5 different random splits in test and train data is 149.

## Min price

In [26]:
df.drop(columns=['min_price', 'max_price'])
y_log=df['min_price'].apply(np.log) ##take logtransformation of max price
y=df['min_price']
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
n=3
score=np.zeros((8, n))
for i in range(n):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=37+i)
    
    
    ##Random forest
    
    # Setup the hyperparameter grid
    grid = {'n_estimators': [200], 'max_depth': [3,5,9], 'max_features': [4,6,8,12], 'random_state': [42]} 
    # Instantiate rfr
    rfr = RandomForestRegressor()
    # Instantiate the RandomizedSearchCV object: gbr_cv
    rfr_cv = RandomizedSearchCV(rfr, grid,n_iter=10, cv=5, scoring='neg_mean_absolute_error')
    # Fit it to the data
    rfr_cv.fit(X_train,y_train)

    rfr.set_params(**rfr_cv.best_params_)
    rfr.fit(X_train,y_train)
    train_predictions = rfr.predict(X_train)
    test_predictions = rfr.predict(X_test)
    
    score[0][i]=mean_absolute_error(y_train,  train_predictions)
    score[1][i]=mean_absolute_error(y_test,  test_predictions)
    
    ## Gradient boost

    # Setup the hyperparameter grid
    grid = {'n_estimators': [200], 'learning_rate': [0.01,0.05,0.1], 'max_features': [4,6,8,12], 'subsample':[0.3,0.6,0.8], 'random_state': [42]}
    # Instantiate gbr
    gbr = GradientBoostingRegressor()
    # Instantiate the RandomizedSearchCV object: gbr_cv
    gbr_cv = RandomizedSearchCV(gbr, grid,n_iter=20, cv=5, scoring='neg_mean_absolute_error')
    # Fit it to the data
    gbr_cv.fit(X_train,y_train)

    gbr.set_params(**gbr_cv.best_params_)
    gbr.fit(X_train,y_train)
    train_predictions = gbr.predict(X_train)
    test_predictions = gbr.predict(X_test)

    score[2][i]=mean_absolute_error(y_train,  train_predictions)
    score[3][i]=mean_absolute_error(y_test,  test_predictions)
    
    
    
    
    #with log(target)------------------------------------------------------------------------------------
    
    X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.20, random_state=37+i)
    ##Random forrest
    
    # Setup the hyperparameter grid
    grid = {'n_estimators': [200], 'max_depth': [3,5,9], 'max_features': [4,6,8,12], 'random_state': [42]} 
    # Instantiate rfr
    rfr = RandomForestRegressor()
    # Instantiate the RandomizedSearchCV object: gbr_cv
    rfr_cv = RandomizedSearchCV(rfr, grid, n_iter=10, cv=5, scoring='neg_mean_absolute_error')
    # Fit it to the data
    rfr_cv.fit(X_train,y_train)

    rfr.set_params(**rfr_cv.best_params_)
    rfr.fit(X_train,y_train)
    train_predictions = np.exp(rfr.predict(X_train))
    test_predictions = np.exp(rfr.predict(X_test))
    
    score[4][i]=mean_absolute_error(np.exp(y_train),  train_predictions)
    score[5][i]=mean_absolute_error(np.exp(y_test),  test_predictions)
    
    ## Gradient boost

    # Setup the hyperparameter grid
    grid = {'n_estimators': [200], 'learning_rate': [0.01,0.05,0.1], 'max_features': [12,6,8,4], 'subsample':[0.3,0.6,0.8], 'random_state': [42]}
    # Instantiate gbr
    gbr = GradientBoostingRegressor()
    # Instantiate the RandomizedSearchCV object: gbr_cv
    gbr_cv = RandomizedSearchCV(gbr, grid, n_iter=20, cv=5, scoring='neg_mean_absolute_error')
    # Fit it to the data
    gbr_cv.fit(X_train,y_train)

    gbr.set_params(**gbr_cv.best_params_)
    gbr.fit(X_train,y_train)
    train_predictions = np.exp(gbr.predict(X_train))
    test_predictions = np.exp(gbr.predict(X_test))

    score[6][i]=mean_absolute_error(np.exp(y_train),  train_predictions)
    score[7][i]=mean_absolute_error(np.exp(y_test),  test_predictions)

In [28]:
scoresdf=pd.DataFrame(data=score, index=['rfr_train', 'rfr_test',"gbr_train", "gbr_test",'rfr_train_log',\
                                'rfr_test_log',"gbr_train_log", "gbr_test_log"], columns=["rs37", "rs38","rs39"])
scoresdf

Unnamed: 0,rs37,rs38,rs39
rfr_train,94.418813,89.892766,95.731857
rfr_test,153.856743,156.508796,131.294116
gbr_train,121.330137,107.256286,135.334284
gbr_test,159.682048,170.31447,140.969743
rfr_train_log,91.977888,93.302255,92.094036
rfr_test_log,151.707037,151.240092,124.896922
gbr_train_log,111.660942,101.731656,94.950184
gbr_test_log,160.208018,159.247034,135.390251


In [29]:
scoresdf.mean(axis=1)

rfr_train         93.347812
rfr_test         147.219885
gbr_train        121.306902
gbr_test         156.988754
rfr_train_log     92.458060
rfr_test_log     142.614684
gbr_train_log    102.780927
gbr_test_log     151.615101
dtype: float64

We only have to look on how well the regressors perform on the test set. So Random forest with first log transform min price performs best: the mean MAE over 3 different random splits in test and train data is 143.

### Conclusion: average MAE=292