In [1]:
import random
import re

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.svm import SVR

In [2]:
%%javascript
// Raise the output-area auto-scroll threshold to 9999 so the long per-model
// result listings below are presumably rendered in full rather than in a scroll box.
// NOTE(review): `IPython.OutputArea` is a classic-notebook front-end object — confirm it exists in the front end in use.
IPython.OutputArea.auto_scroll_threshold = 9999

<IPython.core.display.Javascript object>

## ML preliminary testing

#### Create a helper class

In [3]:
class trainAndEvaluate:
    """Fit and score one regression model against the ``price`` column of a frame.

    Every column of ``df`` except ``'price'`` is used as a feature and
    ``'price'`` is the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and a ``'price'`` column.
    model : estimator
        Regressor passed to ``cross_validate`` in ``KFold_score`` and used via
        ``fit``/``predict`` in ``regression_eval_nonn``.

    Attributes
    ----------
    model_name : str
        Class name of ``model``; used in printed headers and the plot title.
    X : pandas.DataFrame
        All columns of ``df`` other than ``'price'``.
    y : pandas.Series
        The ``'price'`` column of ``df``.
    """

    def __init__(self,df,model):
        self.df = df
        self.model = model
        # Read the class name directly instead of regex-parsing str(model.__class__):
        # the regex only matched purely alphabetic names and raised IndexError
        # for class names containing digits or underscores.
        self.model_name = type(self.model).__name__
        self.X = self.df[[col for col in self.df.columns if col != 'price']]
        self.y = self.df['price']

    def KFold_score(self, n_folds = 10):
        """Print the mean R^2 and normalized RMSE from shuffled K-fold cross-validation.

        The folds are shuffled with a fixed ``random_state`` of 1. NRMSE is the
        mean of the per-fold RMSE values divided by the standard deviation of
        the full target ``self.y``.

        Parameters
        ----------
        n_folds : int, default 10
            Number of cross-validation splits.

        Returns
        -------
        None
            Results are printed only.
        """
        print('___________________________________________')
        print(f'{n_folds}-CV Results for {self.model_name} model.')
        print('___________________________________________')

        cv = KFold(n_splits = n_folds,
                   random_state = 1,
                   shuffle = True)

        # cross_validate reports each scorer under the key 'test_<scorer name>'.
        rmse_scorer = 'neg_root_mean_squared_error'
        scores = cross_validate(self.model,
                                self.X,
                                self.y,
                                scoring = [rmse_scorer, 'r2'],
                                cv = cv,
                                n_jobs = -1)

        R2 = scores['test_r2'].mean()
        # The scorer is negated RMSE, hence the abs() before normalizing.
        NRMSE = np.abs(scores[f'test_{rmse_scorer}']).mean()/(self.y.std())

        print(f'R^2 = {R2}')
        print(f'NRMSE = {NRMSE}')

    def regression_eval_nonn(self,
                             show_results = 1,
                             plot = 1,
                             target_log = 0,
                             random_state = None):
        """Fit on a single train/test split and report hold-out metrics.

        Parameters
        ----------
        show_results : bool-like, default 1
            When truthy, print NRMSE, R^2 and explained variance (and plot if
            ``plot`` is truthy). When falsy, return the metrics instead.
        plot : bool-like, default 1
            When truthy (and ``show_results`` is truthy), print the statistics
            of the actual-vs-predicted regression line and draw the plot.
        target_log : bool-like, default 0
            Currently unused; kept so existing calls keep working.
        random_state : int or None, default None
            Forwarded to ``train_test_split``; pass an int for a reproducible
            split. ``None`` keeps the original unseeded behaviour.

        Returns
        -------
        None or tuple
            ``None`` when ``show_results`` is truthy; otherwise a 3-tuple
            ``(placeholder_string, N_RMSE, R2)``.
        """
        train_input,test_input,train_output,test_output = train_test_split(self.X,
                                                                           self.y,
                                                                           random_state = random_state)

        self.model.fit(train_input, train_output)
        y_pred = self.model.predict(test_input)

        # Take the square root explicitly rather than passing `squared=0`:
        # that keyword was deprecated and later removed from mean_squared_error.
        RMSE = np.sqrt(mean_squared_error(test_output, y_pred))
        # Normalize by the spread of the hold-out target so the value is unit-free.
        N_RMSE = RMSE/(test_output.std())

        R2 = r2_score(test_output, y_pred)
        # EV is printed below; its computation had been commented out, causing a NameError.
        EV = explained_variance_score(test_output, y_pred)

        if not show_results:
            # NOTE(review): the first element is a literal placeholder string that was
            # already returned by the original code; it is kept so the tuple layout
            # seen by callers does not change.
            return ("""model_name,MAE,MSE,RMSE,""",N_RMSE,R2)

        print('---------------')
        print(f'{self.model_name} Results: ')
        print('---------------')
        # NRMSE is a ratio, so it is reported without a currency unit.
        print(f'NRMSE = {N_RMSE:.2f}')
        print(f'R^2 score = {R2:.2f}')
        print(f'EV = {EV:.2f}')

        if plot:
            # Line-of-best-fit statistics for actual vs. predicted values;
            # only needed for the plot, so computed here.
            slope, intercept, r_value, p_value, std_err = stats.linregress(test_output,y_pred)

            print('\nRegression Line Metrics: ')
            print('---------------------------')
            print(f'r = {r_value:.2f}, p = {p_value:.2f}, err = {std_err:.2f}')
            # Actual vs. predicted scatter with the fitted regression line
            sns.regplot(x = test_output, y = y_pred, line_kws = {'label': f' y = {slope:.1f} x + {intercept:.1f}'})
            plt.legend()
            plt.title(f'{self.model_name}')
            plt.xlabel('Actual Price (AED)')
            plt.ylabel('Predicted Price (AED)')
            plt.show()

### Test with training data and some models

In [4]:
# Candidate regressors whose initial results are compared in the cells below.
# The estimators that draw random numbers while fitting (random forest, decision
# tree, AdaBoost) get a fixed seed; otherwise their scores change from run to run
# and differences between the scaled datasets cannot be told apart from seed noise.
MODEL_SEED = 1
models = [RandomForestRegressor(random_state = MODEL_SEED),
          XGBRegressor(),
          LinearRegression(),
          DecisionTreeRegressor(random_state = MODEL_SEED),
          KNeighborsRegressor(),
          AdaBoostRegressor(random_state = MODEL_SEED),
          ElasticNet(),
          Lasso(),
          SVR()]

### Normalized data, Untouched Output

In [5]:
# Load the normalized training set (price left on its original scale) and preview the first rows.
df_norm = pd.read_csv('datasets/training/train_norm.csv')
df_norm.head()

Unnamed: 0,number,price,num_digits,unique_digits,num_zeros,max_group,max_group_second,shan_entrop,lv_dist,num_ones,...,15,16,17,18,19,20,21,22,23,24
0,0.012519,109000,0.666667,0.25,0.0,0.25,0.25,0.430677,0.141237,0.4,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.031622,72000,0.666667,0.25,0.0,0.5,0.0,0.349398,0.532474,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.299923,24000,1.0,0.25,0.5,0.5,0.25,0.418166,0.785567,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.077085,17500,0.666667,0.5,0.0,0.25,0.0,0.646015,0.229897,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.088203,17500,0.666667,0.5,0.0,0.25,0.0,0.646015,0.286082,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [6]:
# Cross-validate every candidate model on the normalized data (KFold_score defaults to 10 folds).
for model in models:
    trainAndEvaluate(df_norm,model).KFold_score()

___________________________________________
10-CV Results for RandomForestRegressor model.
___________________________________________
R^2 = 0.7294875742643164
NRMSE = 0.4331346205830073
___________________________________________
10-CV Results for XGBRegressor model.
___________________________________________
R^2 = -0.01843298081893463
NRMSE = 0.5179895307083499
___________________________________________
10-CV Results for LinearRegression model.
___________________________________________
R^2 = 0.42013492420798526
NRMSE = 0.5534962538569099
___________________________________________
10-CV Results for DecisionTreeRegressor model.
___________________________________________
R^2 = 0.4326125334098167
NRMSE = 0.46952053306216246
___________________________________________
10-CV Results for KNeighborsRegressor model.
___________________________________________
R^2 = 0.4157210364977292
NRMSE = 0.6124875328890711
___________________________________________
10-CV Results for AdaBoostRegress

### Robust scaled data, untouched output

In [7]:
# Load the robust-scaled training set (price left on its original scale) and preview the first rows.
df_rob = pd.read_csv('datasets/training/train_rob.csv')
df_rob.head()

Unnamed: 0,number,price,num_digits,unique_digits,num_zeros,max_group,max_group_second,shan_entrop,lv_dist,num_ones,...,15,16,17,18,19,20,21,22,23,24
0,-1.246857,109000,-1.0,-1.0,0.0,0.0,1.0,-3.456993,-2.777321,2.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,-1.202741,72000,-1.0,-1.0,0.0,1.0,0.0,-4.706993,-0.282662,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.583148,24000,0.0,-1.0,2.0,1.0,1.0,-3.649401,1.331142,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-1.097754,17500,-1.0,0.0,0.0,0.0,0.0,-0.145241,-2.211997,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-1.072079,17500,-1.0,0.0,0.0,0.0,0.0,-0.145241,-1.853739,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [8]:
# Cross-validate every candidate model on the robust-scaled data (KFold_score defaults to 10 folds).
for model in models:
    trainAndEvaluate(df_rob,model).KFold_score()

___________________________________________
10-CV Results for RandomForestRegressor model.
___________________________________________
R^2 = 0.6821643577864883
NRMSE = 0.44269561217911574
___________________________________________
10-CV Results for XGBRegressor model.
___________________________________________
R^2 = -0.01843299125345188
NRMSE = 0.5179895465066684
___________________________________________
10-CV Results for LinearRegression model.
___________________________________________
R^2 = 0.4195753055848675
NRMSE = 0.5537531218838352
___________________________________________
10-CV Results for DecisionTreeRegressor model.
___________________________________________
R^2 = -0.11388126951186373
NRMSE = 0.568680856474789
___________________________________________
10-CV Results for KNeighborsRegressor model.
___________________________________________
R^2 = 0.6087562679375931
NRMSE = 0.5456561984534399
___________________________________________
10-CV Results for AdaBoostRegress

### Normalized data, log transformed output

In [9]:
# Load the normalized training set whose price column is already log-transformed and preview it.
df_norm_log = pd.read_csv('datasets/training/train_normlog.csv')
df_norm_log.head()

Unnamed: 0,number,price,num_digits,unique_digits,num_zeros,max_group,max_group_second,shan_entrop,lv_dist,num_ones,...,15,16,17,18,19,20,21,22,23,24
0,0.012519,11.599112,0.666667,0.25,0.0,0.25,0.25,0.430677,0.141237,0.4,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.031622,11.184435,0.666667,0.25,0.0,0.5,0.0,0.349398,0.532474,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.299923,10.085851,1.0,0.25,0.5,0.5,0.25,0.418166,0.785567,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.077085,9.770013,0.666667,0.5,0.0,0.25,0.0,0.646015,0.229897,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.088203,9.770013,0.666667,0.5,0.0,0.25,0.0,0.646015,0.286082,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [10]:
# Cross-validate every candidate model on the normalized, log-price data (KFold_score defaults to 10 folds).
for model in models:
    trainAndEvaluate(df_norm_log,model).KFold_score()

___________________________________________
10-CV Results for RandomForestRegressor model.
___________________________________________
R^2 = 0.8351284586711527
NRMSE = 0.39373912752031726
___________________________________________
10-CV Results for XGBRegressor model.
___________________________________________
R^2 = 0.8252978961794588
NRMSE = 0.4058031443606025
___________________________________________
10-CV Results for LinearRegression model.
___________________________________________
R^2 = 0.7911524801487733
NRMSE = 0.45205498153208157
___________________________________________
10-CV Results for DecisionTreeRegressor model.
___________________________________________
R^2 = 0.7145442064450671
NRMSE = 0.5071168658423454
___________________________________________
10-CV Results for KNeighborsRegressor model.
___________________________________________
R^2 = 0.6431625076251886
NRMSE = 0.5949591336606506
___________________________________________
10-CV Results for AdaBoostRegressor

### Robust data, log transformed output

In [11]:
# Load the robust-scaled training set whose price column is already log-transformed and preview it.
df_rob_log = pd.read_csv('datasets/training/train_rob_log.csv')
df_rob_log.head()

Unnamed: 0,number,price,num_digits,unique_digits,num_zeros,max_group,max_group_second,shan_entrop,lv_dist,num_ones,...,15,16,17,18,19,20,21,22,23,24
0,-1.246857,11.599112,-1.0,-1.0,0.0,0.0,1.0,-3.456993,-2.777321,2.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,-1.202741,11.184435,-1.0,-1.0,0.0,1.0,0.0,-4.706993,-0.282662,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.583148,10.085851,0.0,-1.0,2.0,1.0,1.0,-3.649401,1.331142,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-1.097754,9.770013,-1.0,0.0,0.0,0.0,0.0,-0.145241,-2.211997,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-1.072079,9.770013,-1.0,0.0,0.0,0.0,0.0,-0.145241,-1.853739,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [12]:
# Cross-validate every candidate model on the robust-scaled, log-price data (KFold_score defaults to 10 folds).
for model in models:
    trainAndEvaluate(df_rob_log,model).KFold_score()

___________________________________________
10-CV Results for RandomForestRegressor model.
___________________________________________
R^2 = 0.8384695773381822
NRMSE = 0.3910570534745197
___________________________________________
10-CV Results for XGBRegressor model.
___________________________________________
R^2 = 0.8252994596445482
NRMSE = 0.4057954689399343
___________________________________________
10-CV Results for LinearRegression model.
___________________________________________
R^2 = 0.78999334150321
NRMSE = 0.4532246373364897
___________________________________________
10-CV Results for DecisionTreeRegressor model.
___________________________________________
R^2 = 0.712248996633372
NRMSE = 0.5096356149173134
___________________________________________
10-CV Results for KNeighborsRegressor model.
___________________________________________
R^2 = 0.7743075656979255
NRMSE = 0.4696463328284402
___________________________________________
10-CV Results for AdaBoostRegressor mode

### Removing Outliers and testing

In [13]:
# Display the robust-scaled rows priced below 50,000 with a fresh index (the result is shown, not stored).
df_rob.loc[df_rob['price'] < 50000].reset_index(drop = True)
## barely any samples are lost by reducing the range of the price by multiple orders of magnitude.

Unnamed: 0,number,price,num_digits,unique_digits,num_zeros,max_group,max_group_second,shan_entrop,lv_dist,num_ones,...,15,16,17,18,19,20,21,22,23,24
0,-0.583148,24000,0.0,-1.0,2.0,1.0,1.0,-3.649401,1.331142,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1.097754,17500,-1.0,0.0,0.0,0.0,0.0,-0.145241,-2.211997,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.072079,17500,-1.0,0.0,0.0,0.0,0.0,-0.145241,-1.853739,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,-1.250716,16500,-1.0,0.0,0.0,0.0,0.0,-0.145241,-2.146261,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-1.213787,12900,-1.0,0.0,0.0,0.0,0.0,-0.145241,-1.804437,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3719,-1.110580,9200,-1.0,1.0,1.0,-1.0,0.0,3.166511,-2.080526,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3720,-1.130385,15000,-1.0,0.0,0.0,0.0,0.0,-0.145241,-2.948233,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3721,0.829959,6500,0.0,0.0,0.0,1.0,0.0,-1.000000,0.262942,3.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3722,-1.076054,6600,-1.0,1.0,1.0,-1.0,0.0,3.166511,-2.350041,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Robust-scaled frame limited to prices below 1,000,000, re-indexed,
# with the price column replaced by log1p(price) in a single chain.
df_olr = (
    df_rob
    .loc[lambda frame: frame['price'] < 1000000]
    .reset_index(drop = True)
    .assign(price = lambda frame: np.log1p(frame['price']))
)

# Check for improvements
for model in models:
    trainAndEvaluate(df_olr, model).KFold_score()

# It seems removing outlier prices doesn't improve the models' predictive abilities for robust scaled data

___________________________________________
10-CV Results for RandomForestRegressor model.
___________________________________________
R^2 = 0.8323139770005463
NRMSE = 0.39905716098634175
___________________________________________
10-CV Results for XGBRegressor model.
___________________________________________
R^2 = 0.8250300148511821
NRMSE = 0.41102975664011365
___________________________________________
10-CV Results for LinearRegression model.
___________________________________________
R^2 = 0.785464147527853
NRMSE = 0.45747142880879843
___________________________________________
10-CV Results for DecisionTreeRegressor model.
___________________________________________
R^2 = 0.741017336095559
NRMSE = 0.4996958113009852
___________________________________________
10-CV Results for KNeighborsRegressor model.
___________________________________________
R^2 = 0.7627621403017533
NRMSE = 0.4793737487665334
___________________________________________
10-CV Results for AdaBoostRegressor 

In [21]:
# Normalized frame limited to prices below 500,000, re-indexed,
# with the price column replaced by log1p(price) in a single chain.
df_olnr = (
    df_norm
    .loc[lambda frame: frame['price'] < 500000]
    .reset_index(drop = True)
    .assign(price = lambda frame: np.log1p(frame['price']))
)

# Check for improvements
for model in models:
    trainAndEvaluate(df_olnr, model).KFold_score()

# No improvements for normalized data as well
# Dropping outliers is not necessary.

___________________________________________
10-CV Results for RandomForestRegressor model.
___________________________________________
R^2 = 0.817280740329488
NRMSE = 0.41194663096131534
___________________________________________
10-CV Results for XGBRegressor model.
___________________________________________
R^2 = 0.8097603666598718
NRMSE = 0.4221183168015495
___________________________________________
10-CV Results for LinearRegression model.
___________________________________________
R^2 = 0.7794301174486481
NRMSE = 0.4564242286958275
___________________________________________
10-CV Results for DecisionTreeRegressor model.
___________________________________________
R^2 = 0.6860158933615798
NRMSE = 0.5482434633010272
___________________________________________
10-CV Results for KNeighborsRegressor model.
___________________________________________
R^2 = 0.6014371196433156
NRMSE = 0.6283953620146462
___________________________________________
10-CV Results for AdaBoostRegressor m