In [25]:
import random
import re

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import r2_score

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.svm import SVR

In [2]:
%%javascript
// Sets the output area's auto-scroll threshold to 9999 (presumably so long cell
// outputs are shown in full rather than in a scroll box). Relies on the
// `IPython` JavaScript global being available in the front end -- TODO confirm.
IPython.OutputArea.auto_scroll_threshold = 9999

<IPython.core.display.Javascript object>

## ML preliminary testing

#### Create a helper class

In [54]:
class trainAndEvaluate:
    """Fit and score a regression model on a frame whose target column is ``price``.

    Every column of ``df`` other than ``price`` is used as a feature, so any
    identifier/index column present in ``df`` is fed to the model as well.

    Attributes:
        df: the frame passed in (kept by reference, not copied).
        model: the estimator passed in; ``regression_eval_nonn`` fits it in place.
        model_name: the estimator's class name, used in printed reports and plots.
        X: feature frame (``df`` without the ``price`` column).
        y: target series (``df['price']``).
    """

    def __init__(self, df, model):
        """Store the data and model and derive the feature/target split.

        Args:
            df: pandas DataFrame containing a ``price`` column.
            model: estimator exposing ``fit``/``predict`` (scikit-learn style).

        Raises:
            KeyError: if ``df`` has no ``price`` column.
        """
        self.df = df
        self.model = model
        # Use the class's own name. Parsing ``str(cls)`` with a letters-only
        # regex failed (IndexError) for class names with digits or underscores.
        self.model_name = type(self.model).__name__
        self.X = self.df.drop(columns='price')
        self.y = self.df['price']

    def KFold_score(self, n_folds=10):
        """Print mean R^2 and normalized RMSE from shuffled K-fold cross-validation.

        The fold assignment is fixed (``random_state=1``). NRMSE is the mean
        per-fold RMSE divided by the standard deviation of the full target.

        Args:
            n_folds: number of cross-validation splits.

        Returns:
            None. Results are printed.
        """
        rule = '___________________________________________'
        print(rule)
        print(f'{n_folds}-CV Results for {self.model_name} model.')
        print(rule)

        cv = KFold(n_splits=n_folds, random_state=1, shuffle=True)

        scores = cross_validate(self.model,
                                self.X,
                                self.y,
                                scoring=['neg_root_mean_squared_error', 'r2'],
                                cv=cv,
                                n_jobs=-1)

        R2 = scores['test_r2'].mean()
        # The scorer reports negated RMSE; abs() restores the magnitude.
        NRMSE = np.abs(scores['test_neg_root_mean_squared_error']).mean() / self.y.std()

        print(f'R^2 = {R2}')
        print(f'NRMSE = {NRMSE}')

    def regression_eval_nonn(self,
                             show_results=1,
                             plot=1,
                             target_log=0,
                             random_state=None):
        """Fit on a random train split and evaluate on the held-out split.

        Args:
            show_results: if truthy, print the metrics (and optionally plot) and
                return None; if falsy, return the metrics instead of printing.
            plot: if truthy (and ``show_results`` is truthy), print the
                regression-line statistics and draw actual-vs-predicted.
            target_log: currently unused; kept so existing calls keep working.
            random_state: forwarded to ``train_test_split``. The default
                ``None`` keeps the original unseeded behaviour; pass an int for
                a reproducible split.

        Returns:
            None when ``show_results`` is truthy. Otherwise a 3-tuple
            ``('model_name,MAE,MSE,RMSE,', N_RMSE, R2)`` whose first element is
            a fixed placeholder string retained for backward compatibility.
        """
        train_input, test_input, train_output, test_output = train_test_split(
            self.X, self.y, random_state=random_state)

        self.model.fit(train_input, train_output)
        y_pred = self.model.predict(test_input)

        # RMSE via sqrt(MSE): the ``squared=`` keyword of mean_squared_error is
        # deprecated/removed in recent scikit-learn releases.
        RMSE = np.sqrt(mean_squared_error(test_output, y_pred))
        N_RMSE = RMSE / test_output.std()
        R2 = r2_score(test_output, y_pred)
        EV = explained_variance_score(test_output, y_pred)

        if not show_results:
            return ("""model_name,MAE,MSE,RMSE,""", N_RMSE, R2)

        print('---------------')
        print(f'{self.model_name} Results: ')
        print('---------------')
        # NRMSE is RMSE divided by the target's std, so it carries no unit.
        print(f'NRMSE = {N_RMSE:.2f}')
        print(f'R^2 score = {R2:.2f}')
        print(f'EV = {EV:.2f}')

        if plot:
            # Line-of-best-fit statistics are only needed for this report.
            slope, intercept, r_value, p_value, std_err = stats.linregress(test_output, y_pred)

            print('\nRegression Line Metrics: ')
            print('---------------------------')
            print(f'r = {r_value:.2f}, p = {p_value:.2f}, err = {std_err:.2f}')
            # Actual vs. predicted scatter with the fitted regression line
            sns.regplot(x=test_output,
                        y=y_pred,
                        line_kws={'label': f' y = {slope:.1f} x + {intercept:.1f}'})
            plt.legend()
            plt.title(f'{self.model_name}')
            plt.xlabel('Actual Price (AED)')
            plt.ylabel('Predicted Price (AED)')
            plt.show()

### Test with training data and some models

In [55]:
# Candidate models whose initial cross-validation results are inspected below.
# Seeds are pinned so the reported scores are reproducible on a fresh
# Restart & Run All (an unseeded random forest gives different numbers each run).
models = [RandomForestRegressor(random_state=1),
          XGBRegressor(random_state=1)]

### Normalized data, Untouched Output

In [56]:
# Load the normalized training set; the path is relative to the notebook's
# working directory.
# NOTE(review): the preview of this frame lists an 'Unnamed: 0' column. If that
# is a saved index, it is being used as a model feature -- confirm, and consider
# reading with index_col=0.
df_norm = pd.read_csv('datasets/training/train_norm.csv')

In [57]:
# Preview the first rows to sanity-check the loaded columns and value ranges.
df_norm.head()

Unnamed: 0,number,price,num_digits,unique_digits,num_zeros,max_group,max_group_second,shan_entrop,lv_dist,num_ones,...,15,16,17,18,19,20,21,22,23,24
0,0.012519,109000,0.666667,0.25,0.0,0.25,0.25,0.430677,0.141237,0.4,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.031622,72000,0.666667,0.25,0.0,0.5,0.0,0.349398,0.532474,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.299923,24000,1.0,0.25,0.5,0.5,0.25,0.418166,0.785567,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.077085,17500,0.666667,0.5,0.0,0.25,0.0,0.646015,0.229897,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.088203,17500,0.666667,0.5,0.0,0.25,0.0,0.646015,0.286082,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [58]:
# Print the K-fold cross-validation report (default n_folds) for every
# candidate model on the normalized training frame.
for model in models:
    trainAndEvaluate(df_norm,model).KFold_score()

__________________________________________________
10-CV Results for RandomForestRegressor model.
R^2 = 0.7317549478513011
NRMSE = 0.4294046664567432
__________________________________________________
10-CV Results for XGBRegressor model.
R^2 = -0.01843298081893463
NRMSE = 0.5179895307083499
