In [2]:
# packages

# data manipulation
import math
import pandas as pd                                                            # table manipulation
from pandas.api.types import is_numeric_dtype                                  # it checks if pandas column is numeric
import numpy as np                                                             # package for scientific computing

# machine learning
from sklearn.model_selection import train_test_split                           # splits the data into a train and test data set
from sklearn.metrics import mean_squared_error                                 # root mean square error
from sklearn.metrics import fbeta_score, make_scorer                           # custom score function
from sklearn.preprocessing import StandardScaler

In [3]:
# FUNCTIONS

# this function transforms non-numeric columns into dummy (one-hot) coding:
def make_dummies(dat):
    '''
    Return a new DataFrame in which every non-numeric column of `dat` is
    dummy-coded.

    Numeric columns (as judged by is_numeric_dtype) are kept under their own
    name, with missing values replaced by 0. Every other column is expanded
    with pd.get_dummies into integer 0/1 indicator columns prefixed with the
    column name; the first level is dropped (drop_first=True).

    note: this function does not consider ordinal variables

    Parameters
    ----------
    dat : pd.DataFrame
        Input table.

    Returns
    -------
    pd.DataFrame
        One row per row of `dat` (same index, same order); columns follow the
        order of the original columns. Name clashes between a numeric column
        and a generated dummy column are not resolved.
    '''
    pieces = []

    for col in dat.columns:
        if is_numeric_dtype(dat[col]):
            pieces.append(dat[col].fillna(0))
        else:
            pieces.append(
                pd.get_dummies(dat[col], drop_first=True, prefix=col, dtype=int)
            )

    # pd.concat raises on an empty list, so a frame without columns is
    # answered with an empty frame that still carries the original index.
    if not pieces:
        return pd.DataFrame(index=dat.index)

    # Every piece carries dat's index, so the columns can be glued together in
    # one step. Merging on the index column by column multiplies rows as soon
    # as the index holds duplicate labels, and re-copies the frame each time.
    return pd.concat(pieces, axis=1)


# test, train data split
def make_split(dt, target, test_size=0.3, train_size=0.7, random_state=None):
    '''
    Split a table into train/test features and train/test target.

    Parameters
    ----------
    dt : pd.DataFrame
        Table holding the feature columns and the target column.
    target : str
        Name of the column in `dt` to predict; it is removed from the features.
    test_size : float or int, default 0.3
        Forwarded to train_test_split.
    train_size : float or int, default 0.7
        Forwarded to train_test_split.
    random_state : int, RandomState instance or None, default None
        Forwarded to train_test_split. Pass an int to get the same split on
        every run; the default None leaves the shuffle unseeded.

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test)
    '''
    features = dt.drop(columns=[target])
    labels = dt[target]

    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=test_size,
        train_size=train_size,
        random_state=random_state,
    )

    return X_train, X_test, y_train, y_test

# this function calculates the root-mean-squared-error (RMSE) on the log scale
def get_RMSE(y_true, y_pred):
    '''
    Root-mean-squared error (RMSE) measured on the log scale.

    Each input is passed through np.abs and then np.log, so the sign of a
    value is ignored; the score is the square root of the mean squared
    difference between the two logged inputs (the logarithm of the predicted
    value vs. the logarithm of the observed sales price).

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values.

    Returns
    -------
    float
        Square root of mean_squared_error of the logged inputs.
    '''
    log_observed, log_predicted = (
        np.log(np.abs(values)) for values in (y_true, y_pred)
    )
    mse_of_logs = mean_squared_error(log_observed, log_predicted)

    return math.sqrt(mse_of_logs)