In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    mean_absolute_error,
    mean_squared_error,
    precision_score,
    r2_score,
    recall_score,
)

In [33]:
def read_data(path="../data/dev_data_nyc_airbnb.csv"):
    '''Import data from the data directory

    Inputs:
        path: Str or path-like (default = "../data/dev_data_nyc_airbnb.csv").
            Location of the csv file to load. The default is resolved
            relative to the current working directory.
    Returns: df. Pandas dataframe
    '''
    return pd.read_csv(path)

# Load the listings once; the modelling cell below reads from `df`.
df = read_data()
# (rows, columns) - quick sanity check that the load worked.
print(df.shape)

(46427, 16)


In [38]:
# Extract features and labels
y = df['price']
X_raw = df[['minimum_nights','number_of_reviews','availability_365','reviews_per_month']]

# Fill missings with mean
# The raw frame keeps its own name so the un-imputed features stay inspectable;
# X is the imputed numpy array that the model is trained on.
my_imputer = SimpleImputer(strategy="mean")
X = my_imputer.fit_transform(X_raw)

# Initialise the model (Random forest)
# A fixed seed makes the fitted forest, and therefore every metric reported
# further down, identical across "Restart & Run All" executions.
SEED = 42
rf_model = RandomForestRegressor(random_state=SEED)

# Train the model
rf_model.fit(X,y)

# Score the training data
# NOTE: these are in-sample predictions (same rows the model was fitted on),
# so metrics computed from them are optimistic.
y_pred = rf_model.predict(X)

In [79]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

def perf_mets(
        model_type = "reg",
        y_train = None, y_train_p = None,
        y_valid = None, y_valid_p = None,
        y_test = None, y_test_p = None
):
    '''Print the headline performance metrics for a model

    A split (train / validation / test) is reported only when both its labels
    and its predictions are supplied; the others are skipped silently.

    input:
        model_type: Str (default = "reg"). Type of model {"reg", "binary_class", "multi_class"}.
            Any other value prints only the headline banner.
        y_train: Pandas series or numpy array (default = None). Training data labels
        y_train_p: Pandas series or numpy array (default = None). Training data predictions
        y_valid: Pandas series or numpy array (default = None). Validation data labels
        y_valid_p: Pandas series or numpy array (default = None). Validation data predictions
        y_test: Pandas series or numpy array (default = None). Test data labels
        y_test_p: Pandas series or numpy array (default = None). Test data predictions

        In the case of a regressor the predictions are continuous values
        In the case of a classifier (binary or multilevel) the predictions are class memberships
        (not probabilities). For "binary_class" precision, recall and F1 are computed with the
        scikit-learn defaults, i.e. the positive class is the label 1.

    output:
        None. The metrics are printed:
            reg:          R2, MAE, MSE, RMSE
            binary_class: accuracy, precision, recall, F1
            multi_class:  overall accuracy and class level accuracy
    '''
    # Each entry carries its own tab padding so the printed values line up.
    candidate_splits = (
        ("Train:\t\t", y_train, y_train_p),
        ("Validation:\t", y_valid, y_valid_p),
        ("Test:\t\t", y_test, y_test_p),
    )
    splits = [
        (prefix, y_true, y_hat)
        for prefix, y_true, y_hat in candidate_splits
        if y_true is not None and y_hat is not None
    ]

    print("Headline performance metrics")

    if model_type == "reg":
        _print_scores("Model R2", r2_score, splits)
        _print_scores("MAE", mean_absolute_error, splits)
        _print_scores("MSE", mean_squared_error, splits)
        _print_scores("RMSE", _rmse, splits)

    if model_type == "binary_class":
        print("Binary classifier model")
        _print_scores("Accuracy", accuracy_score, splits)
        _print_scores("Precision", precision_score, splits)
        _print_scores("Recall", recall_score, splits)
        _print_scores("F1", f1_score, splits)

    if model_type == "multi_class":
        print("Multi level classifier model")
        _print_scores("Overall accuracy", accuracy_score, splits)
        _print_class_accuracy(splits)


def _print_scores(header, metric, splits):
    '''Print one metric header followed by its value (4 d.p.) for every supplied split.'''
    print(header)
    for prefix, y_true, y_hat in splits:
        print(f"\t{prefix}{metric(y_true, y_hat):.4f}")


def _rmse(y_true, y_hat):
    '''Root mean squared error, i.e. the square root of the MSE.'''
    return np.sqrt(mean_squared_error(y_true, y_hat))


def _print_class_accuracy(splits):
    '''Print, per split, the share of each true class that was predicted correctly.'''
    print("Class level accuracy")
    for prefix, y_true, y_hat in splits:
        # Passing the label order explicitly keeps the confusion-matrix rows
        # aligned with the class names printed below.
        classes = np.union1d(y_true, y_hat)
        conf_mx = confusion_matrix(y_true, y_hat, labels=classes)
        row_totals = conf_mx.sum(axis=1)
        # A class that only occurs in the predictions has an empty row; report
        # nan for it instead of dividing by zero.
        per_class = np.divide(
            np.diagonal(conf_mx), row_totals,
            out=np.full(len(classes), np.nan), where=row_totals > 0,
        )
        print(f"\t{prefix.rstrip()}")
        for cls, acc in zip(classes, per_class):
            print(f"\t\t{cls}:\t{acc:.4f}")




In [80]:
# Report the regression metrics for the training split only.
perf_mets(y_train=y, y_train_p=y_pred)

Regression model
Model R2
	Train:		0.6074
MAE
	Train:		31.9457
MSE
	Train:		2027.5826
RMSE
	Train:		45.0287


In [82]:
# Exercise all three report rows. The same labels and predictions are passed
# for every split, so the train / validation / test figures are expected to match.
perf_mets(
    y_train=y, y_train_p=y_pred,
    y_valid=y, y_valid_p=y_pred,
    y_test=y, y_test_p=y_pred,
)

Regression model
Model R2
	Train:		0.6074
	Validation:	0.6074
	Test:		0.6074
MAE
	Train:		31.9457
	Validation:	31.9457
	Test:		31.9457
MSE
	Train:		2027.5826
	Validation:	2027.5826
	Test:		2027.5826
RMSE
	Train:		45.0287
	Validation:	45.0287
	Test:		45.0287
