In [116]:
import os
import sys

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from scipy.stats import randint

import seaborn as sns
import matplotlib.pyplot as plt

# Constants

In [117]:
# Seed reused wherever randomness is involved, so results match across .ipynb sessions.
RANDOM_STATE = 42

# Number of hyperparameter candidates sampled by the randomized search.
N_ITER = 3

# Number of cross-validation folds.
N_CV = 5

# Number of parallel jobs.
N_JOBS = 2

In [118]:
# Project root: one level above the current working directory.
PROJECT_PATH = os.path.join(os.getcwd(), "..")

# Folder holding the input/output data files.
DATA_PATH = os.path.join(PROJECT_PATH, "data")

In [119]:
# Make the parent directory importable so the self-written metrics can be loaded.
# The membership check keeps this cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
if PROJECT_PATH not in sys.path:
    sys.path.append(PROJECT_PATH)

# NOTE(review): the wildcard import hides which names come from tools.metrics;
# replace it with explicit names once the required functions are known.
from tools.metrics import *

# Read Data

In [120]:
# Load the training table and discard the 'id' column in a single expression.
train_csv_path = os.path.join(DATA_PATH, "train.csv")
df = pd.read_csv(train_csv_path, encoding="cp1252").drop(columns=["id"])

In [121]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,accommodates,bathrooms,bedrooms,beds,cleaning_fee,host_response_rate,latitude,log_price,longitude,number_of_reviews,...,Waterfront,first_review_Year,first_review_Month,first_review_Day,last_review_Year,last_review_Month,last_review_Day,host_since_Year,host_since_Month,host_since_Day
0,3,1.0,1.0,1.0,1,0,40.696524,5.010635,-73.991617,2,...,0,2016.0,6.0,18.0,2016.0,7.0,18.0,2012.0,3.0,26.0
1,7,1.0,3.0,3.0,1,100,40.766115,5.129899,-73.98904,6,...,0,2017.0,8.0,5.0,2017.0,9.0,23.0,2017.0,6.0,19.0
2,5,1.0,1.0,3.0,1,100,40.80811,4.976734,-73.943756,10,...,0,2017.0,4.0,30.0,2017.0,9.0,14.0,2016.0,10.0,25.0
3,4,1.0,2.0,2.0,1,0,37.772004,6.620073,-122.431619,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,2015.0,4.0,19.0
4,2,1.0,0.0,1.0,1,100,38.925627,4.744932,-77.034596,4,...,0,2015.0,5.0,12.0,2017.0,1.0,22.0,2015.0,3.0,1.0


In [122]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

Unnamed: 0,accommodates,bathrooms,bedrooms,beds,cleaning_fee,host_response_rate,latitude,log_price,longitude,number_of_reviews,...,Waterfront,first_review_Year,first_review_Month,first_review_Day,last_review_Year,last_review_Month,last_review_Day,host_since_Year,host_since_Month,host_since_Day
count,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,...,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0,74111.0
mean,3.155146,1.231929,1.264239,1.707844,0.734075,71.055201,38.445958,4.782069,-92.397525,20.900568,...,0.001552,1584.095438,5.146469,12.129603,1586.010133,5.204814,13.178948,2008.941992,6.524268,15.595957
std,2.153589,0.58478,0.852773,1.255092,0.441828,43.087044,3.080167,0.717394,21.705322,37.828641,...,0.039362,826.712164,3.898664,10.152619,826.482416,3.707618,10.650738,101.328302,3.334992,8.809573
min,1.0,0.0,0.0,0.0,0.0,0.0,33.338905,0.0,-122.5115,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,1.0,1.0,1.0,0.0,0.0,34.127908,4.317488,-118.342374,1.0,...,0.0,2013.0,1.0,2.0,2015.0,1.0,1.0,2013.0,4.0,8.0
50%,2.0,1.0,1.0,1.0,1.0,100.0,40.662138,4.70953,-76.996965,6.0,...,0.0,2015.0,5.0,11.0,2017.0,5.0,14.0,2014.0,7.0,16.0
75%,4.0,1.0,1.0,2.0,1.0,100.0,40.746096,5.220356,-73.95466,23.0,...,0.0,2016.0,8.0,21.0,2017.0,9.0,23.0,2015.0,9.0,23.0
max,16.0,8.0,10.0,18.0,1.0,100.0,42.390437,7.600402,-70.985047,605.0,...,1.0,2017.0,12.0,31.0,2017.0,12.0,31.0,2017.0,12.0,31.0


# Exploratory Data Analysis

In [123]:
# List of columns to train the model on.
# NOTE(review): currently empty and not referenced by the cells visible in this
# notebook — the split below uses every column except the target. Confirm intent.
COLS_TO_TRAIN = []

# Descriptive statistics

In [124]:
# code for feature selection

# Generate train, test, inference

Data is split into 2 parts: 

    - train - 80% of observations
    
    - test - 20% of observations
    
    
Train will be used to train a model.

Test - to evaluate its performance.

In [125]:
# The target is log_price; every other column of df is used as a feature.
targets = df["log_price"]
features = df.drop(columns="log_price")

# Shuffled 80/20 split, seeded so the same rows land in each part on every run.
split_parts = train_test_split(
    features,
    targets,
    train_size=0.8,
    shuffle=True,
    random_state=RANDOM_STATE,
)
x_train, x_test, y_train, y_test = split_parts

In [126]:
# Report the dimensions of the training inputs.
print("Check shapes of inputs to the model (train stage): ")
for label, part in (("Features: ", x_train), ("Target: ", y_train)):
    print(label, part.shape)

Check shapes of inputs to the model (train stage): 
Features:  (59288, 211)
Target:  (59288,)


In [127]:
# Report the dimensions of the test inputs.
print("Check shapes of inputs to the model (test stage): ")
for label, part in (("Features: ", x_test), ("Target: ", y_test)):
    print(label, part.shape)

Check shapes of inputs to the model (test stage): 
Features:  (14823, 211)
Target:  (14823,)


# Develop a model

In [128]:
# Hyperparameter values the randomized search samples candidates from.
param_grid = {
    "n_estimators": [10, 100, 200],
    "max_features": ["sqrt", "log2"],
    "bootstrap": [True, False],
}

# Seed the base estimator too: the random_state passed to RandomizedSearchCV
# only fixes which candidates are sampled, not the randomness inside each
# forest, so without this the CV scores (and possibly the selected
# hyperparameters) change from run to run.
base_rfr = RandomForestRegressor(random_state=RANDOM_STATE)

# Randomized search over param_grid with N_CV-fold cross-validation.
# It is bound to `search` directly, so `rfr` is no longer reused for two
# different kinds of object (estimator, then search wrapper).
search = RandomizedSearchCV(
    base_rfr,
    param_grid,
    n_iter=N_ITER,
    cv=N_CV,
    n_jobs=N_JOBS,
    random_state=RANDOM_STATE,
)

# Search for the optimal hyperparameters; the trailing ';' keeps the cell
# output empty, as it was before.
search.fit(x_train, y_train);

In [129]:
# optimal model's hyperparameters
search.best_params_

{'n_estimators': 200, 'max_features': 'sqrt', 'bootstrap': False}

In [130]:
# Sanity check: the reported best score is the mean CV test score stored
# at the best candidate's index.
best_mean_cv_score = search.cv_results_["mean_test_score"][search.best_index_]
assert search.best_score_ == best_mean_cv_score

In [132]:
# Build a fresh Random Forest from the hyperparameters chosen by the search,
# with the seed pinned.
best_params = search.best_params_
rfr = RandomForestRegressor(**best_params, random_state=RANDOM_STATE)

# Fit it on the full training split.
rfr.fit(x_train, y_train)

In [133]:
# Predict on the held-out test split.
y_pred = rfr.predict(x_test)

# Compute the regression metrics first, then report them.
test_r2 = r2_score(y_test, y_pred)
test_mae = mean_absolute_error(y_test, y_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("R^2 : ", test_r2)
print("MAE :", test_mae)
print("RMSE:", test_rmse)

R^2 :  0.6906763703409575
MAE : 0.28956663842862546
RMSE: 0.3986321105808832


In [None]:
# Persist the test-set predictions as a single-column CSV without the index.
# The values are the model's outputs for log_price; no back-transformation
# is applied here.
predictions_path = os.path.join(DATA_PATH, "predicted_price.csv")
predictions = pd.DataFrame({"predicted_price": y_pred})
predictions.to_csv(predictions_path, index=False)

# Metrics

In [None]:
# code for metrics

## Test

### Accuracy, Precision, Recall, ROC AUC, Precision Recall AUC

In [None]:
# code

### Precision Recall Curve, AUC

In [None]:
# code 

### ROC Curve, ROC AUC

In [None]:
# code

## Inference

Compute metrics on the inference data

### Accuracy, Precision, Recall, Precision Recall AUC, ROC AUC

### Precision Recall Curve, AUC

### ROC Curve, ROC AUC

In [None]:
# code

# Population Stability Index (PSI)

**The population stability index (PSI)** is a statistic that measures how much a variable has shifted over time; it is used to monitor the applicability of a statistical model to the current population.
For details, see [this reference](https://scholarworks.wmich.edu/cgi/viewcontent.cgi?article=4249&context=dissertations#:~:text=The%20population%20stability%20index%20(PSI,model%20to%20the%20current%20population.)


In [None]:
# code for psi