In [44]:
import sys
import os
os.chdir("/Users/eliashadjammar/GitHub/dlrproject")

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import mlflow
import mlflow.sklearn
from urllib.parse import urlparse

import logging

logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

In [None]:
# Defining a handy function for later
def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2

In [None]:
raw_data = pd.read_csv("drive/aggregates/alldata_with_prices.csv")
raw_data.head()

## Making the data usable
In its current state it's not possible to train a model on `raw_data`. There are two big issues: 1) there are more columns (644) than rows (421), and 2) there are a lot of missing values. In particular, a majority of the Building_Type columns is probably useless. On the other hand, some may not be.

I could replace all the NAs in Building_Type columns with zeroes. I'm not sure about it, but it's an option to keep in mind.
Another option would be to get rid of most of these columns, only keeping the ones with the most information content.

How about this: drop those columns which are missing for more than 20% of neighborhoods.
Then after that, drop rows with missing values and see how much is left.

In [None]:
data = pd.DataFrame([raw_data[col] for col in raw_data.columns if raw_data[col].isna().sum() < 80]).transpose()
data = data.dropna(axis=0)

data.head()


Actually, after getting rid of literally all BuildingType data except for chapels, hospitals, museums, and stables, we don't need to drop anything else anymore. And the segregation measures are also still there.

We still want to get rid of the id and neighborhood columns, as we don't need them here. We do want to keep the city_id, though, since including that is basically like region fixed effects.

In [33]:
clean_data = data.drop(["id", "Neighborhood_FID"], axis=1)
clean_data['city_id'] = clean_data['city_id'].astype('category')
citydummies = pd.get_dummies(clean_data['city_id'], prefix="city", prefix_sep="")
clean_data = pd.concat([citydummies, clean_data],axis=1)

## Splitting the data

In [35]:
# Split the data into training and test sets. (0.75, 0.25) split.
train, test = train_test_split(clean_data)

# The predicted column is "Land_Value" which is a scalar
train_x = train.drop(["Land_Value"], axis=1)
test_x = test.drop(["Land_Value"], axis=1)
train_y = train[["Land_Value"]]
test_y = test[["Land_Value"]]

In [39]:
# Scale the data
scaler = StandardScaler(with_mean=False)
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

In [41]:
# Set hyperparameters
n_estimators = 100
max_depth = 40

In [45]:
with mlflow.start_run():
    model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)

    # Train the model
    model.fit(train_x, train_y)

    # Test the model
    predicted_qualities = model.predict(test_x)

    # Compute metrics
    (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

    print("Random Forest (n_estimators={:f}, max_depth={:f}):".format(n_estimators, max_depth))
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)

    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

    # Model registry does not work with file store
    if tracking_url_type_store != "file":
        # Register the model
        # There are other ways to use the Model Registry, which depends on the use case,
        # please refer to the doc for more information:
        # https://mlflow.org/docs/latest/model-registry.html#api-workflow
        mlflow.sklearn.log_model(model, "model", registered_model_name="RandomForest")
    else:
        mlflow.sklearn.log_model(model, "model")

  model.fit(train_x, train_y)


Random Forest (n_estimators=100.000000, max_depth=40.000000):
  RMSE: 900.5850977648228
  MAE: 387.17586792452846
  R2: 0.6582924570696094
