In [3]:
import sys
import os
os.chdir("/Users/eliashadjammar/GitHub/dlrproject")

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
import mlflow
import mlflow.sklearn
from urllib.parse import urlparse

import logging

logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

In [4]:
# Defining a handy function for later
def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2

In [5]:
raw_data = pd.read_csv("drive/aggregates/alldata_with_prices.csv")
raw_data.head()

Unnamed: 0,id,Land_Value,city_id,Neighborhood_FID,buildings_total_units,n_occupied_by_owner,n_ownership_with_current_household,n_owned_without_current_household,n_rented_for_residential_purposes,n_rented_with_current_household,...,Building_Type_town hall,"Building_Type_transmitting tower, radio tower",Building_Type_turbine house,Building_Type_vessel traffic buildings,Building_Type_vocational school,"Building_Type_warehouse, shed, warehouse",Building_Type_waste bunker,Building_Type_waste treatment building,Building_Type_water supply building,Building_Type_winding tower
0,1_1,6998.5,1,1,2478,0,0.055287,0.0,0,0.9205,...,,,,,,,,,,
1,1_2,11756.9,1,2,6706,0,0.071876,0.0,0,0.887116,...,,,,,,,,,,
2,1_3,5312.2,1,3,1834,0,0.043075,0.0,0,0.860414,...,,,,,,,,,,
3,1_4,9779.6,1,4,12784,0,0.137125,0.0,0,0.829161,...,,,,,,,,,,
4,1_5,4236.2,1,5,6788,0,0.151738,0.000442,0,0.800972,...,,,,,,,,,,


## Making the data usable
In its current state it's not possible to train a model on `raw_data`. There are two big issues: 1) there are more columns (644) than rows (421), and 2) there are a lot of missing values. In particular, a majority of the Building_Type columns is probably useless. On the other hand, some may not be.

I could replace all the NAs in Building_Type columns with zeroes. I'm not sure about it, but it's an option to keep in mind.
Another option would be to get rid of most of these columns, only keeping the ones with the most information content.

How about this: drop those columns which are missing for more than 20% of neighborhoods.
Then after that, drop rows with missing values and see how much is left.

In [6]:
data = pd.DataFrame([raw_data[col] for col in raw_data.columns if raw_data[col].isna().sum() < 80]).transpose()
data = data.dropna(axis=0)

data.head()


Unnamed: 0,id,Land_Value,city_id,Neighborhood_FID,buildings_total_units,n_occupied_by_owner,n_ownership_with_current_household,n_owned_without_current_household,n_rented_for_residential_purposes,n_rented_with_current_household,...,p,H,ent_ratio,buildings_count,mean_building_height,mean_sq_building_height,Building_Type_chapel,Building_Type_hospital,Building_Type_museum,Building_Type_stable
0,1_1,6998.5,1,1,2478,0,0.055287,0.0,0,0.9205,...,0.005821,0.044213,1.138809,657,16.745614,24.735354,1,0,0,0
1,1_2,11756.9,1,2,6706,0,0.071876,0.0,0,0.887116,...,0.013973,0.059533,1.172646,1195,22.711719,41.887857,0,0,0,0
2,1_3,5312.2,1,3,1834,0,0.043075,0.0,0,0.860414,...,0.004021,0.084311,1.23548,558,22.885135,36.416649,0,0,0,0
3,1_4,9779.6,1,4,12784,0,0.137125,0.0,0,0.829161,...,0.028112,0.036085,0.947872,2155,17.631818,34.856483,0,0,0,0
4,1_5,4236.2,1,5,6788,0,0.151738,0.000442,0,0.800972,...,0.013958,0.049935,0.975358,929,9.053763,9.786612,0,0,0,0


Actually, after getting rid of literally all BuildingType data except for chapels, hospitals, museums, and stables, we don't need to drop anything else anymore. And the segregation measures are also still there.

We still want to get rid of the id and neighborhood columns, as we don't need them here. We do want to keep the city_id, though, since including that is basically like region fixed effects.

In [7]:
clean_data = data.drop(["id", "Neighborhood_FID"], axis=1)
clean_data['city_id'] = clean_data['city_id'].astype('category')
citydummies = pd.get_dummies(clean_data['city_id'], prefix="city", prefix_sep="")
clean_data = pd.concat([citydummies, clean_data],axis=1)

## Splitting the data

In [8]:
# Split the data into training and test sets. (0.75, 0.25) split.
train, test = train_test_split(clean_data)

# The predicted column is "Land_Value" which is a scalar
train_x = train.drop(["Land_Value"], axis=1)
test_x = test.drop(["Land_Value"], axis=1)
train_y = train[["Land_Value"]]
test_y = test[["Land_Value"]]

In [9]:
# Scale the data
scaler = StandardScaler(with_mean=False)
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

In [9]:
# Set hyperparameters
kernel = "rbf"
degree = 3
gamma = "scale"
epsilon = 0.1
C = 1

In [11]:
with mlflow.start_run():
    model = SVR(kernel=kernel, degree=degree, gamma=gamma, epsilon=epsilon, C=C)

    # Train the model
    model.fit(train_x, train_y)

    # Test the model
    predicted_qualities = model.predict(test_x)

    # Compute metrics
    (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

    print("SVR (kernel={:s}, degree={:f}), epsilon={:f}:".format(kernel, degree, epsilon))
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)

    mlflow.log_param("kernel", kernel)
    mlflow.log_param("degree", degree)
    mlflow.log_param("gamma", gamma)
    mlflow.log_param("epsilon", epsilon)
    mlflow.log_param("C", C)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

    # Model registry does not work with file store
    if tracking_url_type_store != "file":
        # Register the model
        # There are other ways to use the Model Registry, which depends on the use case,
        # please refer to the doc for more information:
        # https://mlflow.org/docs/latest/model-registry.html#api-workflow
        mlflow.sklearn.log_model(model, "model", registered_model_name="SVR")
    else:
        mlflow.sklearn.log_model(model, "model")

SVR (kernel=rbf, degree=3.000000), epsilon=0.100000:
  RMSE: 1272.4392622640512
  MAE: 548.0398789936489
  R2: -0.05078163219067755


  y = column_or_1d(y, warn=True)


## Feature selection

Since we have a huge load of continuous features and a continuous outcome, it's recommended to use either Pearson's or Spearman's correlation coefficients. That is, for each feature calculate the correlation with the outcome, and then rank the features by the correlation. Then pick the top k features.

However, I couldn't figure out how to do that, so we're going with what the sklearn docs suggest.

In [34]:
# Set hyperparameters
kernel = "rbf"
degree = 3
gamma = "scale"
epsilon = 0.1
C = 6
k_features = 20

In [32]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, r_regression

selector1 = VarianceThreshold(threshold=(.1))
selector2 = SelectKBest(r_regression, k=k_features)

train_x = np.array(train_x, dtype=float)
train_y = np.array(train_y, dtype=float)

train_x_selected1 = selector1.fit_transform(train_x)
train_x_selected = selector2.fit_transform(train_x_selected1, train_y)

  y = column_or_1d(y, warn=True)


In [33]:
with mlflow.start_run():
    model = SVR(kernel=kernel, degree=degree, gamma=gamma, epsilon=epsilon, C=C)

    # Train the model
    model.fit(train_x_selected, train_y)

    # Test the model
    test_x_selected1 = selector1.transform(test_x)
    test_x_selected = selector2.transform(test_x_selected1)
    predicted_qualities = model.predict(test_x_selected)

    # Compute metrics
    (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

    print("SVR (kernel={:s}, degree={:f}), epsilon={:f}:".format(kernel, degree, epsilon))
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)

    mlflow.log_param("k_features", k_features)
    mlflow.log_param("kernel", kernel)
    mlflow.log_param("degree", degree)
    mlflow.log_param("gamma", gamma)
    mlflow.log_param("epsilon", epsilon)
    mlflow.log_param("C", C)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

    # Model registry does not work with file store
    if tracking_url_type_store != "file":
        # Register the model
        # There are other ways to use the Model Registry, which depends on the use case,
        # please refer to the doc for more information:
        # https://mlflow.org/docs/latest/model-registry.html#api-workflow
        mlflow.sklearn.log_model(model, "model", registered_model_name="SVR")
    else:
        mlflow.sklearn.log_model(model, "model")

SVR (kernel=rbf, degree=3.000000), epsilon=0.100000:
  RMSE: 1192.2964430077384
  MAE: 488.05584667477814
  R2: 0.03496204335277364


  y = column_or_1d(y, warn=True)
