## Imports

In [88]:
import mlflow
import numpy as np
import pandas as pd

from memoized_property import memoized_property

from  mlflow.tracking import MlflowClient

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline

# Data

## Taxifare

In [1]:
# def get_data():
#     url = 's3://wagon-public-datasets/taxi-fare-train.csv'
#     data = pd.read_csv(url, nrows = 100)

#     return data

In [4]:
data = get_data()
data.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


## Clean data

In [10]:
# def clean_data(data):
#     # Removing any rows containing NaN
#     data = data.dropna(axis = 'rows')

#     # Keeping lat/lon that are not zero
#     condition = (data.dropoff_latitude != 0) | (data.dropoff_longitude != 0)
#     data = data[condition]

#     condition = (data.pickup_latitude != 0) | (data.pickup_longitude != 0)
#     data = data[condition]

#     # If fare_amount is in the columns, keep only the ones
#     # between 0 and 4000
#     if 'fare_amount' in list(data):
#         condition = data.fare_amount.between(0, 4000)
#         data = data[condition]
        
#     # Keep only rows where passenger_count is strictly below 8
#     # and above 8
#     condition = data.passenger_count < 8
#     data = data[condition]

#     condition = data.passenger_count >= 1
#     data = data[condition]

#     # Limiting the coordinates
#     condition = data['pickup_latitude'].between(left = 40, right = 42)
#     data = data[condition]

#     condition = data['pickup_longitude'].between(left = -74.3, right = -72.9)
#     data = data[condition]

#     condition = data['dropoff_latitude'].between(left = 40, right = 42)
#     data = data[condition]

#     condition = data['dropoff_longitude'].between(left = -74, right = -72.9)
#     data = data[condition]

#     return data

In [11]:
data.shape

(100, 8)

In [12]:
data = clean_data(data)

In [13]:
data.shape

(84, 8)

# Holdout

In [15]:
# X_train = data.drop('fare_amount', axis = 1)
# y_train = data['fare_amount']

# X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = .1)

# Model

In [70]:
# from sklearn.ensemble import RandomForestRegressor

# model_params = dict(
#   n_estimators = 100,
#   max_depth = 1
# )

model_params = {
  'n_estimators': 100,
  'max_depth': 1
}

model = RandomForestRegressor()
model.set_params(**model_params)

RandomForestRegressor(max_depth=1)

# Transformers

In [71]:
def minkowski_distance(
    data, p, start_lat = 'pickup_latitude', start_lon = 'pickup_longitude', end_lat = 'dropoff_latitude', end_lon = 'dropoff_longitude'
):
    x1 = data[start_lon]
    x2 = data[end_lon]

    y1 = data[start_lat]
    y2 = data[end_lat]
    
    minkowski = ((abs(x2 - x1) ** p) + (abs(y2 - y1)) ** p) ** (1 / p)

    return minkowski

In [72]:
class DistanceTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, distance_type = 'euclidian', **kwargs):
        self.distance_type = distance_type

    def transform(self, X, y = None):
        # Guard clause
        assert isinstance(X, pd.DataFrame)
        
        # Handling two different types of distance
        if self.distance_type == 'euclidian':
            X['distance'] = minkowski_distance(X, p = 2)

        if self.distance_type == 'manhattan':
            X['distance'] = minkowski_distance(X, p = 1)

        return X[['distance']]

    def fit(self, X, y = None):
        return self

# Pipeline

In [79]:
pipe_distance = make_pipeline(
    DistanceTransformer(),
    StandardScaler()
)


columns = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']

feateng_blocks = [
    ('distance', pipe_distance, columns),
]

features_encoder = ColumnTransformer(feateng_blocks)

pipeline = Pipeline(
    steps=[
            ('features', features_encoder),
            ('model', model)
    ]
)

In [80]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 ColumnTransformer(transformers=[('distance',
                                                  Pipeline(steps=[('distancetransformer',
                                                                   DistanceTransformer()),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  ['pickup_latitude',
                                                   'pickup_longitude',
                                                   'dropoff_latitude',
                                                   'dropoff_longitude'])])),
                ('model', RandomForestRegressor(max_depth=1))])

# Metrics

In [81]:
def compute_rmse(y_pred, y_true):
    return np.sqrt(((y_pred - y_true) ** 2).mean())

In [82]:
y_pred = pipeline.predict(X_test)

In [83]:
rmse = compute_rmse(y_pred, y_test)

In [84]:
rmse

13.758602977195064

# MLflow

In [86]:
class MLFlowBase():

    def __init__(self, experiment_name, MLFLOW_URI):
        self.experiment_name = experiment_name
        self.MLFLOW_URI = MLFLOW_URI

    @memoized_property
    def mlflow_client(self):
        mlflow.set_tracking_uri(self.MLFLOW_URI)

        return MlflowClient()

    @memoized_property
    def mlflow_experiment_id(self):
        try:
            return self.mlflow_client.create_experiment(self.experiment_name)
        except BaseException:
            return self.mlflow_client.get_experiment_by_name(self.experiment_name).experiment_id

    def mlflow_create_run(self):
        self.mlflow_run = self.mlflow_client.create_run(self.mlflow_experiment_id)

    def mlflow_log_param(self, key, value):
        self.mlflow_client.log_param(self.mlflow_run.info.run_id, key, value)

    def mlflow_log_metric(self, key, value):
        self.mlflow_client.log_metric(self.mlflow_run.info.run_id, key, value)

# Trainer parameters

In [87]:
pipeline.get_params()

{'memory': None,
 'steps': [('features',
   ColumnTransformer(transformers=[('distance',
                                    Pipeline(steps=[('distancetransformer',
                                                     DistanceTransformer()),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    ['pickup_latitude', 'pickup_longitude',
                                     'dropoff_latitude', 'dropoff_longitude'])])),
  ('model', RandomForestRegressor(max_depth=1))],
 'verbose': False,
 'features': ColumnTransformer(transformers=[('distance',
                                  Pipeline(steps=[('distancetransformer',
                                                   DistanceTransformer()),
                                                  ('standardscaler',
                                                   StandardScaler())]),
                                  ['p

# Gridsearch

In [89]:
grid_search = GridSearchCV(
    pipeline,
    param_grid = {
        'features__distance__standardscaler__copy': [True],
        'model__min_samples_leaf': [3],
        'model__oob_score': [True],
        'model__min_weight_fraction_leaf': [0.0, 0.1]
    },
    cv = 5
)

grid_search.fit(X_train, y_train)
grid_search.score(X_test, y_test)

grid_search.best_estimator_
grid_search.best_params_

{'features__distance__standardscaler__copy': True,
 'model__min_samples_leaf': 3,
 'model__min_weight_fraction_leaf': 0.0,
 'model__oob_score': True}