In [5]:
from pathlib import Path
import os
import pickle

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

import mlflow
import mlflow.sklearn

In [6]:
prepared_data = "/tmp/prep"
model_output = "/tmp/train"
os.makedirs(model_output, exist_ok = True)

regressor__n_estimators = 500
regressor__bootstrap = 1
regressor__max_depth = 10
regressor__max_features = "auto" 
regressor__min_samples_leaf = 4
regressor__min_samples_split = 5

In [7]:
TARGET_COL = "cost"

NUMERIC_COLS = [
    "distance",
    "dropoff_latitude",
    "dropoff_longitude",
    "passengers",
    "pickup_latitude",
    "pickup_longitude",
    "pickup_weekday",
    "pickup_month",
    "pickup_monthday",
    "pickup_hour",
    "pickup_minute",
    "pickup_second",
    "dropoff_weekday",
    "dropoff_month",
    "dropoff_monthday",
    "dropoff_hour",
    "dropoff_minute",
    "dropoff_second",
]

CAT_NOM_COLS = [
    "store_forward",
    "vendor",
]

CAT_ORD_COLS = [
]

def main():
    
    lines = [
        f"Training data path: {prepared_data}",
        f"Model output path: {model_output}",
    ]

    for line in lines:
        print(line)

    print("mounted_path files: ")
    arr = os.listdir(prepared_data)
    print(arr)

    train_data = pd.read_csv((Path(prepared_data) / "train.csv"))

    # Split the data into input(X) and output(y)
    y_train = train_data[TARGET_COL]
    X_train = train_data[NUMERIC_COLS + CAT_NOM_COLS + CAT_ORD_COLS]
    # Train a Linear Regression Model with the train set

    # numerical features
    numeric_transformer = Pipeline(steps=[
        ('standardscaler', StandardScaler())])

    # ordinal features transformer
    ordinal_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(missing_values=np.nan, strategy="most_frequent")),
        ('minmaxscaler', MinMaxScaler())
    ])

    # nominal features transformer
    nominal_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(missing_values=np.nan, strategy="most_frequent")),
        ('onehot', OneHotEncoder(sparse=False))
    ])

    # imputer only for all other features
    imputer_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(missing_values=np.nan, strategy="most_frequent"))
    ])

    # preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('numeric', numeric_transformer, NUMERIC_COLS),
           #('ordinal', ordinal_transformer, CAT_ORD_COLS),
            ('nominal', nominal_transformer, CAT_NOM_COLS)], # other features are already binary
            remainder="drop")

    # append regressor to preprocessing pipeline.
    # now we have a full prediction pipeline.
    
    #model = Pipeline(steps=[('preprocessor', preprocessor),
    #                      ('regressor', RandomForestRegressor(
    #                          n_estimators = regressor__n_estimators,
    #                          bootstrap = regressor__bootstrap,
    #                          max_depth = regressor__max_depth,
    #                          max_features = regressor__max_features,
    #                          min_samples_leaf = regressor__min_samples_leaf,
    #                          min_samples_split = regressor__min_samples_split,
    #                          random_state=0))])


    model = RandomForestRegressor(n_estimators = regressor__n_estimators,
                                  bootstrap = regressor__bootstrap,
                                  max_depth = regressor__max_depth,
                                  max_features = regressor__max_features,
                                  min_samples_leaf = regressor__min_samples_leaf,
                                  min_samples_split = regressor__min_samples_split,
                                  random_state=0)

    mlflow.log_param("model", "RandomForestRegressor")
    mlflow.log_param("n_estimators", regressor__n_estimators)
    mlflow.log_param("bootstrap", regressor__bootstrap)
    mlflow.log_param("max_depth", regressor__max_depth)
    mlflow.log_param("max_features", regressor__max_features)
    mlflow.log_param("min_samples_leaf", regressor__min_samples_leaf)
    mlflow.log_param("min_samples_split", regressor__min_samples_split)

    model.fit(X_train, y_train)

    # Predict using the Regression Model
    yhat_train = model.predict(X_train)

    # Evaluate Regression performance with the train set
    r2 = r2_score(y_train, yhat_train)
    mse = mean_squared_error(y_train, yhat_train)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_train, yhat_train)

    mlflow.log_metric("train r2", r2)
    mlflow.log_metric("train mse", mse)
    mlflow.log_metric("train rmse", rmse)
    mlflow.log_metric("train mae", mae)

    # Visualize results
    plt.scatter(y_train, yhat_train,  color='black')
    plt.plot(y_train, y_train, color='blue', linewidth=3)
    plt.xlabel("Real value")
    plt.ylabel("Predicted value")
    plt.savefig("/tmp/train/regression_results.png")
    mlflow.log_artifact("/tmp/train/regression_results.png")

    # Save the model
    pickle.dump(model, open((Path(model_output) / "model.pkl"), "wb"))

if __name__ == "__main__":
    main()
    

Training data path: /tmp/prep
Model output path: /tmp/train
mounted_path files: 
['train.csv', 'test.csv', 'val.csv']
    distance  dropoff_latitude  dropoff_longitude  passengers  \
0       1.80         40.678741         -73.980309           1   
1       0.50         40.754715         -73.925499           1   
2       0.90         40.669662         -73.911041           1   
3       2.72         40.774963         -73.892372           1   
4       6.83         40.756031         -73.945351           1   
..       ...               ...                ...         ...   
95      4.36         40.761974         -73.922142           1   
96      0.90         40.757500         -73.882881           1   
97      8.30         40.751842         -73.980522           1   
98      0.53         40.808556         -73.959656           1   
99      0.78         40.815735         -73.938713           1   

    pickup_latitude  pickup_longitude  pickup_weekday  pickup_month  \
0         40.679798        -73

AzureMLException: AzureMLException:
	Message: UserError: Resource Conflict: ArtifactId ExperimentRun/dcid.329b4ea3-a8ae-44e9-a762-6a3540092d93/regression_results.png already exists.
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "UserError: Resource Conflict: ArtifactId ExperimentRun/dcid.329b4ea3-a8ae-44e9-a762-6a3540092d93/regression_results.png already exists."
    }
}

In [None]:
!ls /tmp/train