# Train Model

This notebook demonstrates how to train a simple linear regression model, using a synthetic dataset downloaded from cloud storage (AWS S3). It persists the trained model and its metrics locally, before uploading them to cloud storage for use elsewhere.

## Imports

In [1]:
import re
from datetime import date, datetime
from typing import Tuple

import boto3 as aws
import numpy as np
import pandas as pd
from botocore.exceptions import ClientError
from joblib import dump
from sklearn.base import BaseEstimator
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error, max_error, r2_score
from sklearn.model_selection import train_test_split

## Load Dataset

Load all available data from an AWS S3 bucket. We start by defining a helper function that downloads every dataset file and combines them into a single DataFrame.

In [2]:
def download_latest_dataset(aws_bucket: str) -> Tuple[pd.DataFrame, date]:
    """Get all available data from AWS S3 bucket.

    This function reads every dated CSV file stored under the
    ``datasets/`` prefix of an AWS S3 bucket and combines them, oldest
    first, into a single Pandas DataFrame object.

    Args:
        aws_bucket: Name of the S3 bucket that holds the datasets.

    Returns:
        A tuple of the combined DataFrame and the date of the most
        recent dataset, as parsed from its object key.

    Raises:
        ClientError: If S3 rejects the list or download requests. The
            error is logged and then re-raised.
        LookupError: If no object key under ``datasets/`` contains a
            date, so there is nothing to load.
    """
    date_pattern = re.compile('20[2-9][0-9]-[0-1][0-9]-[0-3][0-9]')

    def _date_from_object_key(key: str) -> date:
        """Extract date from S3 file object key (key must contain one)."""
        date_string = date_pattern.search(key).group(0)
        return datetime.strptime(date_string, '%Y-%m-%d').date()

    def _load_dataset_from_aws_s3(s3_obj_key: str) -> pd.DataFrame:
        """Load CSV datafile from AWS S3 into DataFrame."""
        object_data = s3_client.get_object(
            Bucket=aws_bucket,
            Key=s3_obj_key
        )
        return pd.read_csv(object_data['Body'])

    print(f'downloading all available training data from s3://{aws_bucket}/datasets')
    try:
        s3_client = aws.client('s3')
        # A single list call returns at most 1,000 keys, so page through
        # the listing to be sure no dataset is silently dropped.
        pages = s3_client.get_paginator('list_objects').paginate(
            Bucket=aws_bucket,
            Prefix='datasets/'
        )
        # A page has no 'Contents' entry when nothing matches the prefix.
        object_keys = [
            obj['Key']
            for page in pages
            for obj in page.get('Contents', [])
        ]
        # Skip keys that carry no date, such as the zero-byte 'datasets/'
        # folder placeholder that the S3 console creates.
        object_keys_and_dates = [
            (key, _date_from_object_key(key))
            for key in object_keys
            if date_pattern.search(key)
        ]
        if not object_keys_and_dates:
            raise LookupError(
                f'no dated datasets found in s3://{aws_bucket}/datasets'
            )
        ordered_dataset_objs = sorted(object_keys_and_dates, key=lambda e: e[1])
        dataset = pd.concat([
            _load_dataset_from_aws_s3(obj_key)
            for obj_key, _ in ordered_dataset_objs
        ])
    except ClientError:
        print(f'failed to download training data from s3://{aws_bucket}/datasets')
        # Re-raise so the caller sees the real S3 failure instead of an
        # unrelated error about unassigned local variables further down.
        raise
    most_recent_date = ordered_dataset_objs[-1][1]
    return (dataset, most_recent_date)


We now apply `download_latest_dataset` to the project's S3 bucket.

In [3]:
# Download every dataset in the project bucket; `data_date` is the date parsed
# from the object key of the most recent dataset.
data, data_date = download_latest_dataset('bodywork-ml-ops-project')
print(f'- most recent data added on {data_date}\n')
# Render the combined DataFrame as the cell's rich output.
display(data)

downloading all available training data from s3://bodywork-ml-ops-project/datasets
- most recent data added on 2021-01-13



Unnamed: 0,y,X
0,40.815915,50.173483
1,75.930485,74.614141
2,38.367376,14.230011
3,73.691793,69.397528
4,55.421877,92.700290
...,...,...
1435,63.243651,65.926000
1436,36.271838,6.710064
1437,41.803066,27.505270
1438,45.139632,50.133207


## Define Task Metrics

This is a regression task, so we focus on:

* Mean Absolute Percentage Error (MAPE)
* R-Squared (R2)
* Maximum Residual

In [4]:
def model_metrics(y_actual, y_predicted) -> pd.DataFrame:
    """Score predictions against actual values.

    Args:
        y_actual: Observed target values.
        y_predicted: Model predictions for the same observations.

    Returns:
        A one-row DataFrame with columns 'MAPE' (mean absolute
        percentage error), 'R2' (R-squared) and 'MR' (maximum residual).
    """
    # Column name -> scoring function; dict order fixes the column order.
    scorers = {
        'MAPE': mean_absolute_percentage_error,
        'R2': r2_score,
        'MR': max_error,
    }
    return pd.DataFrame({
        column: [score(y_actual, y_predicted)]
        for column, score in scorers.items()
    })

## Split Data into Train and Test Subsets

We hold out 20% of the data to use for testing the model.

In [5]:
# Reshape the single feature into a one-column 2-D array; labels stay 1-D.
X = data['X'].to_numpy().reshape(-1, 1)
y = data['y'].to_numpy()

# A fixed random_state makes the 80/20 split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Train Model and Compute Metrics

In [6]:
# `fit` returns the fitted estimator, so construction and training can chain.
ols_regressor = LinearRegression(fit_intercept=True).fit(X_train, y_train)

# Score on the held-out test subset only.
y_test_predictions = ols_regressor.predict(X_test)
metrics = model_metrics(y_test, y_test_predictions)
del y_test_predictions  # keep the kernel namespace as it was

# `metrics` holds a single row, so index 0 selects each metric's value.
for k, v in metrics.to_dict().items():
    print(f'{k}: {v[0]:.2f}')

MAPE: 0.17
R2: 0.69
MR: 33.25


## Persist Model and Metrics

Save the trained model and its metrics to local files, then upload both artefacts to AWS S3.

In [7]:
def make_artefact_filenames(data_date: str) -> Tuple[str, str]:
    """Build the model and metrics filenames for a given data date.

    Args:
        data_date: Date to embed in the filenames. It is interpolated
            with an f-string, so a ``datetime.date`` works as well as a
            string such as '2021-01-13'.

    Returns:
        A ``(model_filename, metrics_filename)`` tuple, i.e.
        ``('regressor-<date>.joblib', 'regressor-<date>.csv')``.
    """
    return (f'regressor-{data_date}.joblib', f'regressor-{data_date}.csv')


# Name both artefacts after the date of the most recent dataset, then write
# them to local files (relative paths, so the current working directory).
model_filename, metrics_filename = make_artefact_filenames(data_date)
dump(ols_regressor, model_filename)
metrics.to_csv(metrics_filename, header=True, index=False)

s3_bucket_name = 'bodywork-ml-ops-project'
s3_client = aws.client('s3')

# The model and its metrics are stored under separate key prefixes
# ('models/' and 'model-metrics/') in the same bucket.
# NOTE(review): upload errors are not caught here, so an S3 failure stops
# the cell before the corresponding confirmation is printed.
s3_client.upload_file(
    model_filename,
    s3_bucket_name,
    f'models/{model_filename}'
)
print(f'uploaded {model_filename} to s3://{s3_bucket_name}/models/')

s3_client.upload_file(
    metrics_filename,
    s3_bucket_name,
    f'model-metrics/{metrics_filename}'
)
print(f'uploaded {metrics_filename} to s3://{s3_bucket_name}/model-metrics/')

uploaded regressor-2021-01-13.joblib to s3://bodywork-ml-ops-project/models/
uploaded regressor-2021-01-13.csv to s3://bodywork-ml-ops-project/model-metrics/
