In [1]:
import argparse

from pathlib import Path
import os
import numpy as np
import pandas as pd

import mlflow

In [2]:
# Define Arguments for this step

class MyArguments:
    """Minimal attribute container for the settings of this step.

    Each keyword argument given to the constructor is stored on the
    instance as an attribute with the same name.
    """

    def __init__(self, /, **kwargs):
        # `self` is positional-only, so even a keyword literally named
        # "self" ends up in kwargs and is stored like any other setting.
        for attr_name, attr_value in kwargs.items():
            setattr(self, attr_name, attr_value)

# Input directory holding the raw files and output directory for the splits.
args = MyArguments(
    raw_data="../../data/",
    prepared_data="/tmp/prep",
)

# The output directory has to exist before the split files are written to it;
# exist_ok makes re-running this cell harmless.
os.makedirs(name=args.prepared_data, exist_ok=True)

In [3]:
def main(args):
    '''Read the raw taxi data, split it into train/val/test, and save the splits.

    Args:
        args: object exposing these attributes
            raw_data: directory that contains ``taxi-data.csv``.
            prepared_data: existing directory that receives ``train.csv``,
                ``val.csv`` and ``test.csv``.
            seed (optional): seed for the random split. When the attribute
                is absent a fixed default (42) is used so that re-running
                the notebook reproduces the same split; set it to ``None``
                to get a fresh random split on every call.

    Side effects:
        Prints the listing of ``args.raw_data``, logs the size of each split
        to MLflow ('train size', 'val size', 'test size') and writes the
        three CSV files.
    '''

    # ------------ Reading Data ------------ #
    # -------------------------------------- #

    print("mounted_path files: ")
    arr = os.listdir(args.raw_data)
    print(arr)

    data = pd.read_csv(Path(args.raw_data) / 'taxi-data.csv')

    # ------------- Split Data ------------- #
    # -------------------------------------- #

    # Cumulative upper bounds of the train and val buckets; everything at or
    # above val_upper goes to test (roughly 70% / 15% / 15%).
    train_upper = 0.7
    val_upper = 0.85

    # Draw one uniform number in [0, 1) per row from a seeded generator so the
    # split is reproducible across kernel restarts.
    rng = np.random.default_rng(getattr(args, 'seed', 42))
    random_data = rng.random(len(data))

    splits = {
        'train': data[random_data < train_upper],
        'val': data[(random_data >= train_upper) & (random_data < val_upper)],
        'test': data[random_data >= val_upper],
    }

    # Log all sizes first, then write the files (same order as before).
    for split_name, frame in splits.items():
        mlflow.log_metric(f'{split_name} size', frame.shape[0])

    # NOTE: to_csv keeps its default index=True, so the original row index is
    # written as the first (unnamed) column of every file.
    out_dir = Path(args.prepared_data)
    for split_name, frame in splits.items():
        frame.to_csv(out_dir / f'{split_name}.csv')


In [4]:
# Run the step inside an MLflow run. Using the run as a context manager
# guarantees it is closed even if main() raises, so a failed execution does
# not leave an active run behind that blocks the next attempt.
with mlflow.start_run():

    lines = [
        f"Raw data path: {args.raw_data}",
        f"Data output path: {args.prepared_data}",
    ]

    for line in lines:
        print(line)

    main(args)

Raw data path: ../../data/
Data output path: /tmp/prep
mounted_path files: 
['.amlignore', '.amlignore.amltmp', 'taxi-batch.csv', 'taxi-data.csv', 'taxi-request.json']


In [6]:
# Sanity check: list the output directory to confirm that the split files
# written by main() (train.csv, val.csv, test.csv) are present.
!ls /tmp/prep

test.csv  train.csv  val.csv
