First we need to train a model using mlflow as a tracking server and then log its artifact.

// Then we need to create a separate directory and bring in all the Pipfile, Pipfile.lock , predict.py, and test.py file

// We need to modify the predict.py file in order to load the model trained in mlflow

// Next we need to configure the aws credentials setting ```AWS_SECRET_ID``` and ```AWS_SECRET_ACCESS_KEY``` as environment variables by typing the following commands in the bash

```javascript                   
export AWS_SECRET_ID = Ay78***********
```
and
```javascript
export AWS_SECRET_ACCESS_KEY = *************
```

//Afterwards we need to start the mlflow server using the following commands

```cli
mflow server -h 0.0.0.0 --backend-store-uri sqlite:///mlflow.db --default-artifact-root= S3://<bucket name>
```

Then we should create an experiment for which the artifact store must be the same

```python
import mlflow
mlflow.create_experiment("exp_name",artifact_location="S3://<bucket_name>/path")
```

However while logging the model, if we execute the following code
```python
mlflow.sklearn.log_model(model_name, artifact_path="model")
```

The model will be stored in the location: ```S3://<bucket_name>/path/model```


Doing these steps enables us to retrieve the artifacts when required, without running the server locally, one can easily access the artifacts from s3 buckets provided they have the necessary credentials

------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------

In [4]:
import mlflow
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error
import optuna


import os
from dotenv import load_dotenv
load_dotenv()

True

In [5]:
data = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-11.parquet")


In [6]:
data.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,1,2023-11-01 00:03:03,2023-11-01 01:04:08,2.0,13.6,1.0,N,132,26,2,61.8,2.75,0.5,0.0,0.0,1.0,66.05,0.0,1.75
1,1,2023-11-01 00:03:28,2023-11-01 00:23:59,0.0,3.5,1.0,N,140,7,1,20.5,3.5,0.5,5.1,0.0,1.0,30.6,2.5,0.0
2,2,2023-10-31 23:58:05,2023-11-01 00:54:03,4.0,18.61,2.0,N,132,230,1,70.0,0.0,0.5,16.54,6.94,1.0,99.23,2.5,1.75
3,2,2023-11-01 00:03:50,2023-11-01 00:04:59,1.0,0.39,1.0,N,236,236,1,4.4,1.0,0.5,1.88,0.0,1.0,11.28,2.5,0.0
4,2,2023-11-01 00:06:30,2023-11-01 00:14:25,1.0,1.2,1.0,N,236,141,1,10.0,1.0,0.5,3.0,0.0,1.0,18.0,2.5,0.0


In [7]:
data.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'Airport_fee'],
      dtype='object')

In [8]:
data.shape[0]*0.8

2671772.0

In [9]:
def data_preparation(data: pd.DataFrame) -> pd.DataFrame:
    data['duration'] = data['tpep_dropoff_datetime'] - data['tpep_pickup_datetime']
    data['duration'] = data['duration'].apply(lambda td: td.total_seconds()/60)
    X = data[['passenger_count','trip_distance','PULocationID','DOLocationID','fare_amount']]
    y = data['duration']
    return X,y

In [10]:
data.shape[0]

3339715

In [11]:
X_train_val, y_train_val = data_preparation(data[:20000])
X_test, y_test = data_preparation(data[20001:25000])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['duration'] = data['tpep_dropoff_datetime'] - data['tpep_pickup_datetime']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['duration'] = data['duration'].apply(lambda td: td.total_seconds()/60)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['duration'] = data['tpep_dropoff_datetime'] 

In [12]:
X_train,X_val,y_train,y_val = train_test_split(X_train_val,y_train_val, test_size=0.3,random_state=44,shuffle=True)

In [13]:
xtr = X_train.to_dict(orient='records')
xval = X_val.to_dict(orient='records')

In [14]:
dv = DictVectorizer()


In [15]:
train_data = dv.fit_transform(xtr)
val_data = dv.transform(xval)

## Create and optuna hyperparameter tuner


In [16]:
RandomForestRegressor??

[0;31mInit signature:[0m
[0mRandomForestRegressor[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mn_estimators[0m[0;34m=[0m[0;36m100[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcriterion[0m[0;34m=[0m[0;34m'squared_error'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_depth[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_samples_split[0m[0;34m=[0m[0;36m2[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_samples_leaf[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_weight_fraction_leaf[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_features[0m[0;34m=[0m[0;36m1.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_leaf_nodes[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_impurity_decrease[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbootstrap[0m[0;34m=[0m[0;32mTrue[0m[0;34m,

In [17]:

def objective(trial):
    param = {
        "n_estimators": trial.suggest_int('n_estimators',100,200,50),
        "max_depth": trial.suggest_int('max_depth',10,50,5),
    }

    rf_reg = RandomForestRegressor(**param)
    rf_reg.fit(X_train,y_train)
    preds = rf_reg.predict(X_val)
    accuracy = mean_squared_error(y_pred=preds, y_true=y_val,squared=False)
    return accuracy


sampler = optuna.samplers.TPESampler()
study = optuna.create_study(direction="minimize",sampler=sampler)

n_trials = 10
study.optimize(objective,n_trials=n_trials)
    

[I 2024-02-10 20:10:26,339] A new study created in memory with name: no-name-18ec57df-e26a-4b8e-b50b-a09efbfc780b


  "n_estimators": trial.suggest_int('n_estimators',100,200,50),
  "max_depth": trial.suggest_int('max_depth',10,50,5),
[I 2024-02-10 20:10:36,763] Trial 0 finished with value: 43.22374033359256 and parameters: {'n_estimators': 200, 'max_depth': 30}. Best is trial 0 with value: 43.22374033359256.
  "n_estimators": trial.suggest_int('n_estimators',100,200,50),
  "max_depth": trial.suggest_int('max_depth',10,50,5),
[I 2024-02-10 20:10:45,431] Trial 1 finished with value: 43.23545838768818 and parameters: {'n_estimators': 200, 'max_depth': 15}. Best is trial 0 with value: 43.22374033359256.
  "n_estimators": trial.suggest_int('n_estimators',100,200,50),
  "max_depth": trial.suggest_int('max_depth',10,50,5),
[I 2024-02-10 20:10:50,148] Trial 2 finished with value: 42.96906540137565 and parameters: {'n_estimators': 100, 'max_depth': 20}. Best is trial 2 with value: 42.96906540137565.
  "n_estimators": trial.suggest_int('n_estimators',100,200,50),
  "max_depth": trial.suggest_int('max_depth',

In [18]:
best_params = study.best_params

In [19]:
rf_reg = RandomForestRegressor(**best_params)

Your create this experiment only once

In [21]:
mlflow.create_experiment("nyc_taxi_exp_4.2_deploy_2",artifact_location="s3://mlflow-artifacts-remote-11/mlflow_artifacts/")

'12'

In [22]:
from mlflow.data.pandas_dataset import PandasDataset
dataset: PandasDataset = mlflow.data.from_pandas(X_train_val[:5])

  string_columns = trimmed_df.columns[(df.applymap(type) == str).all(0)]


#pickle the dv to log it as an artifact
```cpp
import pickle

with open('dv.bin','wb') as f_out:
    pickle.dump(dv,f_out)
```

In [31]:

mlflow.set_experiment("nyc_taxi_exp_4.2_deploy_2")

mlflow.set_tracking_uri("http://localhost:5000")

with mlflow.start_run():
    
    mlflow.log_params(best_params)
    rf_reg.fit(X_train,y_train)
    y_pred = rf_reg.predict(X_val)

    rmse = mean_squared_error(y_val,y_pred,squared=False)
    mlflow.log_metric("rmse",rmse)

    mlflow.log_input(dataset,'train_data')

    mlflow.log_artifact('dv.bin')
    mlflow.sklearn.log_model(rf_reg,artifact_path="model")



