In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error
import pickle
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
import mlflow
import os

In [None]:
def read_data_frame(file_path):
    """Load a trip-record parquet file and prepare it for duration modelling.

    Parameters
    ----------
    file_path : str
        Path or URL handed to ``pd.read_parquet``. The file must provide the
        columns ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        The loaded rows whose ``duration`` (dropoff minus pickup, in minutes)
        lies in the closed interval [1, 60], with ``PULocationID`` and
        ``DOLocationID`` cast to ``str``.
    """
    df = pd.read_parquet(file_path)

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    elapsed = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = elapsed.dt.total_seconds() / 60

    # Keep trips lasting between 1 and 60 minutes, bounds included.
    # .copy() detaches the result from the unfiltered frame so the column
    # assignment below writes to an independent frame instead of a slice
    # (which is what raises SettingWithCopyWarning).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    # Location IDs are identifiers, not quantities: store them as strings.
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    return df
    
    

In [None]:
# The 2021-01 file is used for training, the 2021-02 file for validation.
df_train = read_data_frame(
    'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet'
)
df_val = read_data_frame(
    'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet'
)

In [None]:
# Summarise the training frame: column names, dtypes and non-null counts.
df_train.info()

In [None]:
# Disabled snippet kept as a bare string literal, so it is never executed;
# as written it would split df_train's columns into object and non-object dtypes.
'''cat_col=df_train.select_dtypes(include='object')
   num_col=df_train.select_dtypes(exclude='object')'''

In [None]:
# Preview only the first rows instead of rendering the whole frame.
df_train.head()

In [None]:
# sns.distplot is deprecated; histplot with a KDE overlay and density scaling
# is the documented replacement (cut=3 keeps the KDE tails distplot drew).
sns.histplot(df_train.duration, kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Preview the first few durations (in minutes) rather than echoing the whole column.
df_train.duration.head()

In [None]:
# Summary statistics of the trip duration column.
df_train['duration'].describe()

In [None]:
# Same summary, but reporting the upper-tail percentiles of the duration.
tail_percentiles = [0.95, 0.98, 0.99]
df_train['duration'].describe(percentiles=tail_percentiles)

In [None]:
# Baseline model: vectorise pickup/dropoff IDs plus trip distance, fit a
# linear regression, and score it on the same rows it was trained on.
target = 'duration'
cat_cols = ['PULocationID', 'DOLocationID']
num_cols = ['trip_distance']

# DictVectorizer consumes one {column: value} dict per row.
train_dicts = df_train[cat_cols + num_cols].to_dict(orient='records')
dv = DictVectorizer()
x_train = dv.fit_transform(train_dicts)
y_train = df_train[target].values

lr = LinearRegression().fit(x_train, y_train)
y_pred = lr.predict(x_train)

# Root-mean-squared error of the fit on the training data.
mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mse)
rmse

In [None]:
# Overlay the distributions of predicted and actual durations.
# sns.distplot is deprecated; histplot with a KDE overlay and density scaling
# is the documented replacement. A low alpha keeps both layers visible.
hist_style = dict(kde=True, stat='density', kde_kws=dict(cut=3), alpha=0.4)
sns.histplot(y_pred, label='Prediction', **hist_style)
sns.histplot(y_train, label='Actual', **hist_style)

plt.legend()

In [None]:
# Combine pickup and dropoff IDs into a single "<pickup>_<dropoff>" feature
# on both the training and the validation frame.
for frame in (df_train, df_val):
    frame['PU_DO'] = frame['PULocationID'] + '_' + frame['DOLocationID']

In [None]:
# Feature set for the second model: the combined route ID plus trip distance.
cat_col = ['PU_DO']
num_col = ['trip_distance']

# One {column: value} dict per row for each split.
train_dicts = df_train[cat_col + num_col].to_dict(orient='records')
val_dicts = df_val[cat_col + num_col].to_dict(orient='records')

# Fit the vectorizer on the training rows only, then reuse it for validation
# so both matrices share the same columns.
dv = DictVectorizer()
x_train = dv.fit_transform(train_dicts)
x_val = dv.transform(val_dicts)

In [None]:
# Target vectors (trip duration) for the training and validation splits.
target = 'duration'
y_train, y_val = df_train[target].values, df_val[target].values

In [None]:
# Fit on the training split, then report RMSE on the validation split.
lr = LinearRegression().fit(x_train, y_train)
y_pred = lr.predict(x_val)

mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
rmse

### Use MLflow

In [None]:
# Persist the fitted vectorizer and model together as one (dv, lr) tuple.
model_dir = './Models'
os.makedirs(model_dir, exist_ok=True)
with open('./Models/lin_rdge.bin', 'wb') as model_file:
    pickle.dump((dv, lr), model_file)
    

In [None]:
# Show the kernel's working directory; relative paths such as './Models'
# are resolved against it.
import os
working_dir = os.getcwd()
print(working_dir)

In [None]:
mlflow.set_tracking_uri("http://localhost:5000") # tells MLflow where the tracking server is
mlflow.set_experiment("Mlops_ZoomCamp") # creates the experiment if missing and makes it the active one for the runs logged below

In [None]:
# Train a Lasso model inside an MLflow run and record its parameters,
# validation RMSE and pickled (vectorizer, model) pair.
with mlflow.start_run():
    mlflow.set_tag('developer','Ebrahim Emad')
    mlflow.log_param('train-data-path','https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet')
    mlflow.log_param('val-data-path','https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet')

    # One variable drives both the logged parameter and the estimator, so the
    # tracked alpha can never disagree with the alpha the model was fitted with.
    alpha = 0.1
    mlflow.log_param('alpha', alpha)
    lr = Lasso(alpha=alpha)
    lr.fit(x_train, y_train)

    y_pred = lr.predict(x_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mlflow.log_metric('rmse', rmse)

    # Re-pickle before logging: the file already on disk may hold a different
    # estimator than the one fitted in this run, and the artifact must be the
    # model that produced the RMSE logged above.
    os.makedirs('./Models', exist_ok=True)
    with open('./Models/lin_rdge.bin', 'wb') as f_out:
        pickle.dump((dv, lr), f_out)
    mlflow.log_artifact('./Models/lin_rdge.bin', artifact_path='models_pickle')

In [None]:
# Report which MLflow release this kernel is running.
import mlflow
mlflow_version = mlflow.__version__
print(mlflow_version)

In [None]:
import joblib

# Load the logged artifact file directly with joblib (not through MLflow).
# NOTE(review): the path appears to embed one specific experiment/run id, so it
# only resolves where that run's artifact directory exists — confirm before re-running.
# NOTE(review): this deserialises a pickle; only load artifacts from a trusted store.
model_path = "./02-experiment-tracking/mlflow_artifacts/1/2ee972101ffc40278a565d8d80990512/artifacts/models_pickle/lin_rdge.bin"
loaded_model = joblib.load(model_path)
# Echo the loaded object to inspect what the artifact contains.
loaded_model

In [None]:
# Element 1 of the loaded object is used as the estimator
# (presumably the artifact is a (vectorizer, model) pair — verify against the run).
restored_estimator = loaded_model[1]
y_pred = restored_estimator.predict(x_val)
y_pred