In [2]:
!python -V

Python 3.9.12


In [3]:
import pandas as pd 
import seaborn as sns
import pickle
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

In [4]:
import mlflow

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("Mlflow_cooking")

2025/09/19 17:57:44 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/09/19 17:57:44 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


<Experiment: artifact_location='/workspaces/MLops-cookin-/02-MLflow/mlruns/1', creation_time=1758188410351, experiment_id='1', last_update_time=1758188410351, lifecycle_stage='active', name='Mlflow_cooking', tags={}>

In [5]:
mlflow.__version__

'3.1.4'

In [6]:
# making it a function for easy access
def read_dataframe(filename):
    df=pd.read_parquet(filename)

    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)

    df['duration']=df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df=df[(df.duration >= 1.0) & (df.duration <= 62)]

    categorical=['PULocationID','DOLocationID']
    #numerical=['trip_distance']

    df[categorical] = df[categorical].astype(str)
    
    return df

In [7]:
df_train=read_dataframe("https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet")
df_val=read_dataframe("https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet")

In [8]:
target="duration"
y_train=df_train[target].values
y_val=df_val[target].values

In [9]:
# trying the feature combination 
df_train["PU_OD"]=df_train["PULocationID"] + '_' + df_train['DOLocationID']
df_val["PU_OD"]=df_val["PULocationID"] + '_' + df_val['DOLocationID']

In [10]:
categorical=["PU_OD"]#'PULocationID','DOLocationID']
numerical=['trip_distance']

dv=DictVectorizer()

train_dicts=df_train[categorical + numerical].to_dict(orient="records")
X_train=dv.fit_transform(train_dicts)

val_dicts=df_val[categorical + numerical].to_dict(orient="records")
X_val=dv.transform(val_dicts)

In [11]:
# trying this model to xgboost
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [12]:
'''with open("models/preprocessor.b", 'wb') as f_out:
    pickle.dump(dv, f_out)'''

'with open("models/preprocessor.b", \'wb\') as f_out:\n    pickle.dump(dv, f_out)'

In [20]:
train= xgb.DMatrix(X_train, label=y_train)
valid=xgb.DMatrix(X_val, label=y_val)

In [13]:
with mlflow.start_run():
    
    train= xgb.DMatrix(X_train, label=y_train)
    valid=xgb.DMatrix(X_val, label=y_val)
    
    
    best_params={
        "learning_rate": 0.14988312150619953,
        "max_depth":66,
        "min_child_weight":1.0531411801474737,
        "objective":"reg:squarederror",
        "reg_alpha":0.01980913796072851,
        "reg_lambda":0.042524169343261004,
        "seed":13}
    #saving the best params in mlflow
    mlflow.log_params(best_params)
    
    booster=xgb.train(
            params=best_params,
            dtrain=train,
            num_boost_round=5,
            evals=[(valid, "validation")],
            early_stopping_rounds=50)
    
    y_pred = booster.predict(valid)
    rmse=mean_squared_error(y_val, y_pred, squared=False)
    #saving the metrics score in mlflow
    mlflow.log_metric("rmse",rmse)
    
    #saving the preprocessing data in mlflow
    #mlflow.log_artifact("models/preprocessor.b", artifact_path='preprocessor')
    
    #saving the model in mlflow
    mlflow.xgboost.log_model(xgb_model=booster, name="xgboost",
                            code_paths=['models/'])




'with mlflow.start_run():\n    \n    train= xgb.DMatrix(X_train, label=y_train)\n    valid=xgb.DMatrix(X_val, label=y_val)\n    \n    \n    best_params={\n        "learning_rate": 0.14988312150619953,\n        "max_depth":66,\n        "min_child_weight":1.0531411801474737,\n        "objective":"reg:squarederror",\n        "reg_alpha":0.01980913796072851,\n        "reg_lambda":0.042524169343261004,\n        "seed":13}\n    #saving the best params in mlflow\n    mlflow.log_params(best_params)\n    \n    booster=xgb.train(\n            params=best_params,\n            dtrain=train,\n            num_boost_round=5,\n            evals=[(valid, "validation")],\n            early_stopping_rounds=50)\n    \n    y_pred = booster.predict(valid)\n    rmse=mean_squared_error(y_val, y_pred, squared=False)\n    #saving the metrics score in mlflow\n    mlflow.log_metric("rmse",rmse)\n    \n    #saving the preprocessing data in mlflow\n    #mlflow.log_artifact("models/preprocessor.b", artifact_path=\'p

In [14]:
xgb.__version__

'2.1.4'

In [17]:
best_params={
        "learning_rate": 0.14988312150619953,
        "max_depth":66,
        "min_child_weight":1.0531411801474737,
        "objective":"reg:squarederror",
        "reg_alpha":0.01980913796072851,
        "reg_lambda":0.042524169343261004,
        "seed":13}