In [16]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import mlflow
from sklearn.model_selection import train_test_split
import os

import urllib.request
import zipfile
from joblib import load, dump

In [61]:
# os.environ["AWS_PROFILE"] = "user"

mlflow.set_tracking_uri('http://ec2-3-76-206-134.eu-central-1.compute.amazonaws.com:5000')
mlflow.set_registry_uri('http://ec2-3-76-206-134.eu-central-1.compute.amazonaws.com:5000')
mlflow.set_experiment('bike-customer-segment-prediction')
print(f"artifact_uri: '{mlflow.get_artifact_uri()}', tracking_uri: '{mlflow.get_tracking_uri()}'")

artifact_uri: 's3://mlflow-artifacts-data-store/mlflow/6/ffcc450329904717aa01d20669349e07/artifacts', tracking_uri: 'http://ec2-3-76-206-134.eu-central-1.compute.amazonaws.com:5000'


In [62]:
def read_dataframe(file_name: str):
    df = pd.read_parquet(file_name)
    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df.duration = df.duration.apply(lambda td:td.total_seconds() / 60 )

    df = df[(df.duration >= 1) & (df.duration <= 60)]
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

def download_file(file_url):
    filehandle, _ = urllib.request.urlretrieve(file_url)

    zip_file_object = zipfile.ZipFile(filehandle, 'r')
    with zip_file_object as zip_ref:
        zip_ref.extractall('./data/')
    first_file = zip_file_object.namelist()[0]
    print(f"{first_file} Downloaded successfully...")

    return first_file

def process_data(df: pd.DataFrame):
    df['user_type'].replace({'Subscriber': 1, 'Customer': 0}, inplace=True)
    df[categorical] = df[categorical].astype(str)

    return df

def load_data(zip_file_url):
    file_name = download_file(zip_file_url)
    df = pd.read_csv(f"./data/{file_name}")
    print(f"{file_name} loaded successfully.")
    return process_data(df)


categorical = ['start_station_id', 'end_station_id', 'bike_id', 'start_station_latitude', 'start_station_longitude', 'end_station_latitude', 'end_station_longitude']
numerical = ['duration_sec']


df_train = load_data('https://s3.amazonaws.com/baywheels-data/201801-fordgobike-tripdata.csv.zip')
df_val = load_data('https://s3.amazonaws.com/baywheels-data/201802-fordgobike-tripdata.csv.zip')
df_train.describe()
# df = df.sample(n=1000000, random_state=42)
# df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)
# df_train = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet')
# df_val = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet')

201801-fordgobike-tripdata.csv Downloaded successfully...
201801-fordgobike-tripdata.csv loaded successfully.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['user_type'].replace({'Subscriber': 1, 'Customer': 0}, inplace=True)
  df['user_type'].replace({'Subscriber': 1, 'Customer': 0}, inplace=True)


201802-fordgobike-tripdata.csv Downloaded successfully...
201802-fordgobike-tripdata.csv loaded successfully.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['user_type'].replace({'Subscriber': 1, 'Customer': 0}, inplace=True)
  df['user_type'].replace({'Subscriber': 1, 'Customer': 0}, inplace=True)


Unnamed: 0,duration_sec,user_type
count,94802.0,94802.0
mean,870.93593,0.869792
std,2550.596891,0.336534
min,61.0,0.0
25%,359.0,1.0
50%,555.0,1.0
75%,854.0,1.0
max,85546.0,1.0


In [63]:
df_train.head()

Unnamed: 0,duration_sec,start_time,end_time,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,end_station_longitude,bike_id,user_type,bike_share_for_all_trip
0,75284,2018-01-31 22:52:35.2390,2018-02-01 19:47:19.8240,120,Mission Dolores Park,37.7614205,-122.4264353,285,Webster St at O'Farrell St,37.7835208353,-122.4311578274,2765,1,No
1,85422,2018-01-31 16:13:34.3510,2018-02-01 15:57:17.3100,15,San Francisco Ferry Building (Harry Bridges Pl...,37.795392,-122.394203,15,San Francisco Ferry Building (Harry Bridges Pl...,37.795392,-122.394203,2815,0,No
2,71576,2018-01-31 14:23:55.8890,2018-02-01 10:16:52.1160,304,Jackson St at 5th St,37.3487586867,-121.8947978318,296,5th St at Virginia St,37.3259984,-121.87712,3039,0,No
3,61076,2018-01-31 14:53:23.5620,2018-02-01 07:51:20.5000,75,Market St at Franklin St,37.7737932061,-122.4212390184,47,4th St at Harrison St,37.7809545996,-122.3997491598,321,0,No
4,39966,2018-01-31 19:52:24.6670,2018-02-01 06:58:31.0530,74,Laguna St at Hayes St,37.7764348192,-122.4262440205,19,Post St at Kearny St,37.788975,-122.403452,617,1,No


In [66]:
import sklearn as sk
from sklearn.pipeline import make_pipeline
import pickle

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

In [67]:
def prepare_dictionaries(df: pd.DataFrame):
    # df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    # categorical = ['PU_DO']
    # numerical = ['trip_distance']
    dicts = df[categorical + numerical].to_dict(orient='records')
    return dicts

In [68]:
train_dicts = prepare_dictionaries(df_train)
val_dicts = prepare_dictionaries(df_val)

In [69]:
target = 'user_type'
y_train = df_train[target].values
y_val = df_val[target].values

## Training the model

In [72]:
mlflow.end_run()
with mlflow.start_run():
    mlflow.set_tag('model', 'linear_regression')
    mlflow.log_param('train_data_path','https://s3.amazonaws.com/baywheels-data/201801-fordgobike-tripdata.csv.zip')
    mlflow.log_param('validation_data_path','https://s3.amazonaws.com/baywheels-data/201802-fordgobike-tripdata.csv.zip')
    
    lin_pipeline = make_pipeline(
        DictVectorizer(),
        LinearRegression()
    )
    lin_pipeline.fit(train_dicts, y_train)
    y_pred = lin_pipeline.predict(val_dicts)
    
    rmse = mean_squared_error(y_pred, y_val, squared=False)
    mlflow.sklearn.log_model(lin_pipeline, artifact_path="model")
    # mlflow.sklearn.autolog()
    mlflow.log_metric('rmse', rmse)

2024/08/19 22:45:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run hilarious-eel-295 at: http://ec2-3-76-206-134.eu-central-1.compute.amazonaws.com:5000/#/experiments/6/runs/b70f08655f264443b2f9269c013ee2d7.
2024/08/19 22:45:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://ec2-3-76-206-134.eu-central-1.compute.amazonaws.com:5000/#/experiments/6.


ProfileNotFound: The config profile (user) could not be found