In [1]:
# I/O
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import geopandas as gpd
import requests
from io import BytesIO
from zipfile import ZipFile
from datetime import datetime, timedelta

In [2]:
# URL of the zipped data file
urls = ['https://s3.amazonaws.com/capitalbikeshare-data/202401-capitalbikeshare-tripdata.zip',
        'https://s3.amazonaws.com/capitalbikeshare-data/202402-capitalbikeshare-tripdata.zip', 
       'https://s3.amazonaws.com/capitalbikeshare-data/202403-capitalbikeshare-tripdata.zip']

In [3]:
# List to store DataFrames
dataframes = []
# Loop through each URL
for url in urls:
    # Download the zipped file
    response = requests.get(url)
    zipfile = ZipFile(BytesIO(response.content))
    
    # List files in the zip file
    file_name = zipfile.namelist()[0]
    
    # Read the desired file into a pandas DataFrame
    with zipfile.open(file_name) as file:
        combined_df = pd.read_csv(file)
        dataframes.append(combined_df)

# Concatenate all DataFrames into one
combined_combined_df = pd.concat(dataframes, ignore_index=True)

In [4]:
# Display the DataFrame
combined_combined_df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,748A93D7DE8A41CD,classic_bike,2024-01-25 15:49:59,2024-01-25 15:52:35,1st & O St NW,31519.0,1st & L St NW,31677.0,38.908643,-77.012365,38.903819,-77.011987,member
1,75CBFD136F06305B,classic_bike,2024-01-02 16:44:58,2024-01-02 16:53:25,1st & O St NW,31519.0,4th & College St NW,31138.0,38.908643,-77.012365,38.921233,-77.018135,member
2,0536C9720F87E04C,classic_bike,2024-01-24 15:40:15,2024-01-24 15:43:55,1st & O St NW,31519.0,1st & L St NW,31677.0,38.908643,-77.012365,38.903819,-77.011987,member
3,9E17390C218783B5,classic_bike,2024-01-04 15:35:00,2024-01-04 15:37:35,1st & O St NW,31519.0,1st & L St NW,31677.0,38.908643,-77.012365,38.903819,-77.011987,member
4,00727D0E773CDFF7,electric_bike,2024-01-05 12:27:58,2024-01-05 12:35:40,1st & O St NW,31519.0,10th & G St NW,31274.0,38.90869,-77.012317,38.898243,-77.026235,casual


# Exploratory Data Analysis  

Let's start creating new variables within our dataset and taking a look at its structure. 

In [5]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 436946 entries, 0 to 436945
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   ride_id             436946 non-null  object 
 1   rideable_type       436946 non-null  object 
 2   started_at          436946 non-null  object 
 3   ended_at            436946 non-null  object 
 4   start_station_name  371894 non-null  object 
 5   start_station_id    371894 non-null  float64
 6   end_station_name    368605 non-null  object 
 7   end_station_id      368500 non-null  float64
 8   start_lat           436946 non-null  float64
 9   start_lng           436946 non-null  float64
 10  end_lat             436558 non-null  float64
 11  end_lng             436558 non-null  float64
 12  member_casual       436946 non-null  object 
dtypes: float64(6), object(7)
memory usage: 43.3+ MB


Start creating variables: 
- Start and End Dates
- Start and End Times
- Convert Station IDs to Character Integers
- Create new geocombined_dfs that have start and end locations so that we can create the neighborhoods that they started and ended in
- 

In [6]:
# Adding time_of_day
def get_time_of_day(hour):
    if 5 <= hour < 12:
        return 'morning'
    elif 12 <= hour < 17:
        return 'afternoon'
    elif 17 <= hour < 21:
        return 'evening'
    else:
        return 'night'

federal_holidays_2024 = [
    '2024-01-01',  # New Year's Day
    '2024-01-15',  # Martin Luther King Jr. Day
    '2024-02-19',  # Presidents' Day
    '2024-05-27',  # Memorial Day
    '2024-06-19',  # Juneteenth National Independence Day
    '2024-07-04',  # Independence Day
    '2024-09-02',  # Labor Day
    '2024-10-14',  # Columbus Day
    '2024-11-11',  # Veterans Day
    '2024-11-28',  # Thanksgiving Day
    '2024-12-25'   # Christmas Day
]

# Adding holiday_weekend
def is_holiday_weekend(date):
    dt = pd.to_datetime(date)
    if dt.strftime('%Y-%m-%d') in federal_holidays_2024 and dt.weekday() in [1, 2, 3]:  # Tuesday, Wednesday, Thursday
        return True
    return False

In [87]:
combined_df["started_at"]= pd.to_datetime(combined_df["started_at"])
combined_df["started_at_date"]= pd.to_datetime(combined_df["started_at"]).dt.date
combined_df["started_at_time"]= pd.to_datetime(combined_df["started_at"]).dt.time
combined_df["ended_at"]= pd.to_datetime(combined_df["ended_at"])
combined_df["ended_at_date"]= pd.to_datetime(combined_df["ended_at"]).dt.date
combined_df["ended_at_time"]= pd.to_datetime(combined_df["ended_at"]).dt.time
combined_df["start_station_id"] = combined_df["start_station_id"].astype(str)
combined_df["end_station_id"] = combined_df["end_station_id"].astype(str)
combined_df["duration"] = pd.to_datetime(combined_df["ended_at"]) - pd.to_datetime(combined_df["started_at"])
combined_df["duration_minutes"] = combined_df.duration.apply(lambda td: td.total_seconds() / 60)
combined_df['time_of_day'] = combined_df['started_at'].apply(lambda x: get_time_of_day(pd.to_datetime(x).hour))
combined_df['day_of_week'] = combined_df['started_at'].apply(lambda x: pd.to_datetime(x).strftime('%A'))
# Adding day_type
combined_df['day_type'] = combined_df['day_of_week'].apply(lambda x: 'weekend' if x in ['Saturday', 'Sunday'] else 'work-day')
# Adding falls_on_holiday
combined_df['falls_on_holiday'] = combined_df['started_at'].apply(lambda x: pd.to_datetime(x).strftime('%Y-%m-%d') in federal_holidays_2024)
combined_df['holiday_weekend'] = combined_df['started_at'].apply(lambda x: is_holiday_weekend(x))
# Filter dataframe to only have rides between 30 seconds and 2 hours
og_df_size = combined_df.shape[0]
combined_df = combined_df.loc[(combined_df['duration_minutes']>=.5) & (combined_df['duration_minutes']<=120) ]
combined_df = combined_df.loc[(combined_df['rideable_type']=="electric_bike") ]

In [89]:
og_df_size
new_df_size = combined_df.shape[0]
new_df_size/og_df_size

0.5003393988193603

In [90]:
combined_df.describe()

Unnamed: 0,started_at,ended_at,start_lat,start_lng,end_lat,end_lng,duration,duration_minutes,date,ended_at_dock
count,213758,213758,213758.0,213758.0,213758.0,213758.0,213758,213758.0,213758,213758.0
mean,2024-03-17 06:39:52.901084928,2024-03-17 06:52:50.848150528,38.90912,-77.030062,38.908813,-77.030059,0 days 00:12:57.947066308,12.965784,2024-03-16 15:47:55.547113728,0.69589
min,2024-03-01 00:00:57,2024-03-01 00:09:08,38.76,-77.37,38.71,-77.44,0 days 00:00:30,0.5,2024-03-01 00:00:00,0.0
25%,2024-03-10 11:01:00.750000128,2024-03-10 11:11:36.750000128,38.897164,-77.041643,38.89696,-77.041829,0 days 00:05:28,5.466667,2024-03-10 00:00:00,0.0
50%,2024-03-17 08:30:07.500000,2024-03-17 08:43:40,38.908606,-77.03,38.90849,-77.03,0 days 00:09:22,9.366667,2024-03-17 00:00:00,1.0
75%,2024-03-24 16:31:51.750000128,2024-03-24 16:49:34.249999872,38.922177,-77.01,38.921074,-77.01,0 days 00:15:29,15.483333,2024-03-24 00:00:00,1.0
max,2024-03-31 23:58:41,2024-04-01 00:22:09,39.119862,-76.82,39.119765,-76.82,0 days 01:59:55,119.916667,2024-03-31 00:00:00,1.0
std,,,0.026548,0.034586,0.026679,0.034838,0 days 00:12:57.385453174,12.956424,,0.460031


Create two new dataframes: 
1. Start times combined_df that has the location they started at if it has a station
2. End times combined_df that has the location they ended at if it has a station 

In [91]:
start_stations = combined_df[["start_station_name","start_station_id","start_lat","start_lng"]]
end_stations = combined_df[["end_station_name","end_station_id","end_lat","end_lng"]]
start_stations = start_stations.loc[start_stations['start_station_name'].notna()]
end_stations = end_stations.loc[end_stations['end_station_name'].notna()]
start_stations.rename(columns = {"start_station_name":"station_name",
                                "start_station_id":"station_id",
                                "start_lat":"lat",
                                 "start_lng":"lng"}, inplace=True)

end_stations.rename(columns = {"end_station_name":"station_name",
                                "end_station_id":"station_id",
                                "end_lat":"lat",
                                 "end_lng":"lng"}, inplace=True )
stations = pd.concat([start_stations, end_stations])
stations = stations.drop_duplicates(subset=["station_name","station_id"])

In [92]:
combined_gdf = gpd.GeoDataFrame(stations, geometry=gpd.points_from_xy(stations.lng, stations.lat), crs="EPSG:4326")

# Pull in Weather Data to add to rides

In [93]:
# Function to fetch weather data
def fetch_weather_data(lat, lon, date, api_key):
    url = "https://api.openweathermap.org/data/3.0/onecall/day_summary?"
    params = {
        'lat': lat,
        'lon': lon,
        'date': date.strftime('%Y-%m-%d'),
        'appid': api_key,
        'units':"imperial"
    }
    response = requests.get(url, params=params)
    return response.json()
# Parameters
api_key = "32964f206a33b141c84d7f13e55a92d4"
dc_lat = 38.8950368
dc_lon = -77.0365427
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 3, 31)

# Create a list of dates for each day within the date range
date_range = [start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)]


In [94]:
# Fetch data
weather_data = fetch_weather_data(dc_lat, dc_lon, start_date, api_key)

In [95]:
# Fetch data
#daily_data = []
#for date in date_range:
    weather_data = fetch_weather_data(dc_lat, dc_lon, date, api_key)
    daily_data.append({
        'date': weather_data['date'],
        'min_temp': weather_data['temperature']['min'],
        'max_temp': weather_data['temperature']['max'],
        'afternoon_temp': weather_data['temperature']['afternoon'],
        'night_temp': weather_data['temperature']['night'],
        'evening_temp': weather_data['temperature']['evening'],
        'morning_temp': weather_data['temperature']['morning'],
        'humidity': weather_data['humidity']['afternoon'],
        'pressure': weather_data['pressure']['afternoon'],
        'wind_speed': weather_data['wind']['max']['speed'],
        'wind_direction': weather_data['wind']['max']['direction'],
        'precipitation': weather_data['precipitation']['total'],
        'cloud_cover': weather_data['cloud_cover']['afternoon']
    })


IndentationError: unexpected indent (3592158730.py, line 4)

In [96]:
# Process and save data
#weather_df = pd.DataFrame(daily_data)
#weather_df.to_csv("data/raw_weather_data.csv")
weather_df = pd.read_csv("data/raw_weather_data.csv")

In [97]:
weather_df

Unnamed: 0.1,Unnamed: 0,date,min_temp,max_temp,afternoon_temp,night_temp,evening_temp,morning_temp,humidity,pressure,wind_speed,wind_direction,precipitation,cloud_cover
0,0,2024-01-01,37.89,41.83,41.68,40.59,38.08,38.53,69.0,1016.0,13.80,320.0,1.92,100.0
1,1,2024-01-02,33.87,44.47,41.04,38.88,39.76,35.73,57.0,1020.0,18.41,330.0,0.00,75.0
2,2,2024-01-03,31.01,47.46,44.62,33.21,41.90,31.42,54.0,1017.0,9.22,180.0,0.00,40.0
3,3,2024-01-04,31.82,42.89,42.89,39.24,37.83,38.59,59.0,1017.0,21.85,330.0,0.00,100.0
4,4,2024-01-05,25.38,41.05,36.84,30.76,34.93,26.01,54.0,1027.0,13.80,330.0,0.00,40.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,86,2024-03-27,44.87,48.36,46.92,46.89,48.36,45.07,90.0,1018.0,11.50,110.0,13.55,100.0
87,87,2024-03-28,46.31,52.41,49.89,47.79,51.33,47.59,83.0,1016.0,16.11,330.0,5.87,100.0
88,88,2024-03-29,36.84,61.74,55.80,46.74,59.54,36.97,47.0,1010.0,26.46,320.0,0.00,20.0
89,89,2024-03-30,37.51,64.02,61.00,45.63,64.02,37.78,34.0,1012.0,16.11,350.0,0.80,40.0


In [98]:
# Ensure 'started_at' is a datetime
combined_df['started_at'] = pd.to_datetime(combined_df['started_at'])

# Create a 'date' column in datetime format
combined_df['date'] = combined_df['started_at'].dt.date
combined_df['date'] = pd.to_datetime(combined_df['date'])

# Melt the weather data to make times of the day as rows
weather_melted = weather_df.reset_index().melt(id_vars=['date'], 
                                               value_vars=['afternoon_temp', 'night_temp', 'evening_temp', 'morning_temp'], 
                                               var_name='time_of_day', 
                                               value_name='temperature')
# Ensure 'date' is a datetime
weather_melted['date'] = pd.to_datetime(weather_melted['date'])
weather_df['date'] = pd.to_datetime(weather_df['date'])
# Map the time_of_day to match combined_df format
time_mapping = {
    'afternoon_temp': 'afternoon',
    'night_temp': 'night',
    'evening_temp': 'evening',
    'morning_temp': 'morning',
    
}
weather_melted['time_of_day'] = weather_melted['time_of_day'].map(time_mapping)
weather_melted = pd.merge(weather_melted, weather_df[['date','wind_speed','humidity']] , how='left', left_on=['date'], right_on=['date'])

In [99]:
# Merge based on 'date' and 'time_of_day'
merged_df = pd.merge(combined_df, weather_melted, how='left', left_on=['date', 'time_of_day'], right_on=['date', 'time_of_day'])
# Create target variable: 'ended_at_dock' (1 if ended at a dock, 0 if not)
merged_df['ended_at_dock'] = merged_df['end_station_name'].apply(lambda x: 1 if pd.notnull(x) else 0)

# Start Analysis for Underserved Rides 

The idea for this analysis is that if a rider completes their ride outside of a dock, they either had to find a place nearby to park their bike because a nearby dock was full OR because there simply was not a dock nearby. According to the National Association of City Transportation Officials's [Bike Share Station Siting Guide](https://nacto.org/publication/bike-share-station-siting-guide/), docks should be stationed approximately 1,000 feet, or a 5-minute walk from one another. 

In [100]:
# Create a neighborhood variable that tells us the neighborhood with which a bikeride started. We'll use this as a fixed effect variable 
neighborhoods = gpd.read_file("data/DC_Health_Planning_Neighborhoods.geojson")
merged_gdf =  gpd.GeoDataFrame(merged_df, geometry=gpd.points_from_xy(merged_df.start_lng, merged_df.start_lat), crs="EPSG:4326")
merged_gdf = merged_gdf.sjoin(neighborhoods, how="left")


In [101]:
# Remove all rides that did not start in a DC neighborhood 
merged_gdf = merged_gdf.loc[merged_gdf['OBJECTID'].notnull()]
# only use the columns that I think that we want to use for the design matrix 
mgdf_start_dm = merged_gdf[['rideable_type', 'started_at', 'start_station_name', 'start_station_id', 'member_casual', 'time_of_day', 'day_of_week','day_type', 'falls_on_holiday', 'holiday_weekend', 
                            'temperature', 'wind_speed','humidity','DC_HPN_NAME','end_station_name', 'ended_at_dock']]
# Ensure 'date' is a datetime
mgdf_start_dm['started_at'] = pd.to_datetime(mgdf_start_dm['started_at'])
mgdf_start_dm['started_at_hour'] = mgdf_start_dm['started_at'].dt.hour
# Create target variable: 'ended_at_dock' (1 if ended at a dock, 0 if not)
mgdf_start_dm['ended_at_dock'] = mgdf_start_dm['end_station_name'].apply(lambda x: 1 if pd.notnull(x) else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mgdf_start_dm['started_at'] = pd.to_datetime(mgdf_start_dm['started_at'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mgdf_start_dm['started_at_hour'] = mgdf_start_dm['started_at'].dt.hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mgdf_start_dm['ended_at_dock'] = mgdf_start_dm['end_station_

In [109]:
import pandas as pd
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, classification_report
# Set the MLFlow tracking URI (optional, use default local file-based store if not set)
mlflow.set_tracking_uri("http://127.0.0.1:5000")

# Set the experiment name (optional)
mlflow.set_experiment("Bike_Rides_Prediction")

<Experiment: artifact_location='mlflow-artifacts:/735336610594310444', creation_time=1721605587582, experiment_id='735336610594310444', last_update_time=1721605587582, lifecycle_stage='active', name='Bike_Rides_Prediction', tags={}>

In [110]:
# Split the data into training and testing sets
X = mgdf_start_dm.drop(['started_at','end_station_name','ended_at_dock'], axis=1)
y = mgdf_start_dm['ended_at_dock']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Engineering 

In [111]:
X.head()

Unnamed: 0,rideable_type,start_station_name,start_station_id,member_casual,time_of_day,day_of_week,day_type,falls_on_holiday,holiday_weekend,temperature,wind_speed,humidity,DC_HPN_NAME,started_at_hour
1,electric_bike,5th & K St NW,31600.0,member,morning,Sunday,weekend,False,False,46.35,8.01,57.0,CHINATOWN,10
2,electric_bike,8th & K St NE,31660.0,member,afternoon,Saturday,weekend,False,False,57.52,16.11,46.0,UNION STATION,15
3,electric_bike,Georgia & Missouri Ave NW,31411.0,member,night,Sunday,weekend,False,False,37.53,20.71,41.0,BRIGHTWOOD,22
4,electric_bike,Georgia & Missouri Ave NW,31411.0,member,afternoon,Tuesday,work-day,False,False,51.66,11.5,55.0,BRIGHTWOOD,16
5,electric_bike,Maryland Ave & E St NE,31640.0,member,afternoon,Saturday,weekend,False,False,46.08,19.57,92.0,KINGMAN PARK,15


In [112]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Define columns to be used in the model
numerical_features = [ 'temperature', 'humidity', 'wind_speed']
categorical_features = ['rideable_type','started_at_hour','start_station_id', 'time_of_day', 'member_casual', 
                        'day_of_week', 'day_type', 'falls_on_holiday', 'holiday_weekend','DC_HPN_NAME']

# Preprocessing for numerical data
numerical_transformer = StandardScaler()

# Preprocessing for categorical data
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the model pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

In [113]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Add classifier to the pipeline
pipeline.steps.append(('classifier', RandomForestClassifier(random_state=42)))

In [114]:
# Train the model
pipeline.fit(X_train, y_train)

In [115]:
# Evaluate the model
y_pred = pipeline.predict(X_test)


In [116]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.54      0.41      0.46     11453
           1       0.77      0.85      0.81     26866

    accuracy                           0.72     38319
   macro avg       0.66      0.63      0.64     38319
weighted avg       0.70      0.72      0.71     38319



## Analysis

    - The model has high precision (0.90) and recall (0.94) for class 1 (ending at a dock), indicating it performs well in predicting rides that end at a dock.
    - The model struggles with class 0 (not ending at a dock), with lower precision (0.54) and recall (0.41). This suggests the model often misclassifies rides that end outside a dock or misses them altogether.
    - The imbalance in support (11,534 vs. 64,481) indicates class imbalance, which could be affecting model performance.

In [117]:
# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the model pipeline
pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [118]:
# Train the model on the resampled data
pipeline.fit(X_train, y_train)

In [119]:
# Evaluate the model on the test data
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.53      0.44      0.48     11453
           1       0.78      0.83      0.80     26866

    accuracy                           0.72     38319
   macro avg       0.65      0.64      0.64     38319
weighted avg       0.70      0.72      0.71     38319



In [136]:
def train_and_log_model(model_name, model, X_train, y_train, X_test, y_test, is_regression=False):
    with mlflow.start_run(run_name=model_name):
        # Define preprocessing pipeline
        numerical_features = ['temperature', 'humidity', 'wind_speed']
        categorical_features = ['rideable_type', 'started_at_hour', 'start_station_id', 'time_of_day', 'member_casual', 
                                'day_of_week', 'day_type', 'falls_on_holiday', 'holiday_weekend', 'DC_HPN_NAME']

        numerical_transformer = StandardScaler()
        categorical_transformer = OneHotEncoder(handle_unknown='ignore')
        polynomial_transformer = PolynomialFeatures(degree=2, include_bias=False)

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, numerical_features),
                ('cat', categorical_transformer, categorical_features)
            ])

        pipeline = ImbPipeline(steps=[
            ('preprocessor', preprocessor),
            ('smote', SMOTE(random_state=42)),
            #('polynomial', polynomial_transformer),
            ('classifier', model)
        ])

        # Train the model
        pipeline.fit(X_train, y_train)
        # Predict and evaluate
        y_pred = pipeline.predict(X_test)

        if is_regression:
            mse = mean_squared_error(y_test, y_pred)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)

            # Log metrics
            mlflow.log_metric("mse", mse)
            mlflow.log_metric("mae", mae)
            mlflow.log_metric("r2", r2)

            print(f"Model: {model_name}")
            print(f"Mean Squared Error: {mse}")
            print(f"Mean Absolute Error: {mae}")
            print(f"R2 Score: {r2}")

        else:
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)

            # Log metrics
            mlflow.log_metric("accuracy", accuracy)
            mlflow.log_metric("precision", precision)
            mlflow.log_metric("recall", recall)
            mlflow.log_metric("f1_score", f1)

            print(f"Model: {model_name}")
            print(f"Accuracy: {accuracy}")
            print(f"Precision: {precision}")
            print(f"Recall: {recall}")
            print(f"F1 Score: {f1}")
            print(classification_report(y_test, y_pred))


In [138]:
random_state=42
# LogisticRegression Model
logit_model = LogisticRegression()
train_and_log_model("LogisticRegression", logit_model, X_train, y_train, X_test, y_test)

Model: LogisticRegression
Accuracy: 0.7129883347686526
Precision: 0.8213707064160726
Recall: 0.7547829970967022
F1 Score: 0.7866702874655701
              precision    recall  f1-score   support

           0       0.52      0.61      0.56     11453
           1       0.82      0.75      0.79     26866

    accuracy                           0.71     38319
   macro avg       0.67      0.68      0.67     38319
weighted avg       0.73      0.71      0.72     38319



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [139]:
# Train and log Linear Regression, Ridge, and Lasso
lr_model = LinearRegression()
train_and_log_model("LinearRegression", lr_model, X_train, y_train, X_test, y_test, is_regression=True)

Model: LinearRegression
Mean Squared Error: 0.20296838238028225
Mean Absolute Error: 0.4000631002102704
R2 Score: 0.031422328372912256


In [123]:
ridge_model = Ridge()
train_and_log_model("RidgeRegression", ridge_model, X_train, y_train, X_test, y_test, is_regression=True)

Model: RidgeRegression
Mean Squared Error: 0.18244513563221934
Mean Absolute Error: 0.34528072038467994
R2 Score: 0.12936053094587652


In [124]:
lasso_model = Lasso()
train_and_log_model("LassoRegression", lasso_model, X_train, y_train, X_test, y_test, is_regression=True)

Model: LassoRegression
Mean Squared Error: 0.20955655689055377
Mean Absolute Error: 0.4183502798289483
R2 Score: -1.6847781509454762e-05


In [125]:
# Train and log Linear Regression, Ridge, and Lasso
lr_model = LinearRegression()
train_and_log_model("LinearRegression_with_Polynomial", lr_model, X_train, y_train, X_test, y_test, is_regression=True)

Model: LinearRegression_with_Polynomial
Mean Squared Error: 0.18815670973158494
Mean Absolute Error: 0.34790502251452626
R2 Score: 0.10210454615843245


In [126]:
ridge_model = Ridge()
train_and_log_model("RidgeRegression_with_Polynomial", ridge_model, X_train, y_train, X_test, y_test, is_regression=True)

Model: RidgeRegression_with_Polynomial
Mean Squared Error: 0.18244513563221934
Mean Absolute Error: 0.34528072038467994
R2 Score: 0.12936053094587652


In [127]:
lasso_model = Lasso()
train_and_log_model("LassoRegression_with_Polynomial", lasso_model, X_train, y_train, X_test, y_test, is_regression=True)

Model: LassoRegression_with_Polynomial
Mean Squared Error: 0.20955655689055377
Mean Absolute Error: 0.4183502798289483
R2 Score: -1.6847781509454762e-05


In [128]:
# Train and log Gradient Boosting Regressor
gbr_model = GradientBoostingRegressor()
train_and_log_model("GradientBoostingRegressor", gbr_model, X_train, y_train, X_test, y_test, is_regression=True)

Model: GradientBoostingRegressor
Mean Squared Error: 0.1774809522767321
Mean Absolute Error: 0.35861337802724136
R2 Score: 0.1530499208873065


In [129]:
# Train and log RandomForest with class weight adjustment
rf_model = RandomForestClassifier(class_weight='balanced')
train_and_log_model("RandomForest_with_Class_Weight", rf_model, X_train, y_train, X_test, y_test)


Model: RandomForest_with_Class_Weight
Accuracy: 0.7104830501839818
Precision: 0.7839755131436802
Recall: 0.8103550956599419
F1 Score: 0.7969470678673403
              precision    recall  f1-score   support

           0       0.52      0.48      0.50     11453
           1       0.78      0.81      0.80     26866

    accuracy                           0.71     38319
   macro avg       0.65      0.64      0.65     38319
weighted avg       0.70      0.71      0.71     38319



In [131]:
import pickle

In [140]:
# Save the various models and pieplines
with open('models/rf_pipeline.pkl', 'wb') as file:
    pickle.dump((pipeline,rf_model), file)

with open('models/logit_pipeline.pkl', 'wb') as file:
    pickle.dump((pipeline,logit_model), file)

with open('models/lasso_pipeline.pkl', 'wb') as file:
    pickle.dump((pipeline,lasso_model), file)
    
with open('models/ridge_pipeline.pkl', 'wb') as file:
    pickle.dump((pipeline,ridge_model), file)

with open('models/gbr_pipeline.pkl', 'wb') as file:
    pickle.dump((pipeline,gbr_model), file)