In [None]:
from IPython.display import display, HTML
import sys

# Stretch the notebook's ".container" element to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# NOTE(review): machine-specific absolute path (Homebrew, Python 3.11); it does
# nothing useful on other machines. Prefer running the kernel from an
# environment that already has the dependencies installed.
HOMEBREW_SITE_PACKAGES = '/opt/homebrew/lib/python3.11/site-packages'
# Guarded so that re-running this cell does not keep appending duplicates.
if HOMEBREW_SITE_PACKAGES not in sys.path:
    sys.path.append(HOMEBREW_SITE_PACKAGES)

In [None]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [None]:
# Monthly "green_tripdata" parquet file names, newest first:
# 2023-02 and 2023-01, then 2022-12 back to 2022-01 (14 files in total).
dataset_list = [
    f"green_tripdata_{year}-{month:02d}.parquet"
    for year, last_month in ((2023, 2), (2022, 12))
    for month in range(last_month, 0, -1)
]

In [None]:
# Download every monthly parquet file listed in dataset_list (network access
# required); the resulting frames keep the same order as dataset_list.
TRIP_DATA_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data"
df_list = [pd.read_parquet(f"{TRIP_DATA_URL}/{name}") for name in dataset_list]

In [None]:
# Stack the monthly frames into one. ignore_index=True gives the result a fresh
# 0..n-1 index; without it every monthly frame contributes its own row labels,
# so the combined frame would carry duplicate index values.
df_full = pd.concat(df_list, ignore_index=True)

In [None]:
# Number of rows across all downloaded months.
df_full.shape[0]

In [None]:
def filter_data(df):
    """Return the rows of ``df`` whose trip_type, payment_type and RatecodeID all equal 1.

    Args:
        df: DataFrame with "trip_type", "payment_type" and "RatecodeID" columns.

    Returns:
        A new DataFrame with the matching rows, in their original order and
        with their original index labels. Rows where any of the three columns
        is missing are dropped, because a missing value never compares equal
        to 1.
    """
    # NOTE(review): the meaning of code 1 in each column is not visible here;
    # confirm against the dataset's data dictionary.
    keep = (
        (df["trip_type"] == 1)
        & (df["payment_type"] == 1)
        & (df["RatecodeID"] == 1)
    )
    return df[keep]

In [None]:
# Apply the trip_type / payment_type / RatecodeID filter to the combined frame.
df_filtered = df_full.pipe(filter_data)

In [None]:
# Number of rows that survived filter_data.
df_filtered.shape[0]

In [None]:
# Columns needed for feature engineering and as prediction targets.
SELECTED_COLUMNS = [
    "lpep_pickup_datetime",
    "lpep_dropoff_datetime",
    "PULocationID",
    "DOLocationID",
    "trip_distance",
    "fare_amount",
    "total_amount",
]

# .copy() makes df_selected an independent frame: later cells add columns to
# it, and assigning into a selection of df_filtered would otherwise raise
# pandas' SettingWithCopyWarning.
df_selected = df_filtered[SELECTED_COLUMNS].copy()

In [None]:
# Distribution of trip distances before any outlier filtering.
sns.histplot(x=df_selected["trip_distance"], label="trip_distance")

In [None]:
# Distribution of fare amounts before any outlier filtering.
sns.histplot(x=df_selected["fare_amount"], label="fare_amount")

In [None]:
# Trip duration in minutes. Dividing the timedelta column by a one-minute
# Timedelta is vectorized, unlike a per-row .apply(lambda ...), and gives the
# same value as total_seconds() / 60 for whole-second durations.
# Assumes both datetime columns have a datetime64 dtype - TODO confirm.
df_selected["travel_time"] = (
    (df_selected["lpep_dropoff_datetime"] - df_selected["lpep_pickup_datetime"])
    / pd.Timedelta(minutes=1)
)

In [None]:
# Convert both location-ID columns to strings so they can be concatenated
# into the "PU_DO" key.
for zone_col in ("PULocationID", "DOLocationID"):
    df_selected[zone_col] = df_selected[zone_col].apply(str)

In [None]:
# Route key "<pickup id>_<dropoff id>"; both ID columns must hold strings here.
pickup_zone = df_selected["PULocationID"]
dropoff_zone = df_selected["DOLocationID"]
df_selected["PU_DO"] = pickup_zone + "_" + dropoff_zone

In [None]:
# Keep trips whose duration is strictly between 1 and 600 minutes.
duration_ok = df_selected["travel_time"].gt(1) & df_selected["travel_time"].lt(600)
df_selected = df_selected.loc[duration_ok]

In [None]:
# Drop trips with a zero or negative fare.
df_selected = df_selected.loc[df_selected["fare_amount"].gt(0)]

In [None]:
# Drop trips with a distance of 100 or more. This is the last row filter
# before new columns are assigned, so .copy() turns the result into an
# independent frame and those assignments do not raise SettingWithCopyWarning.
df_selected = df_selected[df_selected["trip_distance"] < 100].copy()

In [None]:
# Calendar features derived from the pickup timestamp. The vectorized .dt
# accessor replaces six per-row .apply(lambda ...) passes and yields the same
# values; the integer width of the result may differ from the .apply version.
# Assumes "lpep_pickup_datetime" has a datetime64 dtype - TODO confirm.
pickup_dt = df_selected["lpep_pickup_datetime"].dt
df_selected["pickup_year"] = pickup_dt.year
df_selected["pickup_month"] = pickup_dt.month
df_selected["pickup_day"] = pickup_dt.day
df_selected["pickup_hour"] = pickup_dt.hour
df_selected["pickup_minute"] = pickup_dt.minute
df_selected["pickup_dow"] = pickup_dt.day_of_week

In [None]:
# Convert the location IDs back to integers so they can be used as numeric
# model features (the string form was only needed to build "PU_DO").
for zone_col in ("PULocationID", "DOLocationID"):
    df_selected[zone_col] = df_selected[zone_col].apply(int)

In [None]:
# Number of distinct pickup/dropoff pairs (dropna=False matches len(unique())).
df_selected["PU_DO"].nunique(dropna=False)

In [None]:
# Training set: only trips picked up in 2022.
picked_up_in_2022 = df_selected["pickup_year"].eq(2022)
df_train = df_selected.loc[picked_up_in_2022]

In [None]:
# Frequency rank of every pickup/dropoff pair in the training data.
# Resulting columns: "PU_DO_num" (rank), "PU_DO" (pair key), "count".
PU_DO_map = (
    df_train[["PU_DO", "fare_amount"]]
    .groupby("PU_DO")
    .count()                                   # non-null fares per pair
    .reset_index()
    .rename(columns={"fare_amount": "count"})
    .sort_values(by="count", ascending=False)  # most frequent pair first
    .reset_index(drop=True)                    # positions 0..n-1 in sorted order
    .rename_axis("PU_DO_num")
    .reset_index()                             # positions become the first column
)

# Ranks start at 1; every rank above 10000 is collapsed to 10000.
PU_DO_map["PU_DO_num"] = (PU_DO_map["PU_DO_num"] + 1).clip(upper=10000)

In [None]:
# Plain dict lookup: "PU_DO" key -> "PU_DO_num" rank.
PU_DO_map_dict = PU_DO_map.set_index("PU_DO")["PU_DO_num"].to_dict()

In [None]:
# Persist the pair -> rank lookup. The output directory is created first so
# the cell also works on a fresh checkout where 'data/' does not exist yet.
os.makedirs('data', exist_ok=True)
with open('data/PU_DO_map.pkl', 'wb') as f:
    pickle.dump(PU_DO_map_dict, f)

In [None]:
# Attach each trip's pair rank ("PU_DO_num") and pair frequency ("count").
# Columns left over from a previous run of this cell are dropped first, so
# re-running it does not create "PU_DO_num_x"/"PU_DO_num_y" duplicates that
# would break the feature selection further down.
df_train = (
    df_train
    .drop(columns=["PU_DO_num", "count"], errors="ignore")
    .merge(PU_DO_map, how="inner", on=["PU_DO"])
)

In [None]:
# Model inputs, shared by all three regressors.
feature_columns = [
    "PULocationID",
    "DOLocationID",
    "trip_distance",
    "pickup_month",
    "pickup_day",
    "pickup_hour",
    "pickup_minute",
    "pickup_dow",
    "PU_DO_num",
]
X_train = df_train[feature_columns]

# One target series per model.
Y_train_fare, Y_train_total, Y_train_time = (
    df_train[target] for target in ("fare_amount", "total_amount", "travel_time")
)

In [None]:
# Fare model: a 10-tree random forest with a fixed seed.
regressor_fare = RandomForestRegressor(random_state=0, n_estimators=10)

# Fit on the training features with fare_amount as the target.
regressor_fare.fit(X=X_train, y=Y_train_fare)

In [None]:
# Travel-time model: a 10-tree random forest with a fixed seed.
regressor_time = RandomForestRegressor(random_state=0, n_estimators=10)

# Fit on the training features with travel_time as the target.
regressor_time.fit(X=X_train, y=Y_train_time)

In [None]:
# Total-amount model: a 10-tree random forest with a fixed seed.
regressor_total = RandomForestRegressor(random_state=0, n_estimators=10)

# Fit on the training features with total_amount as the target.
regressor_total.fit(X=X_train, y=Y_train_total)

In [None]:
# Persist the fare model. The output directory is created first so the cell
# also works when 'data/' does not exist yet.
os.makedirs('data', exist_ok=True)
with open('data/model_fare_amount.pkl', 'wb') as f:
    pickle.dump(regressor_fare, f)

In [None]:
# Persist the travel-time model. The output directory is created first so the
# cell also works when 'data/' does not exist yet.
os.makedirs('data', exist_ok=True)
with open('data/model_travel_time.pkl', 'wb') as f:
    pickle.dump(regressor_time, f)

In [None]:
# Persist the total-amount model. The output directory is created first so the
# cell also works when 'data/' does not exist yet.
os.makedirs('data', exist_ok=True)
with open('data/model_total_amount.pkl', 'wb') as f:
    pickle.dump(regressor_total, f)

In [None]:
def calculate_evaluation_metrics(model, dataset, features, col_predict):
    """Score ``model`` on ``dataset`` and print/return R2, MAE and MSE.

    Args:
        model: Fitted estimator exposing ``predict``.
        dataset: DataFrame containing the feature columns and the target column.
        features: Column labels passed to ``model.predict``.
        col_predict: Name of the column holding the true target values.

    Returns:
        Tuple ``(r2, mae, mse)``.
    """
    y_true = dataset[col_predict]
    # Predict once and reuse the result for all three metrics.
    y_pred = model.predict(dataset[features])

    # scikit-learn metrics take (y_true, y_pred). The order matters for R2,
    # which is normalised by the variance of the first argument; MAE and MSE
    # are symmetric, so only the R2 value changes compared with the swapped call.
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)

    print(f"r2: {r2}, mae: {mae}, mse: {mse}")

    return r2, mae, mse

In [None]:
# NOTE: df_train is the data the model was fitted on, so these are in-sample
# metrics, not an estimate of performance on unseen trips.
calculate_evaluation_metrics(
    model=regressor_fare,
    dataset=df_train,
    features=list(X_train.columns),  # same nine feature columns used for fitting
    col_predict="fare_amount",
)

In [None]:
# NOTE: df_train is the data the model was fitted on, so these are in-sample
# metrics, not an estimate of performance on unseen trips.
calculate_evaluation_metrics(
    model=regressor_time,
    dataset=df_train,
    features=list(X_train.columns),  # same nine feature columns used for fitting
    col_predict="travel_time",
)

In [None]:
# NOTE: df_train is the data the model was fitted on, so these are in-sample
# metrics, not an estimate of performance on unseen trips.
calculate_evaluation_metrics(
    model=regressor_total,
    dataset=df_train,
    features=list(X_train.columns),  # same nine feature columns used for fitting
    col_predict="total_amount",
)

In [None]:
def plot_feature_importance(importance, names, model_type):
    """Draw a bar chart of feature importances, most important first.

    Args:
        importance: Sequence of importance values, one per feature.
        names: Sequence of feature names, aligned with ``importance``.
        model_type: Text placed in front of "FEATURE IMPORTANCE" in the title.

    Returns:
        None. The chart is drawn on a new 10x8 matplotlib figure.
    """
    # Pair every feature name with its importance, sorted in decreasing order.
    # Chained sort instead of inplace=True, so no frame is mutated after creation.
    fi_df = (
        pd.DataFrame({
            'feature_names': np.array(names),
            'feature_importance': np.array(importance),
        })
        .sort_values(by=['feature_importance'], ascending=False)
    )

    # Define size of bar plot
    plt.figure(figsize=(10,8))
    # Plot Seaborn bar chart: importance on the x axis, feature name on the y axis
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])
    # Add chart labels. The space after model_type was missing before, which
    # rendered titles such as "RANDOM FORESTFEATURE IMPORTANCE".
    plt.title(f'{model_type} FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')
    
# Feature importances of the travel-time model.
plot_feature_importance(
    importance=regressor_time.feature_importances_,
    names=X_train.columns,
    model_type='RANDOM FOREST',
)