In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
import utils.extract_df as extract_df
import utils.transform as transform
import utils.clustering as clustering
import utils.feature_engineering as feature_engineering
import utils.model as model

In [None]:
    # End-to-end data preparation: load inputs, clean, enrich, then cluster.
    # The result is bound to `df`, which the following cells inspect and model.

    # Load the taxi trips from CSV through the project helper
    # (presumably returns a pandas DataFrame -- confirm in utils.extract_df).
    filepathcsv = "data/taxi_data.csv"
    df = extract_df.readcsv(filepathcsv)

    # Load the NYC boundaries shapefile through the project helper
    # (presumably a GeoDataFrame -- confirm in utils.extract_df).
    filepathshp ="data/nyc-boundaries/geo_export_9ca5396d-336c-47af-9742-ab30cd995e41.shp"
    nyc = extract_df.readshp(filepathshp)

    # Transformation / data cleaning: the transformer receives both the trips
    # and the boundaries; see utils.transform for the exact rules applied.
    transformer = transform.dataTransformation(df,nyc)
    transformedDf = transformer.transform()

    # Feature engineering: combine the transformed trips with the weather file
    # via feature_engineering.add_temperature.
    filepathtemp = "data/NYC_Weather_2014_2020.csv"
    temperature_df = extract_df.readcsv(filepathtemp)
    merged_df = feature_engineering.add_temperature(transformedDf, temperature_df)

    # Clustering: NOTE `df` is rebound here -- from this point on it no longer
    # refers to the frame read from taxi_data.csv but to the output of
    # clusterCreated().
    cluster = clustering.pickUpCluster(merged_df)
    df = cluster.clusterCreated()
   

In [None]:
# Preview the first rows of `df` (the frame returned by clusterCreated() above).
df.head()

In [None]:
# Summarise `df`: column names, dtypes and non-null counts.
df.info()

In [None]:
# Persist the clustered frame as a pickle file.
# NOTE(review): this target lives under '../model/clean_data/' (parent of the
# working directory) while every other path in this notebook is under 'data/'
# -- confirm the location is intended and that the directory exists.
df.to_pickle('../model/clean_data/clustered_data.pkl')

In [None]:
# Build the modelling frame by removing the two datetime-like columns.
df_modelling = df.drop(columns=['pickup_datetime', 'date'])

# Model file paths: one per target variable.
model_RF_path = "data/models/RF_model.pkl"
model_RF_log_path = "data/models/RF_model_log.pkl"

# One Model wrapper per target: the raw fare and the `fare_amount_log` column.
# NOTE(review): both target columns are present in `df_modelling`; presumably
# Model.prepare_data() keeps the other target out of the features -- confirm.
RF = model.Model(model_RF_path, df_modelling, 'fare_amount')
RF_log = model.Model(model_RF_log_path, df_modelling, 'fare_amount_log')

# Load, prepare and fit each model in turn (RF first, then RF_log).
for rf_wrapper in (RF, RF_log):
    rf_wrapper.load_model()
    rf_wrapper.prepare_data()
    rf_wrapper.fit_model()

# Predictions on each model's own test split.
# TODO just for prints
for rf_wrapper in (RF, RF_log):
    print(rf_wrapper.predict(rf_wrapper.X_test))

# Train scores.
for rf_wrapper in (RF, RF_log):
    print(rf_wrapper.train_score())

# Test scores.
for rf_wrapper in (RF, RF_log):
    print(rf_wrapper.test_score())

In [None]:
import shap

# Build the modelling frame by removing the two datetime-like columns, then
# one-hot encode passenger_big_group and pickup_cluster.
df_modelling = df.drop(columns=['pickup_datetime', 'date'])
df_modelling = pd.get_dummies(df_modelling, columns=['passenger_big_group', 'pickup_cluster'])

# Model file path.
model_XGB_path = "data/models/XGB_model.pkl"

# Define, load, prepare and fit the model on the 'fare_amount' target.
XGB = model.Model(model_XGB_path, df_modelling, 'fare_amount')
XGB.load_model()
XGB.prepare_data()
# NOTE: `XGB` is rebound here from the Model wrapper to whatever fit_model()
# returns (presumably the fitted estimator -- confirm in utils.model); that
# returned object is what the SHAP explainer receives.
XGB = XGB.fit_model()
explainer = shap.Explainer(XGB)

# Feature frame for the explainer: both target columns removed. Built once and
# reused for the column print-out and the SHAP call, so the frame is not
# copied twice.
shap_features = df_modelling.drop(columns=['fare_amount', 'fare_amount_log'])
print(shap_features.columns)
shap_values = explainer(shap_features)

# Load SHAP's JavaScript before drawing, otherwise the interactive force plot
# is not rendered in the notebook output.
shap.plots.initjs()
# Visualize the first prediction's explanation with a force plot.
shap.plots.force(shap_values[0])

In [None]:
import pickle

# Serialise the SHAP values computed above so later cells can reload them
# without recomputing.
with open(r"data/models/shap_values_XGB.pkl", "wb") as pkl_sink:
    pickle.dump(shap_values, pkl_sink)


In [None]:
import pickle

# Reload the stored SHAP values into `shap_values_pkl`.
# NOTE: pickle.load executes arbitrary code from the file -- only open pickle
# files produced by this project.
with open(r"data/models/shap_values_XGB.pkl", "rb") as pkl_source:
    shap_values_pkl = pickle.load(pkl_source)

In [None]:
import shap
# Load SHAP's JavaScript into the notebook so the interactive force plot can render.
shap.plots.initjs()
# Force plot for the first explanation in the reloaded SHAP values.
shap.plots.force(shap_values_pkl[0])

In [None]:
# Waterfall plot for the same first explanation (relies on `shap` and
# `shap_values_pkl` from the previous cells).
shap.plots.waterfall(shap_values_pkl[0])

In [None]:
# Hyper-parameter grid.
#
# Earlier, broader grid kept for reference:
#     n_estimators:      [100, 200]
#     max_features:      [1.0, 'sqrt', 'log2']
#     max_depth:         [4, 8]
#     min_samples_split: [2]
#
# Current grid (max_features and min_samples_split were tuned by Irena;
# n_estimators fixed at 1000).
param_grid = dict(
    n_estimators=[1000],
    max_features=[1.0, 'sqrt'],
    max_depth=[8],
    min_samples_split=[2, 4, 8],
)

## Run the application

In [None]:
# Launch the Streamlit app from 1_Homepage.py through an IPython shell escape.
# NOTE(review): `streamlit run` starts a server process, so this cell presumably
# stays busy until the kernel is interrupted -- confirm this is the intended way
# to start the app from the notebook.
! streamlit run 1_Homepage.py