In [1]:
import utils.extract_df as extract_df
import utils.transform as transform 
import utils.clustering as clustering 
import utils.main as main
import streamlit as st
import pandas as pd
import utils.model as model



In [2]:
# Load data/taxi_data.csv through the project's readcsv helper.
df = extract_df.readcsv('data/taxi_data.csv')

In [3]:
# Load the NYC boundaries shapefile through the project's readshp helper.
nyc = extract_df.readshp('data/nyc-boundaries/geo_export_9ca5396d-336c-47af-9742-ab30cd995e41.shp')

In [4]:
# Build the transformer from the `df` and `nyc` objects loaded in the cells
# above instead of re-reading both files from disk a second time.
# The transformer gets its own name so that `df_transform` only ever refers to
# the transformed DataFrame; the previous `df_transform = df_transform.transform()`
# rebinding made the cell fail when re-run (it would call DataFrame.transform()
# without a function).
# NOTE(review): assumes dataTransformation does not modify `df` / `nyc` in
# place -- confirm against utils/transform.py.
transformer = transform.dataTransformation(df=df, nyc=nyc)
df_transform = transformer.transform()

In [6]:
# Earliest and latest pickup timestamps in the transformed data.
pickup_times = df_transform["pickup_datetime"]
first_pickup = pd.to_datetime(pickup_times.min())
last_pickup = pd.to_datetime(pickup_times.max())
(first_pickup, last_pickup)

(Timestamp('2014-01-01 00:24:00+0000', tz='UTC'),
 Timestamp('2015-06-30 23:57:04+0000', tz='UTC'))

In [7]:
# List the column labels of the transformed frame.
df_transform.columns

Index(['dropoff_latitude', 'dropoff_longitude', 'fare_amount', 'feat01',
       'feat02', 'feat03', 'feat04', 'feat05', 'feat06', 'feat07', 'feat08',
       'feat09', 'feat10', 'passenger_count', 'pickup_datetime',
       'pickup_latitude', 'pickup_longitude', 'passenger_big_group',
       'fare_amount_log', 'year', 'month', 'day', 'hour', 'trip_distance'],
      dtype='object')

In [8]:
# Summarise the transformed frame: row count, per-column non-null counts and dtypes.
df_transform.info()

<class 'pandas.core.frame.DataFrame'>
Index: 88175 entries, 0 to 89999
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype              
---  ------               --------------  -----              
 0   dropoff_latitude     88175 non-null  float64            
 1   dropoff_longitude    88175 non-null  float64            
 2   fare_amount          88175 non-null  float64            
 3   feat01               88175 non-null  float64            
 4   feat02               88175 non-null  float64            
 5   feat03               88175 non-null  float64            
 6   feat04               88175 non-null  float64            
 7   feat05               88175 non-null  float64            
 8   feat06               88175 non-null  float64            
 9   feat07               88175 non-null  float64            
 10  feat08               88175 non-null  float64            
 11  feat09               88175 non-null  float64            
 12  feat10               88

## Clustering step 

In [9]:
# Wrap the transformed frame in the project's pickup clusterer, then ask it
# for the clustered result (the output below gains a `pickup_cluster` column).
pickup_clusterer = clustering.pickUpCluster(df_transform)
df_clustered = pickup_clusterer.clusterCreated()

In [10]:
# Preview the first rows of the clustered frame, including the pickup_cluster column.
df_clustered.head()

Unnamed: 0,dropoff_latitude,dropoff_longitude,fare_amount,feat01,feat02,feat03,feat04,feat05,feat06,feat07,...,pickup_latitude,pickup_longitude,passenger_big_group,fare_amount_log,year,month,day,hour,trip_distance,pickup_cluster
0,40.76855,-73.862065,52.713,0.607633,0.680994,0.869333,0.359081,0.283538,0.898003,0.481185,...,40.75957,-73.985715,1,3.964862,2014,1,8,6,10.461512,0
1,40.746906,-73.990494,19.35,0.353808,0.555256,0.946294,0.53053,0.453938,0.70857,0.161038,...,40.759457,-73.972038,0,2.962692,2015,2,16,20,2.089068,0
2,40.697496,-73.984946,24.85,0.248761,0.271752,0.418165,0.368993,0.362234,0.257532,0.710595,...,40.70586,-74.013626,0,3.212858,2014,3,18,13,2.590397,5
3,40.767617,-73.959482,16.6,0.606718,0.809065,0.826723,0.228102,0.819767,0.859372,0.014095,...,40.77983,-73.955313,0,2.809403,2014,3,20,18,1.402666,2
5,40.724657,-73.994457,14.95,0.386871,0.657538,0.861953,0.155679,0.928781,0.935444,0.381414,...,40.733822,-73.991025,0,2.704711,2014,4,10,22,1.05934,4


In [11]:
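# Optional: persist the clustered frame to a pickle (left disabled).
# NOTE(review): '../model/' points at the parent directory, unlike the 'data/...'
# paths used elsewhere in this notebook -- confirm the location before enabling.
# df_clustered.to_pickle('../model/clustered_data.pkl')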

In [12]:
# Drop the datetime column before modelling. drop() returns a new frame by
# default, so df_clustered itself is left untouched.
# NOTE(review): df_modelling still holds both `fare_amount` and
# `fare_amount_log`; confirm model.Model excludes the other target from the
# features, otherwise one target leaks into the other model.
df_modelling = df_clustered.drop(columns=['pickup_datetime'])

# Model file paths: one pickle per target so the two models do not share a file.
# NOTE(review): the log-target path used to be a copy of the RF path
# ("data/models/RF_model.pkl"); confirm the filename below matches the
# artifact on disk.
model_RF_path = "data/models/RF_model.pkl"
model_RF_log_path = "data/models/RF_log_model.pkl"

# Define one model per target column.
RF = model.Model(model_RF_path, df_modelling, 'fare_amount')
RF_log = model.Model(model_RF_log_path, df_modelling, 'fare_amount_log')
fare_models = (RF, RF_log)

# Run the same load -> prepare -> fit sequence on each model, RF first.
for fare_model in fare_models:
    fare_model.load_model()
    fare_model.prepare_data()
    fare_model.fit_model()

# Predictions on each model's own X_test.
# TODO just for prints
for fare_model in fare_models:
    print(fare_model.predict(fare_model.X_test))

# Train score (RF, then RF_log).
for fare_model in fare_models:
    print(fare_model.train_score())

# Test score (RF, then RF_log).
for fare_model in fare_models:
    print(fare_model.test_score())

[20.65610312 24.44235232 21.87340391 ... 20.43281534 20.72414598
 20.43423326]
[3.0128286  3.18655467 3.08723279 ... 2.95702168 3.02323639 2.95653851]
6.484346368899824
0.2046738330124263
7.404303434315135
0.2074535791643013


## Run the application

In [23]:
# Launch the Streamlit app defined in 1_Homepage.py via a shell escape.
# The command runs in the foreground, so this cell keeps executing until it is
# interrupted (the recorded output below ends with ^C / "Stopping...").
! streamlit run 1_Homepage.py

[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://192.168.1.13:8501[0m
[0m
[34m[1m  For better performance, install the Watchdog module:[0m

  $ xcode-select --install
  $ pip install watchdog
            [0m
2024-06-02 22:49:14.270 `st.cache` is deprecated. Please use one of Streamlit's new caching commands,
`st.cache_data` or `st.cache_resource`.

More information [in our docs](https://docs.streamlit.io/library/advanced-features/caching).
^C
[34m  Stopping...[0m
