In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from pandas import read_parquet
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pickle

## Step 1: get the prediction result for each zone and save the results into a dataframe

In [3]:
# Taxi zone IDs to predict for; the prediction loop loads one pickled model
# file ('model-<id>.pkl') per ID in this list.
unique_values = [
    4, 12, 13, 24, 41, 42, 43, 45, 48, 50, 68,
    74, 75, 79, 87, 88, 90, 100, 107, 113, 114, 116,
    120, 125, 127, 128, 137, 140, 141, 142, 143, 144, 148,
    151, 152, 153, 158, 161, 162, 163, 164, 166, 170, 186,
    194, 202, 209, 211, 224, 229, 230, 231, 232, 233, 234,
    236, 237, 238, 239, 243, 244, 246, 249, 261, 262, 263,
]

In [4]:
# One record (dict) per zone is collected here and turned into a DataFrame
# in a single step after the loop.
predictions = []

# Model input: a single sample row with placeholder values.
X_input = pd.DataFrame(
    np.array([[-11.2, 1, 2]]),
    columns=['temp', 'month', 'time_of_week'],
)

# Every zone has its own model, stored as 'model-<zone_id>.pkl'.
for zone_id in unique_values:
    model_path = f'model-{zone_id}.pkl'
    # NOTE: unpickling can execute arbitrary code; only load model files
    # that come from a trusted source.
    with open(model_path, 'rb') as model_file:
        model = pickle.load(model_file)

    # X_input holds exactly one row, so element 0 is the only prediction.
    prediction = model.predict(X_input)[0]

    # Store the zone ID together with its predicted calm rate.
    predictions.append({'Taxi_Zone_ID': zone_id, 'calm rate': prediction})

# Convert the list of per-zone records into a DataFrame.
df_predictions = pd.DataFrame(predictions)

In [5]:
# Preview the first five per-zone predictions (head() defaults to 5 rows).
df_predictions.head()

Unnamed: 0,Taxi_Zone_ID,calm rate
0,4,0.401397
1,12,0.445275
2,13,0.265881
3,24,0.228572
4,41,0.08886


## Step 2: merge the results with street data

In [6]:
# Street table that is merged with the predictions below; the merge step
# reads its 'Taxi_Zone_ID' and 'highway_factor' columns.
df_street = pd.read_csv('new_street_busyness.csv')

In [7]:
def get_streets(df, street_zones):
    """
    Join zone-level calm-rate predictions onto streets and derive a per-street rate.

    Input:
        df = prediction results, with columns [Taxi_Zone_ID] and [calm rate]
        street_zones = street table, with columns [Taxi_Zone_ID] and [highway_factor]

    Output: one row per row of street_zones (right join on [Taxi_Zone_ID]),
        carrying the columns of both tables plus a new column
        [street_calm_rate] = 1 - (1 - [calm rate]) * [highway_factor].
        Streets whose zone has no row in df get NaN for both rate columns.
    """
    # Right join: the street table decides which rows appear in the result.
    streets_with_rate = pd.merge(left=df, right=street_zones, on='Taxi_Zone_ID', how='right')

    # Scale the zone's busyness (1 - calm rate) by the highway factor,
    # then convert it back into a calm rate.
    zone_busyness = 1 - streets_with_rate['calm rate']
    street_busyness = zone_busyness * streets_with_rate['highway_factor']
    streets_with_rate['street_calm_rate'] = 1 - street_busyness

    return streets_with_rate

# Attach the per-zone predictions to every street of the street table.
new_result = get_streets(df=df_predictions, street_zones=df_street)

In [9]:
# Remove the 'Unnamed: 0' column (presumably a row index that was saved into
# the CSV; confirm against the file).
# errors='ignore' keeps this cell re-runnable: new_result is reassigned here,
# so a second run would otherwise raise KeyError because the column is
# already gone.
new_result = new_result.drop(columns='Unnamed: 0', errors='ignore')

In [10]:
# Preview the first five merged street rows (head() defaults to 5 rows).
new_result.head()

Unnamed: 0,Taxi_Zone_ID,calm rate,osmid,highway,geometry,highway_factor,street_calm_rate
0,43.0,0.129196,147089927,path,"LINESTRING (-73.963576 40.79072, -73.963526 40...",0.6,0.477517
1,43.0,0.129196,46613689,bridleway,"LINESTRING (-73.963576 40.79072, -73.963602 40...",0.4,0.651678
2,43.0,0.129196,301283869,footway,"LINESTRING (-73.97436 40.774776, -73.974179 40...",0.6,0.477517
3,43.0,0.129196,1081028223,footway,"LINESTRING (-73.97436 40.774776, -73.974312 40...",0.6,0.477517
4,43.0,0.129196,426425334,footway,"LINESTRING (-73.97436 40.774776, -73.974413 40...",0.6,0.477517


Description

Taxi_Zone_ID: identifier of each taxi zone

calm rate: calm rate of each taxi zone, can be dropped if useless in visualising on map

osmid: stands for "open street map id", the identifier of each highway; can be dropped if useless

geometry: geolocation of each path

highway_factor: factor used for calculating street_calm_rate; can be dropped if useless

street_calm_rate: calm rate of each road, computed as 1 - (1 - calm rate) * highway_factor