# Test Model

In [2]:
import pandas as pd
from pandas import read_parquet

import geopandas as gpd
from shapely import wkt

from shapely.geometry import Point

import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor

import ast

In [35]:
df = pd.read_parquet("data_for_model")

In [36]:
df.head()

Unnamed: 0,datetime,station id,bike_busyness,Taxi_Zone_ID,taxi_busyness,SegmentID,volume_busyness
0,2018-10-14 15:15:00,3472,0.182482,68,0.210904,24244,0.011254
1,2018-10-14 15:15:00,3472,0.182482,68,0.210904,-31883,0.009149
2,2018-10-14 15:15:00,3472,0.182482,68,0.210904,24244,0.011254
3,2018-10-14 15:15:00,3472,0.182482,68,0.210904,-31883,0.009149
4,2018-10-14 15:15:00,3472,0.182482,68,0.210904,24244,0.011254


In [37]:
df['average_busyness'] = df[['bike_busyness', 'taxi_busyness', 'volume_busyness']].mean(axis=1)

"""
df = df.rename(columns={
    'busyness_x': 'bike_busyness',
    'busyness_y': 'taxi_busyness',
    'busyness': 'volume_busyness'
})
"""

"\ndf = df.rename(columns={\n    'busyness_x': 'bike_busyness',\n    'busyness_y': 'taxi_busyness',\n    'busyness': 'volume_busyness'\n})\n"

## Factor for street "highway"

In [24]:
def safe_loads(row):
    try:
        return wkt.loads(row)
    except Exception:
        print(f"Error parsing row: {row}")
        return None  # or some other value

In [25]:
street_zones = pd.read_parquet("street_zones_simplified.parquet")

In [26]:
street_zones['geometry'] = street_zones['geometry'].apply(safe_loads)
street_zones['geometry'] = gpd.GeoSeries(street_zones['geometry'])

In [27]:
street_zones['geometry'].dtype

<geopandas.array.GeometryDtype at 0x170307bd0>

In [28]:
street_zones.head()

Unnamed: 0,highway,geometry,index_right,Taxi_Zone_ID
49464,path,"LINESTRING (-73.99998 40.70762, -73.99960 40.7...",210.0,209.0
87088,footway,"LINESTRING (-74.00009 40.70774, -74.00010 40.7...",210.0,209.0
5423,motorway_link,"LINESTRING (-74.00017 40.70857, -74.00015 40.7...",210.0,209.0
5423,motorway_link,"LINESTRING (-74.00017 40.70857, -74.00015 40.7...",41.0,45.0
7755,motorway,"LINESTRING (-73.99685 40.70903, -73.99742 40.7...",210.0,209.0


In [29]:
street_zones = street_zones.drop('index_right', axis=1)

In [30]:
street_zones["highway"].unique()

array(['path', 'footway', 'motorway_link', 'motorway', 'cycleway',
       'trunk', 'primary', 'steps', 'residential', 'secondary', 'service',
       'pedestrian', 'unclassified', 'primary_link', 'living_street',
       'construction', 'elevator', 'trunk_link', 'secondary_link',
       'tertiary', 'corridor', 'tertiary_link', 'track', 'bridleway',
       'proposed'], dtype=object)

In [42]:
# Factor based on path description (https://wiki.openstreetmap.org/wiki/Key:highway)
highway_factor = {
    'path': 0.6,
    'bridleway': 0.4,
    'footway': 0.6, 
    'cycleway': 0.6, 
    'secondary': 1, 
    'residential': 0.6, 
    'primary':1.3, 
    'tertiary':0.8, 
    'service':0.4, 
    'trunk':1.4, 
    'unclassified': 0.6, 
    'pedestrian': 0.6, 
    'tertiary_link': 0.8, 
    'living_street': 0.4, 
    'secondary_link':1, 
    'primary_link': 1.3, 
    'trunk_link': 1.4,
    'motorway': 1,
    'motorway_link': 1,
    'steps': 0.4,
    'track': 0.6,
    'elevator': 0.6,
    'construction': 1.2,
    'corridor': 0.8,
    'proposed': 0.8,
    }

In [43]:
highway_types = list(highway_factor.keys())

In [44]:
def find_and_average(row):
    highway_values = row['highway']
    if highway_values.startswith('['):  # if the value is a list
        highway_values = ast.literal_eval(highway_values)
    else:  # if the value is a single string
        highway_values = [highway_values]
    vals = [highway_factor[k] for k in highway_values if k in highway_types]
    return sum(vals) / len(vals) if vals else None

street_zones['highway_factor'] = street_zones.apply(find_and_average, axis=1)

print(street_zones.head())

             highway                                           geometry  \
49464           path  LINESTRING (-73.99998 40.70762, -73.99960 40.7...   
87088        footway  LINESTRING (-74.00009 40.70774, -74.00010 40.7...   
5423   motorway_link  LINESTRING (-74.00017 40.70857, -74.00015 40.7...   
5423   motorway_link  LINESTRING (-74.00017 40.70857, -74.00015 40.7...   
7755        motorway  LINESTRING (-73.99685 40.70903, -73.99742 40.7...   

       Taxi_Zone_ID  highway_factor  
49464         209.0             0.6  
87088         209.0             0.6  
5423          209.0             1.0  
5423           45.0             1.0  
7755          209.0             1.0  


In [45]:
street_zones["highway_factor"].unique()

array([0.6, 1. , 1.4, 1.3, 0.4, 1.2, 0.8, nan])

In [46]:
street_zones[street_zones['highway_factor'].isna()]

Unnamed: 0,highway,geometry,Taxi_Zone_ID,highway_factor
31120,corridor,"LINESTRING (-73.97798 40.75836, -73.97810 40.7...",161.0,
31124,corridor,"LINESTRING (-73.97788 40.75850, -73.97775 40.7...",161.0,
86299,proposed,"LINESTRING (-73.93459 40.80998, -73.93445 40.8...",74.0,
82561,proposed,"LINESTRING (-73.92976 40.84809, -73.92976 40.8...",120.0,
82561,proposed,"LINESTRING (-73.92976 40.84809, -73.92976 40.8...",243.0,


In [57]:
street_zones['geometry'] = street_zones['geometry'].astype(str)
street_zones.to_parquet('street_zones_factor.parquet', engine='pyarrow')

### Mapping

A function that creates from prediction table a street edge table
<br>
<br>
Input: table with [taxi_zone] and [busyness_index]<br>
Output: table with street and adjusted busyness_index

In [59]:
def get_streets(df, street_zones):
    """
    Input: table with [taxi_zone] and [busyness_index]
        df = result of prediction
        street_zones = mapping table from 

    Output: table with street and adjusted busyness_index
    """


    # Perform the join.
    output_df = street_zones.merge(df[['Taxi_Zone_ID', 'busyness']], on='Taxi_Zone_ID', how='left')

    # Adjust busyness by the highway factor.
    output_df['street_busyness'] = 1 - (1 - output_df['busyness']) * output_df['highway_factor']

    return output_df


street_zones = pd.read_parquet("street_zones_factor.parquet")

new_result = get_streets(df, street_zones)

KeyError: "['busyness'] not in index"

In [None]:
simplified_predict = pd.read_csv("predict_test.csv")
simplified_predict.shape

(37914, 6)