In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [7]:
import json
# Parse the LION street-segment GeoJSON into a plain dict.
# The context manager closes the file handle as soon as parsing finishes
# (the bare open() call leaked it), and the explicit encoding avoids
# depending on the platform's default locale.
with open('lion.geojson', encoding='utf-8') as lion_file:
    lion = json.load(lion_file)

In [62]:
# One row per GeoJSON feature, built from each feature's attribute dict
# (geometry is handled separately).
lion_df = pd.DataFrame([feature['properties'] for feature in lion['features']])

In [104]:
# Address-range attributes taken from the LION segment table.
lion_ranges = lion_df.filter(
    ['FromLeft', 'FromRight', 'ToLeft', 'ToRight', 'SegmentID', 'StreetCode'], axis=1)

# One frame per side of the street, each exposing its own range as From/To
# so both sides can be stacked into a single lookup table.
lion_l = lion_ranges.assign(
    From=lion_ranges['FromLeft'], To=lion_ranges['ToLeft'], Side='L')
lion_r = lion_ranges.assign(
    From=lion_ranges['FromRight'], To=lion_ranges['ToRight'], Side='R')

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and, like append, keeps the original (duplicated) index.
lion_split = pd.concat([lion_l, lion_r])
# Parity of the range start; used later as a join key alongside StreetCode.
lion_split['Even'] = (lion_split['From'] % 2 == 0)
lion_split = lion_split.filter(['StreetCode', 'SegmentID', 'From', 'To', 'Side', 'Even'])

In [105]:
# Order the segment table by range start first: 'From' is the right-hand
# key of the later pd.merge_asof call, which needs its key column sorted.
lion_split = lion_split.sort_values(by=['From', 'StreetCode', 'Even'])

In [187]:
%%time
# Load only the first million rows of the violations file.
parking_violations = pd.read_csv(
    'parking-violations.csv',
    low_memory=False,
    nrows=1_000_000,
)

CPU times: user 6.43 s, sys: 2.15 s, total: 8.58 s
Wall time: 9.04 s


In [131]:
import shapely.geometry
# Map each SegmentID to its geometry so a segment can be looked up by id.
# If a SegmentID occurs more than once, the last feature wins.
lion_geom = {}
for feature in lion['features']:
    segment_id = feature['properties']['SegmentID']
    lion_geom[segment_id] = shapely.geometry.MultiLineString(
        feature['geometry']['coordinates'])

In [33]:
from pyproj import CRS, Transformer
# Coordinate systems for reprojecting segment points: EPSG:2263 is the
# source, EPSG:4326 the target.
# NOTE(review): assumes the LION geometry is stored in EPSG:2263 — confirm.
nySP = CRS.from_epsg(2263)
wgs84 = CRS.from_epsg(4326)
# NOTE(review): always_xy is left at its default, so the output axis order
# follows the target CRS definition (presumably lat, lon) — confirm.
transformer = Transformer.from_crs(crs_from=nySP, crs_to=wgs84)

In [188]:
%%time
# 'Violation County' abbreviation -> numeric borough code (1-5).
# Several spellings map to the same borough.
CONVERT_BOROUGH = {
    'K': 3,
    'Q': 4,
    'M': 1,
    'B': 2,
    'ST': 5,
    'QN': 4,
    'BK': 3,
    'NY': 1,
    'BX': 2,
    'R': 5,
}
def get_full_street_code(row):
    """Build the borough-prefixed street code for one violation row.

    Reads ``row['Street Code1']`` and ``row['Violation County']`` and
    returns ``str(street_code + borough * 100000)``, e.g. ``'110010'``.

    Returns the integer sentinel ``-1`` (not NaN/None) when the street code
    is zero, missing or non-numeric, or when the county is not a key of
    ``CONVERT_BOROUGH``. Callers must therefore filter on ``-1`` explicitly.
    """
    try:
        stc1 = int(row['Street Code1'])
    except (TypeError, ValueError, OverflowError):
        # Missing (NaN/None) or non-numeric code: report it as unmatchable
        # instead of letting one bad row abort the whole DataFrame.apply.
        return -1
    if stc1 == 0:
        return -1
    borough = CONVERT_BOROUGH.get(row['Violation County'])
    if borough is None:
        return -1
    return str(stc1 + borough * 100000)

def addr(row):
    """Convert ``row['House Number']`` to a single non-negative integer.

    * Plain numbers (``'123'``) are returned as ``123``.
    * Hyphenated numbers with exactly two parts (``'12-34'``) are folded
      into ``12 * 1000 + 34``.
    * Integer-valued floats (``123.0``, as produced when the column is
      parsed as float) are accepted as ``123``.

    Returns ``-1`` for missing values and for anything that cannot be
    parsed (non-numeric text, more than one hyphen, fractional or negative
    floats).
    """
    house_number = row['House Number']
    if isinstance(house_number, float):
        # NaN and infinities fail is_integer(), so missing values land here.
        if not house_number.is_integer() or house_number < 0:
            return -1
        return int(house_number)

    text = str(house_number)
    try:
        if '-' in text:
            # Unpacking raises ValueError when there are more than two
            # parts; the handler below turns that into -1.
            above, below = text.split('-')
            return int(above) * 1000 + int(below)
        return int(text)
    except ValueError:
        # TODO: try to clean up non-numeric addresses
        return -1

# Derive the join keys row by row: the borough-prefixed street code, the
# numeric house number, and the house number's parity.
full_street_codes = parking_violations.apply(get_full_street_code, axis=1)
house_numbers = parking_violations.apply(addr, axis=1)
parking_violations['StreetCode'] = full_street_codes
parking_violations['Addr'] = house_numbers
parking_violations['Even'] = house_numbers.mod(2).eq(0)

CPU times: user 1min 20s, sys: 3.9 s, total: 1min 24s
Wall time: 1min 27s


In [189]:
%%time
# Keep only violations that have both a usable house number and a usable
# street code, sorted by 'Addr' (the left key of the later as-of join).
has_house_number = parking_violations['Addr'] > -1
# get_full_street_code signals failure with the integer -1 rather than NaN,
# so notna() alone lets those rows through; exclude the sentinel explicitly.
has_street_code = (
    parking_violations['StreetCode'].notna()
    & (parking_violations['StreetCode'] != -1)
)
parking_violations_cleaned = (
    parking_violations[has_house_number & has_street_code]
    .copy()
    .sort_values(['Addr', 'StreetCode', 'Even'])
)

CPU times: user 1.55 s, sys: 533 ms, total: 2.08 s
Wall time: 2.11 s


In [190]:
%%time
# As-of join: within each (StreetCode, Even) group, attach the segment row
# chosen by merge_asof's default matching of 'Addr' against 'From'.
pv_merged = pd.merge_asof(
    left=parking_violations_cleaned,
    right=lion_split,
    left_on='Addr',
    right_on='From',
    by=['StreetCode', 'Even'],
)
# Discard rows whose house number is past the segment's range end; rows
# with no match have a NaN 'To' and are dropped too.
pv_merged = pv_merged[pv_merged['Addr'].le(pv_merged['To'])]

CPU times: user 5.93 s, sys: 2.61 s, total: 8.54 s
Wall time: 8.66 s


In [191]:
%%time
def find_latlng(row):
    """Return an interpolated position for one merged row as 'a,b' text.

    The house number's relative position inside the matched [From, To]
    range is used as a fraction along the segment geometry looked up in
    ``lion_geom``. The resulting point is passed through ``transformer``
    and both output values are formatted with six decimals.
    """
    range_start = row['From']
    range_end = row['To']
    if range_end == range_start:
        # Zero-width range: pin to the segment start instead of dividing by 0.
        fraction = 0
    else:
        fraction = (row['Addr'] - range_start) / (range_end - range_start)

    segment = lion_geom[row['SegmentID']]
    # Second argument True: presumably shapely's `normalized` flag, so the
    # distance is a fraction of the segment's length — confirm.
    point_xy = segment.interpolate(fraction, True).coords[0]
    projected = transformer.transform(point_xy[0], point_xy[1])
    return '%0.6f,%0.6f' % projected

# Geocode each matched violation, then split the comma-separated result
# into two columns. 'Lat' and 'Lng' hold the formatted strings, not floats.
latlng_text = pv_merged.apply(find_latlng, axis=1)
pv_merged[['Lat', 'Lng']] = latlng_text.str.split(',', n=1, expand=True)

# list(lion_df.columns)
# will join dataset of parking violations columns Registration State, 

CPU times: user 2min 13s, sys: 5.4 s, total: 2min 18s
Wall time: 2min 20s


In [192]:
# Persist the geocoded violations (row index included) next to the notebook.
pv_merged.to_csv(path_or_buf='latlng.csv')

In [193]:
# Number of violations that were matched to a street segment.
pv_merged.shape[0]

704366

In [196]:
# Distinct values found in the 'Days Parking In Effect' column.
days_in_effect = pv_merged['Days Parking In Effect']
days_in_effect.unique()

array(['Y', 'YYYYYYB', 'BBBBBBB', 'YBBBBBB', 'YYYYYYY', 'YYYYY',
       'BYBBYBB', 'Y    YY', 'Y  Y', 'YYYBYBB', 'YYYYYY', 'YYYYYBB',
       'Y     Y', 'YYYYBBB', 'Y Y Y', 'BBBBYYY', 'BYBBBBB', 'YBBYBBB',
       'BBBYBBB', 'BYYBYBB', 'BBYBBBB', 'BBBBYBB', 'YBYBYBB', 'Y YYYYY',
       'BBBBBYY', 'BBBBBBY', 'BBBYBBY', 'Y   Y', 'BYYYYBB', 'Y Y',
       'BBBYYBB', 'BYBYBYB', 'YYYYBYY', 'BYBYBBB', 'YY YYYY', 'YBYYBBB',
       'YYYY', 'YYBBYBB', 'YYBYYBB', 'BYBYYBB', 'YBBYYBB', 'YY YY',
       'BYBBYBY', 'YBYYYYY', 'BBYYBBB', 'YY Y', 'BBBBYBY', 'BYBYBYY',
       'YYBYBBB', 'BYYYYYB', 'YBBBYBB', 'YBYYYBB', 'BYBBYYB', 'Y YY',
       'YYYBYYB', 'YBYBYBY', 'BYYBBBB', 'YYY YYY', 'YBBYBYB', 'YYBYYYY',
       'BYBYYYB', 'BBBBBYB', 'YBBYBYY', 'BBYBYBB', 'BBBBYYB', 'YYYBBYY',
       'YYBYYYB', 'Y Y  Y', 'YY YYY', 'BYYBYYY', 'YYBBBBB', 'YYYYYBY',
       'YYY Y', 'YYYYBYB', 'Y  Y Y', 'YBYYYYB', 'YY', 'YBYYBYB', 'Y    Y',
       'YYYYBBY', 'YBYBYYB', 'YBBBYYB', 'BBBYBYB', 'Y  YY', 'BYYBYYB',
       'BBB

In [194]:
# Violations with a positive 'Street Code1', i.e. those that could be matched.
int((parking_violations['Street Code1'] > 0).sum())

851795

In [197]:
# Share of violations with a positive street code that were geocoded.
geocoded_count = len(pv_merged)
eligible_count = len(parking_violations[parking_violations['Street Code1'] > 0])
geocoded_count / eligible_count

0.8269196226791657

In [202]:
# The 20 violation codes with the most summonses (non-null 'Summons Number').
summons_per_code = parking_violations.groupby(['Violation Code'])['Summons Number'].count()
summons_per_code.sort_values(ascending=False).head(20)

Violation Code
21    209842
38     99582
14     88145
20     75111
46     56994
36     50412
71     49747
40     45793
37     40538
7      40020
70     28604
19     27436
16     17409
69     17156
31     13096
5      12220
74      9491
47      9425
84      9102
50      7518
Name: Summons Number, dtype: int64