In [94]:
import csv
from collections import Counter, defaultdict
from tqdm import tqdm
import json
import numpy as np
import time
import seaborn as sns
import pandas as pd
import requests
from openclean.pipeline import stream

import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer, util,CrossEncoder
from scipy.stats import spearmanr
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, ndcg_score
from sklearn.feature_extraction.text import TfidfVectorizer

# NYC Parking Violation

### Datasets and Data Clean

In [107]:

violation_data_url = "https://data.cityofnewyork.us/resource/kvfd-bves.json"
params = {'$where': "issue_date <= '2023-12-31T23:59:59'", '$limit': 1000000}
response = requests.get(violation_data_url, params=params)
violation_data = response.json()
df_violation = pd.DataFrame(violation_data)


violation_codes_url = 'https://data.cityofnewyork.us/resource/ncbg-6agr.json'
response = requests.get(violation_codes_url)
violation_codes_data = response.json()
df_violation_codes = pd.DataFrame(violation_codes_data)

df_violation['violation_code'] = df_violation['violation_code'].astype(str)
df_violation_codes['code'] = df_violation_codes['code'].astype(str)

df1 = pd.merge(df_violation, df_violation_codes[['code', 'definition']], left_on='violation_code', right_on='code', how='left')
df1 = df1.drop(columns='code')
df1 = df1.rename(columns={'definition': 'code_definition'})


In [108]:
COLUMNS = [
    'summons_number',
    'plate_id',
    'registration_state',
    'plate_type',
    'issue_date',
    'violation_code',
    'vehicle_body_type',
    'vehicle_make',
    'issuing_agency',
    'street_name',
    'intersecting_street',
    'vehicle_expiration_date',
    'violation_time',
    'violation_county',
    'violation_description',
    'code_definition'
]

df = df1[COLUMNS]

In [109]:
df.duplicated().sum()

0

In [110]:
df.columns

Index(['summons_number', 'plate_id', 'registration_state', 'plate_type',
       'issue_date', 'violation_code', 'vehicle_body_type', 'vehicle_make',
       'issuing_agency', 'street_name', 'intersecting_street',
       'vehicle_expiration_date', 'violation_time', 'violation_county',
       'violation_description', 'code_definition'],
      dtype='object')

In [111]:
df.isna().sum()

summons_number                  0
plate_id                        0
registration_state              0
plate_type                      0
issue_date                      0
violation_code                  0
vehicle_body_type            2706
vehicle_make                 2006
issuing_agency                  0
street_name                   111
intersecting_street        404751
vehicle_expiration_date         0
violation_time                 11
violation_county             1848
violation_description      462655
code_definition               181
dtype: int64

In [112]:
def convert_date(date_str):
    if date_str in ["88880088", "0"]:
        return np.nan
    else:
        return pd.to_datetime(date_str, format='%Y%m%d', errors='coerce')

def convert_time(time_str):
    if not isinstance(time_str, str):
        return time_str
    if time_str[-1] == 'P':
        hour = int(time_str[:2])
        if hour < 12:
            hour += 12
        return f"{hour:02d}:{time_str[2:4]}"
    elif time_str[-1] == 'A':
        hour = int(time_str[:2])
        return f"{hour:02d}:{time_str[2:4]}"
    else:
        return time_str

df['issue_date'] = pd.to_datetime(df['issue_date']).dt.date
df['vehicle_expiration_date'] = df['vehicle_expiration_date'].apply(convert_date)
df['violation_time'] = df['violation_time'].apply(convert_time)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['issue_date'] = pd.to_datetime(df['issue_date']).dt.date
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['vehicle_expiration_date'] = df['vehicle_expiration_date'].apply(convert_date)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['violation_time'] = df['violation_time'].apply(convert_time)

In [113]:
df


Unnamed: 0,summons_number,plate_id,registration_state,plate_type,issue_date,violation_code,vehicle_body_type,vehicle_make,issuing_agency,street_name,intersecting_street,vehicle_expiration_date,violation_time,violation_county,violation_description,code_definition
0,1471497410,HZH8177,NY,PAS,2020-07-02,20,SUBN,NISSA,P,SHORE FRONT PKWY,,2022-05-10,12:59,Q,,NO PARKING-DAY/TIME LIMITS
1,1471497630,JCX5781,NY,PAS,2020-06-27,20,P-U,DODGE,P,ROCKAWAY BEACH BLVD,,2020-09-09,09:40,Q,,NO PARKING-DAY/TIME LIMITS
2,1471497641,HEK2391,NY,PAS,2020-06-27,20,SUBN,KIA,P,ROCKAWAY BEACH BLVD,,2022-03-06,09:40,Q,,NO PARKING-DAY/TIME LIMITS
3,1471497653,GWY9859,NY,PAS,2020-06-27,20,SUBN,JEEP,P,ROCKAWAY BEACH BLVD,,2021-05-06,09:40,Q,,NO PARKING-DAY/TIME LIMITS
4,1471497665,HEZ5501,NY,PAS,2020-06-27,20,SUBN,SUBAR,P,ROCKAWAY BEACH BLVD,,2022-05-24,09:40,Q,,NO PARKING-DAY/TIME LIMITS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,4700723865,HSC7218,NY,PAS,2020-07-28,36,4DSD,ACURA,V,EB NEPTUNE AVE @ W 5,TH ST,NaT,15:06,BK,PHTO SCHOOL ZN SPEED VIOLATION,PHTO SCHOOL ZN SPEED VIOLATION
999996,4700723889,HVN4751,NY,PAS,2020-07-28,36,SUBN,TOYOT,V,EB UNION TPKE @ KENT,ST,NaT,15:06,QN,PHTO SCHOOL ZN SPEED VIOLATION,PHTO SCHOOL ZN SPEED VIOLATION
999997,4700723890,JEC5912,NY,PAS,2020-07-28,36,4DSD,HONDA,V,SB BELL BLVD @ 18TH,AVE,NaT,15:06,QN,PHTO SCHOOL ZN SPEED VIOLATION,PHTO SCHOOL ZN SPEED VIOLATION
999998,4700723932,JRA7084,NY,PAS,2020-07-28,36,4DSD,ME/BE,V,NB KISSENA BLVD @ ME,LBOURNE AVE,NaT,15:06,QN,PHTO SCHOOL ZN SPEED VIOLATION,PHTO SCHOOL ZN SPEED VIOLATION
