In [1]:
import pandas as pd
import xml.etree.ElementTree as ET
pd.set_option('display.max_rows', 4000)
import numpy as np
import urllib
import json
import requests, zipfile, io
import seaborn as sns
import re 

def get_element(tag,element_string):
    """Return the text of the first `element_string` child found under `tag`.

    When no such child exists the literal string 'None' (not the None object)
    is returned; when the child exists its `.text` attribute is returned as-is.
    """
    child = tag.find(element_string)
    return 'None' if child is None else child.text

def parking_zones(split_zone_identifiers_list):
    """Guarantee that a split zone description has at least two entries.

    A list with more than one element is returned unchanged (same object).
    A single-element list is padded with the string 'None' so callers can
    always index positions 0 and 1.
    """
    if len(split_zone_identifiers_list) <= 1:
        # Only a start zone was found: pad the missing end zone.
        return [split_zone_identifiers_list[0], 'None']
    return split_zone_identifiers_list

def str_cleaning_valid_time(string):
    """Strip periods and non-breaking spaces and turn every 'to' into '-'.

    Note: a later cell defines another function with this same name, which
    replaces this one once that cell has run.
    """
    # Order matters only in that each step works on the previous result.
    return string.replace('.','').replace('to','-').replace('\xa0','')

def str_cleaning_permited_time(string):
    """Rewrite a permitted-period description as an arithmetic expression string.

    Hours become a '*60' factor and minute suffixes are dropped, so
    '2 hours' yields '2*60' and '30 mins' yields '30'. The text 'None' and any
    value ending in a buses-only marker yield '0'.
    """
    cleaned = string.strip()

    # Applied strictly in this order; later entries rely on earlier ones
    # (e.g. 'hours' is singularised before ' hour' is converted).
    substitutions = (
        ('.', ''),
        ('\xa0', ''),
        ('hours', 'hour'),
        (' (delivery vehicles parking zone)', ''),
        (' (delivery vehicle parking zone)', ''),
        (' hour', '*60'),
        (' mins', ''),
        (' min', ''),
        ('None', '0'),
    )
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)

    if cleaned.endswith((r'(buses only)', r'(busesonly)')):
        return '0'
    return cleaned

def str_cleaning_area_between(string):
    """Lower-case an area description and drop filler fragments.

    Surrounding whitespace is trimmed first; then non-breaking spaces,
    the phrase 'a point ' and the word 'thereof' are removed wherever they occur.
    """
    normalised = string.strip().lower()
    for fragment in ('\xa0', 'a point ', 'thereof'):
        normalised = normalised.replace(fragment, '')
    return normalised



In [2]:
def import_dataset():
    """Download the parking-restriction schedule and return it as a DataFrame.

    Steps: query the CKAN ``package_show`` endpoint for the package metadata,
    download the archive referenced by the package's first resource, extract
    it into the current working directory, and parse
    ``Ch_950_Sch_15_ParkingForRestrictedPeriods.xml``.

    Returns:
        pandas.DataFrame: the rows whose ``park_side`` is not null, holding
        the raw columns ``ID``, ``street``, ``park_side``, ``area_between``,
        ``valid_time`` and ``permited_time`` plus the derived columns
        ``permited_time_mins``, ``start_zone`` and ``end_zone``.

    Raises:
        requests.HTTPError: if the archive download answers with an error status.
    """
    # `import urllib` by itself does not guarantee the `request` submodule is loaded.
    import urllib.request

    # Get the dataset metadata by passing the package id to the package_show endpoint.
    url = "https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/package_show"
    params = { "id": "72040958-e532-46f7-9228-8d07b4677a2b"}
    with urllib.request.urlopen(url, data=bytes(json.dumps(params), encoding="utf-8")) as response:
        package = json.loads(response.read())

    # NOTE(review): assumes the first listed resource is the zip archive — confirm.
    zip_file_url = package['result']['resources'][0]['url']

    r = requests.get(zip_file_url)
    r.raise_for_status()  # fail here rather than with an opaque zip error below
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        z.extractall()

    root = ET.parse('Ch_950_Sch_15_ParkingForRestrictedPeriods.xml').getroot()

    # XML child tag -> DataFrame column, in output column order.
    field_to_column = {
        'ID': 'ID',
        'Highway': 'street',
        'Side': 'park_side',
        'Between': 'area_between',
        'Times_and_or_Days': 'valid_time',
        'Maximum_Period_Permitted': 'permited_time',
    }
    records = root.findall('Ch_950_Sch_15_ParkingForRestrictedPeriods')
    import_df = pd.DataFrame({
        column: [get_element(tag, field) for tag in records]
        for field, column in field_to_column.items()
    })

    # Keep only rows that carry a parking side.
    raw_df = import_df[~import_df.park_side.isna()].copy()

    # NOTE(security): eval() runs on text derived from the downloaded file.
    # The cleaning step is expected to leave plain arithmetic such as '2*60',
    # but nothing enforces that; replace with a strict parser if the source
    # is not trusted.
    cleaned_periods = raw_df.permited_time.apply(lambda x: str_cleaning_permited_time(x))
    raw_df['permited_time_mins'] = cleaned_periods.apply(lambda x: eval(x))

    # Split on the whole word 'and' only, so names that merely contain those
    # letters (e.g. 'sandhurst') are not cut apart.
    zone_parts = raw_df.area_between.apply(
        lambda x: parking_zones(re.split(r'\band\b', str_cleaning_area_between(x)))
    )
    raw_df['start_zone'] = zone_parts.apply(lambda parts: parts[0])
    raw_df['end_zone'] = zone_parts.apply(lambda parts: parts[1])

    return raw_df

In [3]:
# Build the parking-restriction table; import_dataset() performs the download.
df = import_dataset()

In [4]:
# Preview the first rows of the imported table.
df.head()

Unnamed: 0,ID,street,park_side,area_between,valid_time,permited_time,permited_time_mins,start_zone,end_zone
0,4,Tenth Street,Both,Lake Shore Blvd W and the first lane south the...,"9:00 a.m. to 6:00 p.m., Mon. to Sat.",2 hours,120,lake shore blvd w,the first lane south
1,6,Eleventh Street,Both,Lake Shore Blvd W and the first lane,"9:00 a.m. to 6:00 p.m., Mon. to Sat.",2 hours,120,lake shore blvd w,the first lane
3,11,Thirteenth Street,Both,Lake Shore Boulevard West and the first lane s...,"9:00 a.m. to 6:00 p.m., Mon. to Sat.",2 hours,120,lake shore boulevard west,the first lane south
4,12,Thirteenth Street,East,Lake Shore Boulevard West and the first lane s...,"9:00 a.m. to 6:00 p.m., Mon. to Sat.",,0,lake shore boulevard west,the first lane south
5,14,First Street,East,Lake Shore Boulevard West and a point 26 metre...,"9:00 a.m. to 6:00 p.m., Mon. to Sat.",2 hours,120,lake shore boulevard west,26 metres south


In [5]:
def time_reformat(string):
    """Zero-pad single-digit hours so every time reads HH:MM.

    Only an `H:MM` that is not preceded by another digit is padded, so
    '9:00' becomes '09:00' while '12:00' and '04:00' are left alone.

    Args:
        string: text that may contain clock times.

    Returns:
        The text with each single-digit hour prefixed by '0'.
    """
    # Substituting in place keeps the lookbehind attached to each match;
    # re-searching for the bare matched text would also hit the tail of a
    # two-digit hour (padding '2:00' would turn '12:00' into '102:00').
    return re.sub(r'(?<!\d)(\d:\d\d)', r'0\1', string)

def date_reformat(string):
    """Remove the whitespace between a three-letter run and a following day number.

    'Apr 1-Nov 30' becomes 'Apr1-Nov30'. Only ASCII letters count as part of
    the three-letter run.

    Args:
        string: text that may contain `<letters> <1-2 digits>` fragments.

    Returns:
        The text with that single whitespace character removed in each fragment.
    """
    # One in-place substitution: the matched text is never re-used as a
    # pattern, so punctuation next to a date cannot break the regex, and
    # [A-Za-z] avoids the '[', '\\', ']', '^', '_' and '`' that [A-z] also spans.
    return re.sub(r'([A-Za-z]{3})\s(\d{1,2})', r'\1\2', string)

# Month abbreviation -> full month name used by month_reformat.
# May, June and July have no entry (presumably the source text already
# spells them out — TODO confirm against the data).
month_conv_dict = {'Jan' : 'January',
                   'Feb' : 'February',
                   'Mar' :'March',
                   'Apr' :'April',
                   'Aug' :'August',
                   'Sept':'September',
                   'Oct' :'October',
                   'Nov' :'November',
                   'Dec' :'December'}

def month_reformat(string):
    """Expand the month abbreviations listed in `month_conv_dict`.

    An abbreviation is expanded only when it is not immediately followed by
    a lower-case letter, so text that already holds the full name is left
    untouched instead of being expanded a second time
    ('January' would otherwise turn into 'Januaryuary').

    Args:
        string: text that may contain month abbreviations.

    Returns:
        The text with each abbreviation replaced by its full month name.
    """
    for abbreviation, full_name in month_conv_dict.items():
        string = re.sub(re.escape(abbreviation) + r'(?![a-z])', full_name, string)
    return string


def remove_exception(string):
    """Drop an 'except ...' clause from the text.

    Everything from the first 'except' to the end of that line is removed.
    Text without 'except' is returned unchanged.

    Args:
        string: schedule text that may contain an exception clause.

    Returns:
        The text without the exception clause.
    """
    tail = re.search(r'(?<=except).*', string)
    if tail is not None:
        # Literal replacement: the captured tail may contain characters such
        # as '(' or ')' that would be misread if it were used as a regex.
        string = string.replace('except' + tail.group(0), '')

    return string

def str_cleaning_valid_time(string):
    """Normalise a raw 'times and/or days' description into a compact form.

    A fixed, ordered list of literal replacements is applied, followed by
    time_reformat, date_reformat and month_reformat. This definition reuses
    the name of an earlier, shorter helper and takes its place once this
    cell has run. Exception clauses are not stripped here.

    NOTE(review): the 'to' and 'the' replacements are plain substring
    replacements, so they also act inside longer words (for example the
    'to' in 'October').
    """
    # The sequence is order-dependent: several entries only make sense after
    # an earlier entry has already rewritten the text.
    ordered_replacements = (
        ('\xa0',''),
        ('(',''),
        (')',''),
        ('of one day to','-'),
        ('a.m','am'),
        ('a.m.','am'),
        ('p.m','pm'),
        ('p.m.','pm'),
        ('pm.','pm.,'),
        ('first','1st'),
        (' each month',''),
        (' of the next following year',''),
        (' of one year ',''),
        (' day of each month ',''),
        (' day of',''),
        (' day',''),
        (', inclusive',''),
        ('of the next following day',''),
        ('to','-'),
        ('the',''),
        (';',','),
        (' anytime, Sat. and Sun.',''),
        (' anytime Sat., Sun. and public holidays',''),
        ('; anytime, Sat, Sun and public holidays',''),
        ('Thurs.','Thurs.,'),
        ('From','from'),
        (' am. Mon.',' am., Mon.'),
        ('Mon - Fri.', 'Mon. - Fri.'),
        ('Mon.- Fri.', 'Mon. - Fri.'),
        ('Mon.-Fri.', 'Mon. - Fri.'),
        ('Monday - Friday', 'Mon. - Fri.'),
        ('Mon. To Fri.', 'Mon. - Fri.'),
        ('Monday- Friday','Mon. - Fri.'),
        (' - ','-'),
        ('- ','-'),
        ('-','-'),
        ('.',''),
        (',,',','),
        (',-','-'),
        ('Sat, Sun', 'Sat-Sun'),
        ('Sat and Sun', 'Sat-Sun'),
        ('Sat and Sun', 'Sat-Sun'),
        ('Monday-Friday','Mon-Fri'),
    )
    for old, new in ordered_replacements:
        string = string.replace(old, new)

    # Structural fixes run last, on the already-normalised text.
    return month_reformat(date_reformat(time_reformat(string)))

# Distinct 'anytime...' fragments present in the raw schedule text
# (kept for inspection; the flags below no longer depend on it).
anytime_check = np.unique(df.valid_time.str.extractall(r'(anytime.*)').values)

# One 0/1 flag per row of df.valid_time, in row order.
# A plain substring test replaces the former `list in ndarray` membership
# check, which for rows without a match compared an empty list against the
# whole array and therefore depended on how NumPy treats shape-mismatched `==`.
anytime_weekend = [1 if 'anytime' in text else 0 for text in df.valid_time]

# Flags rows that mention an exception in either capitalisation.
exceptions = [1 if ('except' in text or 'Except' in text) else 0
              for text in df.valid_time]



In [6]:
# Add the normalised schedule text to df, then derive a separate frame that
# also carries the two per-row flag lists.
df['clean'] = df.valid_time.apply(str_cleaning_valid_time)

time_df = df.assign(ss_anytime=anytime_weekend, exceptions=exceptions)

In [7]:
# Combined extractor for cleaned schedule text. One named group per kind of
# fragment; alternatives are tried in this order at each position:
#   time      -> '07:00 am-12:00 am'
#   days      -> 'Mon-Fri'
#   date      -> 'April1-November30'
#   day_range -> '1st-15th'
# Letters use [A-Za-z]; the range [A-z] would also accept '[', '\', ']',
# '^', '_' and '`', which sit between 'Z' and 'a' in ASCII.
regex = re.compile(r'''(?P<time>\d\d:\d\d\s[a-z][a-z]-\d\d:\d\d\s[a-z][a-z])
|(?P<days>[A-Za-z]{3}-[A-Za-z]{3})
|(?P<date>[A-Za-z]{5,9}\d{1,2}-[A-Za-z]{5,9}\d{1,2})
|(?P<day_range>\d{1,2}[A-Za-z][A-Za-z]-\d{1,2}[A-Za-z][A-Za-z]|\d{1,2}[A-Za-z][A-Za-z]-[A-Za-z]{4})''',re.X)

In [8]:
# Show every named-group match found in the cleaned text of the row labelled 372.
df.clean.str.extractall(regex).loc[[372]]

Unnamed: 0_level_0,Unnamed: 1_level_0,time,days,date,day_range
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
372,0,07:00 am-12:00 am,,,
372,1,,,,1st-15th
372,2,,,April1-November30,
372,3,07:00 am-12:00 am,,,
372,4,,,December1-March31,


In [9]:
# Spot checks of each pattern on a hand-picked row label.
# Only the value of the final expression is rendered as the cell output;
# the results of the first three expressions are computed and discarded.

# Extracts time ranges such as '07:00 am-12:00 am'.
df.clean.str.extractall(r'''(?P<time>\d\d:\d\d [a-z][a-z]-\d\d:\d\d [a-z][a-z])''').loc[[480]]

# Extracts a range of days within a month, such as '1st-15th'.
df.clean.str.extractall(r'''(?P<day_range>\d{1,2}[A-z][A-z]-\d{1,2}[A-z][A-z]|\d{1,2}[A-z][A-z]-[A-z]{4})''').loc[[3143]]

# Extracts month-and-date ranges such as 'April1-November30'.
df.clean.str.extractall(r'''([A-z]{5,9}\d{1,2}-[A-z]{5,9}\d{1,2})''').loc[[2386]]

# Extracts weekday ranges such as 'Mon-Fri' (this is the displayed result).
# NOTE(review): [A-z] also matches '[', '\', ']', '^', '_' and '`'.
df.clean.str.extractall(r'''([A-z][A-z][A-z]-[A-z][A-z][A-z]|[A-z][A-z][A-z]\d{1,2}-[A-z][A-z][A-z]\d{1,2})''').loc[[2386]]

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Unnamed: 0_level_1,match,Unnamed: 2_level_1
2386,0,Mon-Fri
2386,1,Sat-Sun


In [10]:
def extract_timing(string):
    """Unfinished stub: defines two patterns and returns None.

    Neither pattern is applied to `string` yet and there is no return
    statement. TODO: implement the extraction or remove this function.
    """
    # Matches a time range such as '10:00 am-04:00 pm'.
    time_regex = r'''(?P<time>\d\d:\d\d [a-z][a-z]-\d\d:\d\d [a-z][a-z])'''
    # Matches a weekday range such as 'Mon-Fri', or a three-letter run with day numbers.
    day_regex = r'''([A-z][A-z][A-z]-[A-z][A-z][A-z]|[A-z][A-z][A-z]\d{1,2}-[A-z][A-z][A-z]\d{1,2})'''
    
    

In [56]:
# Scratch exploration on the cleaned schedule text of the row labelled 2386.
string = list(df.loc[[2386]].clean)[0]

# Lazy match iterators, one per fragment kind. Only found_time is consumed below.
found_time = re.finditer(r'''(?P<time>\d\d:\d\d [a-z][a-z]-\d\d:\d\d [a-z][a-z])''',string)
found_day = re.finditer(r'''([A-z][A-z][A-z]-[A-z][A-z][A-z]|[A-z][A-z][A-z]\d{1,2}-[A-z][A-z][A-z]\d{1,2})''',string)
found_date = re.finditer(r'''([A-z]{5,9}\d{1,2}-[A-z]{5,9}\d{1,2})''',string)
found_day_range = re.finditer(r'''(?P<day_range>\d{1,2}[A-z][A-z]-\d{1,2}[A-z][A-z]|\d{1,2}[A-z][A-z]-[A-z]{4})''',string)

# Records the character span of each time match; nothing is printed or stored
# beyond the last iteration's values.
# NOTE(review): the recorded output under this cell appears to come from an
# earlier version of the loop — re-run to confirm.
for a in found_time:
    element_start = a.span()[0]
    element_end = a.span()[1]
    relevant_regex = []
    

10:00 am-04:00 pm
10:00 am-06:00 pm


In [46]:
# Show the cleaned schedule text selected in the scratch cell (`string`).
print(string)

10:00 am-04:00 pm, Mon-Fri, except public holidays, and10:00 am-06:00 pm, Sat-Sun and public holidays


'10:00 am-04:00 pm, Mon-Fri, except public holidays, and10:00 am-06:00 pm, Sat-Sun and public holidays'