In [1]:
import psycopg2

def run_query(query: str = ''):
    """ Connect to the PostgreSQL database server """
    conn = None
    try:
        # connect to the PostgreSQL server
        conn = psycopg2.connect(
            host="localhost",
            port='25432',
            database="gis",
            user="docker",
            password="docker"
        )

        # create a cursor
        cur = conn.cursor()

        # execute a statement
        cur.execute(query)
        data = cur.fetchall()

        # close the communication with the PostgreSQL
        cur.close()
        return data
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        if conn is not None:
            conn.close()

In [2]:
query = """select trip_id from trip;"""
trip_id_list = [id[0] for id in run_query(query)]
# trip_id_list

In [3]:
test = '1445.1.60-27-d12-1.149.O'
test = test.replace('-d12-', '-b12-', 1).replace('-ga2-', '-gad-', 1)
test

'1445.1.60-27-b12-1.149.O'

In [4]:
all([len(i.split('.')) == 5 for i in trip_id_list]) == True

# we can now assert that every trip in our database be split by 
# a . 5 times, this means that there is a consistent structure across all our data

True

In [5]:
test in trip_id_list

False

In [6]:
test.split('.')

['1445', '1', '60-27-b12-1', '149', 'O']

In [7]:
'.'.join(test.split('.'))
str.join('.',test.split('.'))

'1445.1.60-27-b12-1.149.O'

In [8]:
tokens = test.split('.')
tokens[3] = '*'
new_str = '.'.join(tokens)
new_str

'1445.1.60-27-b12-1.*.O'

In [9]:
new_str in trip_id_list

False

In [10]:
import re

r = re.compile(new_str)
matched_list = list(filter(r.match, trip_id_list))
matched_list


['1445.1.60-27-b12-1.151.O']

In [11]:
found_str = None
if len(matched_list) == 1:
    found_str = matched_list[0]

found_str

'1445.1.60-27-b12-1.151.O'

In [48]:
def find_trip_regex(trip_list, trip_id):
    assert type(trip_id) == str, 'trip_id must be string'

    tokens = trip_id.split('.')
    assert len(tokens) == 5, 'unusual trip_id, please investigate: {}'.format(trip_id)

    route_id = tokens[2].split('-')
    if route_id[2] in ['ga2', 'gad']:
        route_id[2] = 'ga[2|d]'
        tokens[2] = '-'.join(route_id)
    elif route_id[2] in ['d12', 'b12']: 
        route_id[2] = '[b|d]12'
        tokens[2] = '-'.join(route_id)

    tokens[3] = '*'

    reg = '.'.join(tokens)
    print(reg)

    r = re.compile(reg)
    matched_list = list(filter(r.match, trip_list))

    if len(matched_list) > 0:
        # print(matched_list, trip_id)
        return matched_list[0]
    else: return None

find_trip_regex(trip_id_list, '3368.1.60-46A-d12-1.248.I')

3368.1.60-46A-[b|d]12-1.*.I


'3368.1.60-46A-b12-1.253.I'

In [13]:
for i in trip_id_list:
    id = i.split('.')[2].split('-')[2]
    # if not id in ['gad', 'b12', 'd12']: 
    #     print(i)