In [2]:
# Imports and global display/warning configuration for the notebook.
import os
import pandas as pd
# Allow pandas to show up to 50 columns before it truncates wide frames.
pd.set_option("display.max_columns", 50)
import matplotlib.pyplot as plt
import warnings
# Hide UserWarning messages from here on (applies to every library in this kernel).
warnings.simplefilter(action='ignore', category=UserWarning)
%matplotlib inline

In [3]:
from pyhive import hive

# Connection settings are taken from the environment; a missing variable
# raises KeyError naming the variable.
username = os.environ['RENKU_USERNAME']

# HIVE_SERVER2 is expected to look like "<host>:<port>". Split it once and
# validate the port here, so a malformed value fails with a readable message
# before any network call is attempted.
hive_server2_parts = os.environ['HIVE_SERVER2'].split(':')
if len(hive_server2_parts) < 2 or not hive_server2_parts[1].isdigit():
    raise ValueError(
        "HIVE_SERVER2 must have the form '<host>:<port>', got {!r}".format(
            os.environ['HIVE_SERVER2']
        )
    )
hive_host = hive_server2_parts[0]
hive_port = int(hive_server2_parts[1])  # numeric port rather than raw text

# Open the HiveServer2 connection shared by the cells below.
conn = hive.connect(host=hive_host,
                    port=hive_port,
                    username=username)

# Cursor used for statements that return no result set (DDL).
cur = conn.cursor()

In [8]:
# (Re)create the external `routes` table in the user's own database.
# The table is dropped first so this cell can be re-run safely; the data
# itself lives in the ORC files under /data/sbb/part_orc/routes.
#
# Columns (all declared as string):
#   ROUTE_ID          identifier of the route (primary key)
#   AGENCY_ID         identifier of the operator (foreign key)
#   ROUTE_SHORT_NAME  short name of the route, usually a line number
#   ROUTE_LONG_NAME   (empty)
#   ROUTE_DESC        transport mode label, e.g. Bus, Zug, Tram
#                     (NOTE(review): earlier note read "Zub" - presumably "Zug", confirm)
#   ROUTE_TYPE        route-type code, e.g. 700 in the sampled rows
#                     (NOTE(review): meaning of the codes is not documented here)
drop_routes_sql = """
    drop table if exists {0}.routes
"""

create_routes_sql = """
    create external table {0}.routes(
        ROUTE_ID string,
        AGENCY_ID string,
        ROUTE_SHORT_NAME string,
        ROUTE_LONG_NAME string,
        ROUTE_DESC string,
        ROUTE_TYPE string
    )
    stored as orc
    location '/data/sbb/part_orc/routes'
"""

# Run the drop first, then the create; `query` ends up holding the last
# statement that was executed.
for ddl_template in (drop_routes_sql, create_routes_sql):
    query = ddl_template.format(username)
    cur.execute(query)

In [9]:
# Fetch a five-row sample of the routes table into a DataFrame.
query = """
    select * from {0}.routes limit 5
""".format(username)
df = pd.read_sql(sql=query, con=conn)

In [10]:
# Display the frame currently bound to `df` (the routes sample when cells run in order).
df

Unnamed: 0,routes.route_id,routes.agency_id,routes.route_short_name,routes.route_long_name,routes.route_desc,routes.route_type
0,11-61-j18-1,7031,61,,Bus,700
1,11-62-j18-1,7031,62,,Bus,700
2,24-64-j18-1,801,64,,Bus,700
3,24-65-j18-1,801,65,,Bus,700
4,24-66-j18-1,801,66,,Bus,700


In [12]:
# Count the rows of the routes table; the result is a one-cell DataFrame.
query = """
    select count(*) from {0}.routes
""".format(username)
df = pd.read_sql(sql=query, con=conn)

In [13]:
# Display the frame currently bound to `df` (the routes row count when cells run in order).
df

Unnamed: 0,_c0
0,784333


In [4]:
# (Re)create the external `trips` table in the user's own database.
# The table is dropped first so this cell can be re-run safely; the data
# itself lives in the ORC files under /data/sbb/part_orc/trips.
#
# Columns (all declared as string):
#   ROUTE_ID         identifier (foreign key) of the route. A route is a
#                    sequence of stops and is time independent.
#   SERVICE_ID       identifier (foreign key) of a group of trips in the
#                    calendar; also used to manage exceptions (e.g. holidays).
#   TRIP_ID          one instance (primary key) of a vehicle journey on a
#                    given route. The same route can have many trips at
#                    regular intervals, and a trip may skip some route stops.
#   TRIP_HEADSIGN    text displayed to passengers, most of the time the
#                    (short) name of the last stop.
#   TRIP_SHORT_NAME  internal identifier for the trip headsign
#                    (TRIP_HEADSIGN and TRIP_SHORT_NAME are only unique
#                    within an agency).
#   DIRECTION_ID     for a bidirectional route, the direction of the trip
#                    on that route.
drop_trips_sql = """
    drop table if exists {0}.trips
"""

create_trips_sql = """
    create external table {0}.trips(
        ROUTE_ID string,
        SERVICE_ID string,
        TRIP_ID string,
        TRIP_HEADSIGN string,
        TRIP_SHORT_NAME string,
        DIRECTION_ID string
    )
    stored as orc
    location '/data/sbb/part_orc/trips'
"""

# Run the drop first, then the create; `query` ends up holding the last
# statement that was executed.
for ddl_template in (drop_trips_sql, create_trips_sql):
    query = ddl_template.format(username)
    cur.execute(query)

In [5]:
# Count the rows of the trips table; the result is a one-cell DataFrame.
query = """
    select count(*) from {0}.trips
""".format(username)
df = pd.read_sql(sql=query, con=conn)

In [6]:
# Display the frame currently bound to `df` (the trips row count when cells run in order).
df

Unnamed: 0,_c0
0,153793913
