# Assignment 3

Import libraries and define common helper functions

In [2]:
import os
import sys
import gzip
import json
from pathlib import Path
import csv

import pandas as pd
import s3fs
import pyarrow as pa
#  from pyarrow.json import read_json
import pyarrow.parquet as pq
import fastavro
import pygeohash
import snappy
import jsonschema
from jsonschema.exceptions import ValidationError


endpoint_url='https://storage.budsc.midwest-datascience.com'

current_dir = Path(os.getcwd()).absolute()
schema_dir = current_dir.joinpath('schemas')
results_dir = current_dir.joinpath('results')
results_dir.mkdir(parents=True, exist_ok=True)


def read_jsonl_data():
    # s3 = s3fs.S3FileSystem(
    #     anon=True,
    #     client_kwargs={
    #         'endpoint_url': endpoint_url
    #     }
    # )
    # src_data_path = 'data/processed/openflights/routes.jsonl.gz'
    # with s3.open(src_data_path, 'rb') as f_gz:
    #     with gzip.open(f_gz, 'rb') as f:
    #         records = [json.loads(line) for line in f.readlines()]

    src_data_path = '../../../data/processed/openflights/routes.jsonl.gz'
    with gzip.open(src_data_path, 'rb') as f:
        records = [json.loads(line) for line in f.readlines()]
        

    return records

Load the records from https://storage.budsc.midwest-datascience.com/data/processed/openflights/routes.jsonl.gz 

In [3]:
records = read_jsonl_data()

In [4]:
type(records)

list

## 3.1

### 3.1.a JSON Schema

In [18]:
def validate_jsonl_data(records):
    schema_path = schema_dir.joinpath('routes-schema.json')
    with open(schema_path) as f:
        # schema = jsonschema.Draft7Validator(records[0]).schema
        # f.write(json.dumps(schema, indent=4))
        schema = json.load(f)
    validation_csv_path = ('validation.md')
    with open(validation_csv_path, 'w') as f:    
        for i, record in enumerate(records):
            try:
                jsonschema.validate(record, schema)
            except ValidationError as e:
                f.write(f"Failed Entry: {i}\n")
            

validate_jsonl_data(records)

### 3.1.b Avro

In [None]:
def create_avro_dataset(records):
    schema_path = schema_dir.joinpath('routes.avsc')
    data_path = results_dir.joinpath('routes.avro')
    ## TODO: Use fastavro to create Avro dataset
    # schema = json.load(schema_path)
    parsed_schema = fastavro.schema.load_schema(schema_path)
    with open(data_path, 'wb') as out:
        fastavro.writer(out, parsed_schema, records)

    # used to test file output
    # with open(data_path, 'rb') as fo:
    #     avro_reader = fastavro.reader(fo)
    #     for record in avro_reader:
    #         print(record)
        
create_avro_dataset(records)

### 3.1.c Parquet

In [20]:
def create_parquet_dataset():
    src_data_path = '../../../data/processed/openflights/routes.jsonl.gz'
    parquet_output_path = results_dir.joinpath('routes.parquet')
    # s3 = s3fs.S3FileSystem(
    #     anon=True,
    #     client_kwargs={
    #         'endpoint_url': endpoint_url
    #     }
    # )
    
    with open(src_data_path, 'rb') as f_gz:
        with gzip.open(f_gz, 'rb') as f:
            # pass
            ## TODO: Use Apache Arrow to create Parquet table and save the dataset
            record_data = pa.array([json.loads(line) for line in f.readlines()])
    table = pa.Table.from_arrays([record_data], names=['Flight Info'])
    pq.write_table(table, parquet_output_path)

create_parquet_dataset()

### 3.1.d Protocol Buffers

In [45]:
sys.path.insert(0, os.path.abspath('routes_pb2'))

import routes_pb2

def _airport_to_proto_obj(airport):
    obj = routes_pb2.Airport()
    if airport is None:
        return None
    if airport.get('airport_id') is None:
        return None

    obj.airport_id = airport.get('airport_id')
    if airport.get('name'):
        obj.name = airport.get('name')
    if airport.get('city'):
        obj.city = airport.get('city')
    if airport.get('iata'):
        obj.iata = airport.get('iata')
    if airport.get('icao'):
        obj.icao = airport.get('icao')
    if airport.get('altitude'):
        obj.altitude = airport.get('altitude')
    if airport.get('timezone'):
        obj.timezone = airport.get('timezone')
    if airport.get('dst'):
        obj.dst = airport.get('dst')
    if airport.get('tz_id'):
        obj.tz_id = airport.get('tz_id')
    if airport.get('type'):
        obj.type = airport.get('type')
    if airport.get('source'):
        obj.source = airport.get('source')

    obj.latitude = airport.get('latitude')
    obj.longitude = airport.get('longitude')

    return obj


def _airline_to_proto_obj(airline):
    obj = routes_pb2.Airline()
    ## TODO: Create an Airline obj using Protocol Buffers API
    if airline is None:
        return None
    if airline.get('airline_id') is None:
        return None
    
    obj.airline_id = airline.get('airline_id')
    if airline.get('name'):
        obj.name = airline.get('name')
    if airline.get('city'):
        obj.city = airline.get('alias')
    if airline.get('iata'):
        obj.iata = airline.get('iata')
    if airline.get('icao'):
        obj.icao = airline.get('icao')
    if airline.get('altitude'):
        obj.altitude = airline.get('callsign')
    if airline.get('timezone'):
        obj.timezone = airline.get('country')
    if airline.get('dst'):
        obj.dst = airline.get('active')
    return obj


def create_protobuf_dataset(records):
    routes = routes_pb2.Routes()
    for record in records:
        route = routes_pb2.Route()
        ## TODO: Implement the code to create the Protocol Buffers Dataset
        route.airline.CopyFrom(_airline_to_proto_obj(record["airline"]))
        route.src_airport.CopyFrom(_airport_to_proto_obj(record["src_airport"]))
        route.dst_airport.CopyFrom(_airport_to_proto_obj(record["dst_airport"]))
        if record.get('codeshare'):
            route.codeshare = record["codeshare"]
        if record.get('stops'):
            route.stops = record["stops"]
        if record.get('equipment'):
            route.equipment.append("equipment")



        routes.route.append(route)

    data_path = results_dir.joinpath('routes.pb')

    with open(data_path, 'wb') as f:
        f.write(routes.SerializeToString())
        
    compressed_path = results_dir.joinpath('routes.pb.snappy')
    
    with open(compressed_path, 'wb') as f:
        f.write(snappy.compress(routes.SerializeToString()))
        
create_protobuf_dataset(records)

TypeError: Parameter to MergeFrom() must be instance of same class: expected routes_pb2.Airport got NoneType.

## 3.2

### 3.2.a Simple Geohash Index

In [52]:
def create_hash_dirs(records):
    geoindex_dir = results_dir.joinpath('geoindex')
    geoindex_dir.mkdir(exist_ok=True, parents=True)
    hashes = []
    ## TODO: Create hash index
    for record in records:
        
        print(record["src_airport"]["longitude"])
        print(record["dst_airport"]["longitude"])
       # hashes.append(records)
create_hash_dirs(records)

39.9566
49.278701782227
48.0063018799
49.278701782227
48.0063018799
43.08190155029297
61.5033
49.278701782227
61.5033
82.650703430176
37.90629959106445
49.278701782227
37.90629959106445
52.09249877929688
37.90629959106445


TypeError: 'NoneType' object is not subscriptable

### 3.2.b Simple Search Feature

In [None]:
def airport_search(latitude, longitude):
    ## TODO: Create simple search to return nearest airport
    pass
    
airport_search(41.1499988, -95.91779)