In [10]:
import bz2
import csv
import json
from collections import namedtuple
from datetime import datetime
from pprint import pprint

In [2]:
# One CSV-to-record mapping rule: the source column name, the destination key,
# and the callable that converts the raw CSV text into a Python value.
Column = namedtuple('Column', ['src', 'dest', 'convert'])

In [3]:
def parse_timestamp(text):
    """Parse a ``YYYY-MM-DD HH:MM:SS`` string into a naive ``datetime``.

    Raises ``ValueError`` (from ``datetime.strptime``) when ``text`` does not
    match that layout.
    """
    layout = '%Y-%m-%d %H:%M:%S'
    return datetime.strptime(text, layout)

In [4]:
# Mapping rules applied to every CSV row: which source column to read, the key
# it is stored under in the output record, and how its text is converted.
columns = [
    Column(src='VendorID', dest='vendor_id', convert=int),
    Column(src='passenger_count', dest='num_passengers', convert=int),
    Column(src='tip_amount', dest='tip', convert=float),
    Column(src='total_amount', dest='price', convert=float),
    Column(src='tpep_dropoff_datetime', dest='dropoff_time', convert=parse_timestamp),
    Column(src='tpep_pickup_datetime', dest='pickup_time', convert=parse_timestamp),
    Column(src='trip_distance', dest='distance', convert=float),
]

In [5]:
def iter_records(file_name, schema=None):
    """Lazily yield one converted record per row of a bz2-compressed CSV file.

    Parameters
    ----------
    file_name
        Path of the bz2-compressed CSV file. Its first row is used as the
        header (``csv.DictReader`` default).
    schema
        Optional iterable of ``Column``-like objects exposing ``src``,
        ``dest`` and ``convert``. When omitted, the module-level ``columns``
        list is used, which is the original behaviour.

    Yields
    ------
    dict
        Maps each column's ``dest`` to ``convert(<raw CSV text of src>)``.

    Raises
    ------
    KeyError
        If a ``src`` column is missing from the CSV header.

    Any exception raised by a ``convert`` callable propagates to the caller.
    """
    # Snapshot the schema so a one-shot iterable still works for every row.
    schema = tuple(columns if schema is None else schema)
    # newline='' is what the csv module asks for: it disables newline
    # translation so line endings embedded in quoted fields stay intact.
    with bz2.open(file_name, 'rt', newline='') as fp:
        for csv_record in csv.DictReader(fp):
            yield {col.dest: col.convert(csv_record[col.src]) for col in schema}

In [8]:
def example():
    """Pretty-print the first record parsed from 'taxi.csv.bz2'."""
    numbered = enumerate(iter_records('taxi.csv.bz2'))
    for index, row in numbered:
        if index < 1:
            pprint(row)
        else:
            # Stop as soon as we are past the first record.
            break


example()

{'distance': 2.57,
 'dropoff_time': datetime.datetime(2018, 11, 1, 6, 43, 24),
 'num_passengers': 1,
 'pickup_time': datetime.datetime(2018, 10, 31, 7, 10, 55),
 'price': 20.54,
 'tip': 4.74,
 'vendor_id': 2}


In [9]:
def encode_time(obj):
    """``default`` hook for ``json.dumps``: encode ``datetime`` as ISO-8601 text.

    Parameters
    ----------
    obj
        A value the JSON encoder could not serialise on its own.

    Returns
    -------
    str
        ``obj.isoformat()`` when ``obj`` is a ``datetime``.

    Raises
    ------
    TypeError
        For any other type. A ``default`` hook must raise ``TypeError`` for
        values it cannot handle; returning the value unchanged makes the
        encoder call the hook again on the same object and fail with a
        misleading "Circular reference detected" ``ValueError``.
    """
    if isinstance(obj, datetime):
        return obj.isoformat()
    raise TypeError(
        f'Object of type {type(obj).__name__} is not JSON serializable'
    )

In [12]:
# Export every parsed record to 'taxi.jl' as JSON Lines (one JSON object per line).
with open('taxi.jl', 'w') as out:
    for record in iter_records('taxi.csv.bz2'):
        # datetime values are not JSON-native, so encode_time is supplied as
        # the fallback serialiser.
        data = json.dumps(record, default=encode_time)
        print(data, file=out)