### Load data from XML

In [1]:
import bz2
import xml.etree.ElementTree as xml
import pandas as pd

In [2]:
# Converter applied to the text of each child element of a ride, keyed by tag.
# Exposed as a list of (tag, callable) pairs, in the order the pairs are
# consumed when building a record.
conversion = list({
    'vendor': int,
    'people': int,
    'tip': float,
    'price': float,
    'pickup': pd.to_datetime,
    'dropoff': pd.to_datetime,
    'distance': float,
}.items())

In [44]:
def iter_rides(file_name, conversions=None):
    """Stream ride records out of a bz2-compressed XML file.

    Every direct child of the document's root element is treated as one
    ride.  For each ride, the text of the child element named by each tag
    in the conversion table is passed through the matching converter.

    The document is parsed incrementally and finished rides are released
    as soon as they have been converted, so memory use does not grow with
    the size of the file.

    Parameters
    ----------
    file_name : str or path-like
        Path of the bz2-compressed XML file.
    conversions : iterable of (tag, callable) pairs, optional
        Tags to extract and the callable used to convert each tag's text.
        Defaults to the module-level ``conversion`` table.

    Yields
    ------
    dict
        One ``{tag: converted_value}`` mapping per ride, in document order.

    Raises
    ------
    ValueError
        If a ride has no child element for one of the requested tags.
    xml.etree.ElementTree.ParseError
        If the document is not well-formed XML (raised when the parser
        reaches the offending part of the file).
    """
    # A caller-supplied iterable is materialised because it is walked once
    # per ride; the default table is looked up only when none is given.
    fields = conversion if conversions is None else list(conversions)

    # Binary mode lets the XML parser honour the encoding declared in the
    # document instead of decoding with the platform's default encoding.
    with bz2.open(file_name, 'rb') as fp:
        root = None
        depth = 0
        for event, elem in xml.iterparse(fp, events=('start', 'end')):
            if event == 'start':
                depth += 1
                if root is None:
                    root = elem
                continue

            depth -= 1
            if depth != 1:
                # Only elements closing directly under the root are rides;
                # skip their descendants and the root itself.
                continue

            record = {}
            for tag, func in fields:
                node = elem.find(tag)
                if node is None:
                    raise ValueError(
                        '<{}> element has no <{}> child'.format(elem.tag, tag)
                    )
                record[tag] = func(node.text)
            yield record

            # Drop rides that were already handed out so the partially
            # built tree does not accumulate the whole document.
            root.clear()

In [45]:
def load_xml(file_name):
    """Build a DataFrame from the ride records found in ``file_name``.

    The records produced by ``iter_rides(file_name)`` are passed directly to
    ``pd.DataFrame.from_records``.
    """
    return pd.DataFrame.from_records(iter_rides(file_name))

In [48]:
# Example: load the sample file and print the dtype of each resulting column.
# `df` is bound at notebook level here and is used again by the following cell.
if __name__ == '__main__':
    df = load_xml('taxi.xml.bz2')
    print(df.dtypes)

vendor               int64
people               int64
tip                float64
price              float64
pickup      datetime64[ns]
dropoff     datetime64[ns]
distance           float64
dtype: object


In [49]:
# Preview the first rows of the frame loaded in the example cell above.
df.head()

Unnamed: 0,vendor,people,tip,price,pickup,dropoff,distance
0,2,1,4.74,20.54,2018-10-31 07:10:55,2018-11-01 06:43:24,2.57
1,2,5,0.0,13.8,2018-10-31 16:38:25,2018-10-31 16:50:10,3.58
2,2,1,1.0,11.3,2018-10-31 20:23:41,2018-10-31 20:31:47,2.39
3,2,1,0.0,5.8,2018-10-31 22:44:24,2018-10-31 22:48:28,0.5
4,2,1,2.26,13.56,2018-10-31 23:22:18,2018-10-31 23:35:30,1.81
