# Pandas and Arkouda

Read CSV files into pandas, inspect the data, and then transfer selected data frame columns to the Arkouda server.

New York City Taxi Data
----------------------------------
Yellow Trips Data Dictionary https://www1.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf

NYC Yellow Taxi Trip Records Jan 2020 https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-01.csv

Green Trips Data Dictionary https://www1.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_green.pdf

NYC Green Taxi Trip Records Jan 2020 https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_2020-01.csv

NYC Taxi Zone Lookup Table https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv

NYC Taxi Zone Shapefile https://s3.amazonaws.com/nyc-tlc/misc/taxi_zones.zip

In [None]:
import arkouda as ak
# connect this client session to the arkouda server at the given URL
# NOTE(review): assumes a server is already listening on localhost:5555 — confirm
ak.connect(connect_url="tcp://localhost:5555")

In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt

In [None]:
# conversion from csv field to int64
# try to convert to int, on exception (empty or other string) convert to 0
def cvt_to_int64(v):
    """Convert a CSV field to ``np.int64``, falling back to 0.

    Parameters
    ----------
    v : object
        Raw field value, e.g. the string handed to a ``read_csv`` converter.

    Returns
    -------
    np.int64
        ``np.int64(v)`` when the conversion succeeds, otherwise
        ``np.int64(0)`` (for instance on an empty or non-numeric string).
    """
    try:
        return np.int64(v)
    except (ValueError, TypeError, OverflowError):
        # Only genuine conversion failures map to 0; a bare ``except`` would
        # also have swallowed KeyboardInterrupt and SystemExit.
        return np.int64(0)

In [None]:
# conversion from csv field to string
# try to convert to str, on empty string or exception convert to 'N/A'
def cvt_to_string(v):
    """Convert a CSV field to ``str``, using 'N/A' for missing values.

    Parameters
    ----------
    v : object
        Raw field value, e.g. the string handed to a ``read_csv`` converter.

    Returns
    -------
    str
        ``'N/A'`` when ``v`` is the empty string or cannot be converted,
        otherwise ``str(v)``.
    """
    try:
        return 'N/A' if v == '' else str(v)
    except Exception:
        # Best-effort conversion: any ordinary failure in the comparison or in
        # str() yields the placeholder.  ``Exception`` (not a bare ``except``)
        # lets KeyboardInterrupt and SystemExit propagate.
        return 'N/A'

In [None]:
# conversion from csv field (Y,N,empty) to bool
# on Y convert to True, on N or empty convert to False
def cvt_YN_to_bool(v):
    """Map a Y/N CSV flag to a bool.

    Returns True only when ``v`` equals 'Y'; any other value
    (including 'N' and the empty string) gives False.
    """
    return bool(v == 'Y')

In [None]:
# check all objects in iterable for instance of str
def is_all_str(a):
    """Return True when every object in iterable ``a`` is a ``str``.

    Stops at the first non-str element; an empty iterable gives True.
    """
    return all(isinstance(item, str) for item in a)

# put data frame columns into arkouda server and return a dict of the pdarrays
# convert some columns into data types the server can understand
def ak_create_akdict_from_df(df):
    """Send the columns of a pandas data frame to the arkouda server.

    Each column is handled according to the dtype of its ``.values`` array:

    * int64, float64 and bool columns are passed to ``ak.array`` unchanged;
    * ``datetime64[ns]`` columns are cast to int64 first;
    * columns whose elements are all ``str`` are sent as a list of strings;
    * anything else is reported and left out of the result.

    A one-line summary of every conversion is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Data frame whose columns should be transferred.

    Returns
    -------
    dict
        Maps column name to the object returned by ``ak.array`` for every
        column that could be converted.
    """
    akdict = {}
    for cname in df.keys():
        a = df[cname].values

        # int64, float64, and bool should go over fine
        # (np.bool_ is used because the np.bool alias was removed in NumPy 1.24)
        if a.dtype in (np.int64, np.float64, np.bool_):
            akdict[cname] = ak.array(a)
            # report the server-side dtype, as the datetime branch does
            print(cname, " : ", a.dtype, "->", akdict[cname].dtype)
        # time needs to be converted to int64
        elif a.dtype == "datetime64[ns]":
            akdict[cname] = ak.array(a.astype(np.int64))
            print(cname, " : ", a.dtype, "->", akdict[cname].dtype)
        # string data?
        elif is_all_str(a):
            akdict[cname] = ak.array(list(a))
            print(cname, " : ", a.dtype, "->", 'ak.Strings')
        # something I don't understand how to convert to a server data type
        else:
            print("don't know how to convert ", a.dtype.kind, " !!!")
    return akdict

## Yellow taxi trip data

In [None]:
# Read in yellow taxi data
# per the yellow data dictionary, convert columns to data types Arkouda can handle:
# int64, float64, bool
# cvt maps a column name to the converter applied to each raw field of that column
cvt = {'VendorID': cvt_to_int64, 'passenger_count': cvt_to_int64, 'RatecodeID': cvt_to_int64,
       'store_and_fwd_flag': cvt_YN_to_bool,
       'PULocationID': cvt_to_int64, 'DOLocationID':cvt_to_int64, 'payment_type': cvt_to_int64}
# explicitly parse the pickup and dropoff date-time fields
parse_dates_lst = ['tpep_pickup_datetime','tpep_dropoff_datetime']
# call read_csv to parse data with these options; header=0 takes column names from the first line
# NOTE(review): infer_datetime_format is deprecated as of pandas 2.0 — confirm the target pandas version
ydf = pd.read_csv("../Downloads/yellow_tripdata_2020-01.csv",
                  converters=cvt, header=0, low_memory=False,
                  parse_dates=parse_dates_lst, infer_datetime_format=True)

In [None]:
# display the yellow taxi data frame as the cell result
ydf

In [None]:
# see which column names were read in from the first line of the csv data file
#print(ydf.keys())
print(ydf.columns)

In [None]:
# read the taxi-zone-lookup-table
# the three text columns go through cvt_to_string, so empty fields become 'N/A'
cvt = {'Borough':cvt_to_string, 'Zone':cvt_to_string, 'service_zone':cvt_to_string}
tzlut = pd.read_csv("../Downloads/taxi+_zone_lookup.csv",converters=cvt)
# print out the tzlut which was read from file
print(tzlut)
# show the numpy dtype of one text column, then check that every text column holds only str objects
print(tzlut['Borough'].values.dtype)
print(is_all_str(tzlut['Borough'].values))
print(is_all_str(tzlut['Zone'].values))
print(is_all_str(tzlut['service_zone'].values))

# location id is 1-based, index is 0-based
# fix it up so that LocationID lines up with the row index of the data frame,
# which means prepending a placeholder row for LocationID 0
top_row = pd.DataFrame({'LocationID': [0], 'Borough': ['N/A'], 'Zone': ['N/A'], 'service_zone': ['N/A']})
# reset_index(drop=True) renumbers the rows from 0 after the concat
tzlut = pd.concat([top_row, tzlut]).reset_index(drop = True)
# print fixed up tzlut
print(tzlut)

In [None]:
# take delta for ride duration (dropoff time minus pickup time)
ride_duration = ydf['tpep_dropoff_datetime'] - ydf['tpep_pickup_datetime']
# convert the full ride duration to minutes
# total_seconds() is required here: .dt.seconds is only the seconds component
# of the timedelta (0..86399), so it drops whole days and misreports
# multi-day and negative durations
ride_duration = ride_duration.dt.total_seconds() / 60 # in minutes
print("min = ", ride_duration.min(),"max = ", ride_duration.max())
print("mean = ",ride_duration.mean(),"stdev = ",ride_duration.std(),"median =",ride_duration.median())
# how long was the maximum ride to the next integer minute
max_ride = math.ceil(ride_duration.max())
print("max_ride = ", max_ride)

# histogram the ride time, roughly one bin per minute
# (np.histogram spreads the bins evenly between the data min and max);
# keep at least one bin so a non-positive maximum cannot produce an invalid bin count
nBins = max(max_ride, 1)
cnts,bin_edges = np.histogram(ride_duration, bins=nBins)
print(cnts.size,     "cnts      = ", cnts)
print(bin_edges.size,"bin edges = ", bin_edges)

In [None]:
# plot the histogram of the ride time, using the nBins bin count computed earlier
plt.hist(ride_duration,bins=nBins)
# log-scale the counts axis; keep the duration axis linear
plt.yscale('log')
plt.xscale('linear')
plt.show()

In [None]:
# summary statistics of the trip distance: min/max, then mean/stdev/median
print(ydf['trip_distance'].min(), ydf['trip_distance'].max())
print(ydf['trip_distance'].mean(), ydf['trip_distance'].std(), ydf['trip_distance'].median())

# histogram the trip distance into 2000 bins and plot it on log-log axes
plt.figure(figsize=(8,6))
plt.hist(ydf['trip_distance'],bins=2000)
#ax = plt.gca()
#ax.set_xlim((ydf['trip_distance'].min(),ydf['trip_distance'].max()))
plt.yscale('log')
plt.xscale('log')
plt.show()

In [None]:
# data in a pandas series is held in the values field
# the astype() method converts to the desired data type, in this case np.int64
ydf['tpep_pickup_datetime'].astype(np.int64).values

In [None]:
# the index for the series is held in the index field; display it as the cell result
ydf['tpep_pickup_datetime'].index

In [None]:
# put the yellow taxi data frame columns into the arkouda server
# convert some columns into data types the server can understand
# akdict maps each converted column name to its server-side array
akdict = ak_create_akdict_from_df(ydf)

In [None]:
# send the taxi zone lookup table, which has both string and int64 columns, to the server
aktzlut = ak_create_akdict_from_df(tzlut)

In [None]:
# print the dict of server-side arrays created from the lookup table
print(aktzlut)

In [None]:
# print out the arkouda server symbol table (information on all symbols)
print(ak.info(ak.AllSymbols))

In [None]:
# which column names made it over to the server
akdict.keys()

In [None]:
# how many records made it to the server?
numTotal = akdict['tpep_pickup_datetime'].size

# use the store_and_fwd_flag column as a boolean index into tpep_pickup_datetime
# and count the records where the flag is False
numFalse = akdict['tpep_pickup_datetime'][~akdict['store_and_fwd_flag']].size

# same boolean indexing, this time counting the records where the flag is True
numTrue = akdict['tpep_pickup_datetime'][akdict['store_and_fwd_flag']].size

# sanity check: the False and True counts should add up to the total
numTotal == numFalse+numTrue

## Green taxi trip data
This is not fleshed out yet

In [None]:
# read in green taxi data, explicitly parsing the pickup and dropoff date-time fields
# (no per-column converters are applied here, unlike the yellow taxi read)
parse_dates_lst = ['lpep_pickup_datetime','lpep_dropoff_datetime']
# NOTE(review): infer_datetime_format is deprecated as of pandas 2.0 — confirm the target pandas version
gdf = pd.read_csv("../Downloads/green_tripdata_2020-01.csv",header=0,low_memory=False,
                  parse_dates=parse_dates_lst,infer_datetime_format=True)

In [None]:
# display the green taxi data frame as the cell result
gdf

In [None]:
# see which column names were read in from the green taxi csv file
gdf.keys()

In [None]:
# display the parsed green taxi pickup date-time column
gdf['lpep_pickup_datetime']

In [None]:
# display the parsed green taxi dropoff date-time column
gdf['lpep_dropoff_datetime']

In [None]:
# disconnect or shutdown the server
#ak.disconnect()
#ak.shutdown()