# Pandas and Arkouda

Read a CSV file into pandas, take a look at it, and then put some of the DataFrame columns into Arkouda.

New York City Taxi Data
----------------------------------
Yellow Trips Data Dictionary https://www1.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf

NYC Yellow Taxi Trip Records Jan 2020 https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-01.csv

Green Trips Data Dictionary https://www1.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_green.pdf

NYC Green Taxi Trip Records Jan 2020 https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_2020-01.csv

In [None]:
import arkouda as ak
# connect this client session to the arkouda server at tcp://localhost:5555
# NOTE(review): assumes a server is already listening on that port — confirm before running
ak.connect(connect_url="tcp://localhost:5555")

In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt

## Yellow taxi trip data

In [None]:
# columns that hold timestamps; read_csv is asked to parse them as dates
parse_dates_lst = [
    'tpep_pickup_datetime',
    'tpep_dropoff_datetime',
]
# load the January 2020 yellow taxi trip records; row 0 of the file is the header
# NOTE(review): infer_datetime_format is deprecated as of pandas 2.0 — confirm the
# pandas version in use before dropping it
ydf = pd.read_csv(
    "../Downloads/yellow_tripdata_2020-01.csv",
    header=0,
    low_memory=False,
    parse_dates=parse_dates_lst,
    infer_datetime_format=True,
)

In [None]:
# display the yellow taxi data frame
ydf

In [None]:
# list the column names of the yellow taxi data frame
ydf.keys()

In [None]:
# look at the pickup time column (one of the columns passed to parse_dates)
ydf['tpep_pickup_datetime']

In [None]:
# look at the dropoff time column (one of the columns passed to parse_dates)
ydf['tpep_dropoff_datetime']

In [None]:
# take delta for ride duration (dropoff time minus pickup time)
ride_duration = ydf['tpep_dropoff_datetime'] - ydf['tpep_pickup_datetime']
# convert the ride duration to minutes
# use total_seconds(): .dt.seconds is only the seconds *component* (0..86399) of
# each timedelta, so it drops whole days from rides longer than 24 hours and
# wraps negative durations around to just under one day
ride_duration = ride_duration.dt.total_seconds() / 60 # in minutes
print("min = ", ride_duration.min(),"max = ", ride_duration.max())
print("mean = ",ride_duration.mean(),"stdev = ",ride_duration.std(),"median =",ride_duration.median())
# how long was the maximum ride to the next integer minute
max_ride = math.ceil(ride_duration.max())
print("max_ride = ", max_ride)

# aim for bins about one minute wide across the whole observed range
# the span (max - min) is used rather than max alone because the minimum can be
# negative when a dropoff time precedes its pickup time; when the minimum is 0
# this equals max_ride; always keep at least one bin so np.histogram accepts it
nBins = max(1, math.ceil(ride_duration.max() - ride_duration.min()))

# drop missing durations (NaT timestamps) so np.histogram can auto-detect a finite range
cnts,bin_edges = np.histogram(ride_duration.dropna(), bins=nBins)
print(cnts.size,     "cnts      = ", cnts)
print(bin_edges.size,"bin edges = ", bin_edges)

In [None]:
# plot a histogram of the ride durations using nBins bins
plt.hist(ride_duration,bins=nBins)
# log scale on the count axis so sparsely populated bins are still visible
plt.yscale('log')
# keep the duration axis linear
plt.xscale('linear')
plt.show()

In [None]:
# summary statistics for the trip distance column
trip_dist = ydf['trip_distance']
print(trip_dist.min(), trip_dist.max())
print(trip_dist.mean(), trip_dist.std(), trip_dist.median())

# histogram of trip distances with 2000 bins, drawn on log-log axes
# NOTE(review): the bins are equal-width while the x axis is logarithmic — confirm that is intended
plt.figure(figsize=(8, 6))
plt.hist(trip_dist, bins=2000)
plt.xscale('log')
plt.yscale('log')
plt.show()

In [None]:
# the data in a pandas series is held in the values field
# the astype() method converts to the desired data type, in this case np.int64
# (presumably nanoseconds since the epoch for a datetime64[ns] column — confirm)
ydf['tpep_pickup_datetime'].astype(np.int64).values

In [None]:
# the index (row labels) for the series is held in the index field
ydf['tpep_pickup_datetime'].index

In [None]:
# element-wise comparison: a boolean series that is True where the flag equals 'Y'
ydf['store_and_fwd_flag'] == 'Y'

In [None]:
# copy the data frame columns over to the arkouda server, one column at a time,
# converting each column into a data type the server can understand first
akdf = {}
for cname in ydf:
    column = ydf[cname]
    col_dtype = column.dtype
    print(cname, " : ", col_dtype)
    if col_dtype in [np.int64, np.float64]:
        # int64 and float64 need no conversion
        converted = column.values
    elif col_dtype in ["datetime64[ns]"]:
        # timestamps are sent as int64
        converted = column.astype(np.int64).values
    elif cname == 'store_and_fwd_flag':
        # valid values are ['Y', 'N', NaN]; send as bool, True only where the flag is 'Y'
        converted = column.values == 'Y'
    else:
        # no known conversion to a server data type, so the column is left out of akdf
        print("don't know how to convert ", col_dtype, " !!!")
        continue
    akdf[cname] = ak.array(converted)

In [None]:
# print out the arkouda server symbol table
# (ak.AllSymbols asks for information on every symbol rather than one named array)
print(ak.info(ak.AllSymbols))

In [None]:
# which keys (column names) made it over to the server?
akdf.keys()

In [None]:
# how many records made it to the server? (length of the pickup time array)
akdf['tpep_pickup_datetime'].size

In [None]:
# use the negated store_and_fwd_flag column as a boolean mask to index tpep_pickup_datetime
# and see how many records had the flag false
# NOTE(review): if the flag array was built as (values == 'Y'), NaN flags count as false here — confirm
akdf['tpep_pickup_datetime'][~akdf['store_and_fwd_flag']].size

In [None]:
# use the store_and_fwd_flag column as a boolean mask to index tpep_pickup_datetime
# and see how many records had the flag true
akdf['tpep_pickup_datetime'][akdf['store_and_fwd_flag']].size

## Green taxi trip data

In [None]:
# columns that hold timestamps; read_csv is asked to parse them as dates
parse_dates_lst = [
    'lpep_pickup_datetime',
    'lpep_dropoff_datetime',
]
# load the January 2020 green taxi trip records; row 0 of the file is the header
# NOTE(review): infer_datetime_format is deprecated as of pandas 2.0 — confirm the
# pandas version in use before dropping it
gdf = pd.read_csv(
    "../Downloads/green_tripdata_2020-01.csv",
    header=0,
    low_memory=False,
    parse_dates=parse_dates_lst,
    infer_datetime_format=True,
)

In [None]:
# display the green taxi data frame
gdf

In [None]:
# list the column names of the green taxi data frame
gdf.keys()

In [None]:
# look at the pickup time column (one of the columns passed to parse_dates)
gdf['lpep_pickup_datetime']

In [None]:
# look at the dropoff time column (one of the columns passed to parse_dates)
gdf['lpep_dropoff_datetime']