# PyGDF Demo on NY Taxi Data Subset

This notebook is borrowed from MapD (with the MapD portion removed).
It loads the taxi data into a GPU DataFrame and explores some stats.

In [None]:
# This automatically time every cell's execution
!pip install ipython-autotime
%load_ext autotime

In [None]:
#Imports
import math
from collections import OrderedDict
from math import floor

import numpy as np
import pandas as pd

#GPU
import pygdf
from numba import jit

## Load data from CSV

In [None]:
# Load the taxi data from CSV into a pandas DataFrame.
# NOTE(review): parse_dates=True only asks pandas to try parsing the index as
# dates; no index column is set here, so timestamp columns are presumably left
# unparsed — confirm this is intended.
raw_data = pd.read_csv("./data/nytaxi_200k.csv", parse_dates=True)

In [None]:
# Create a Python GPU DataFrame from the pandas DataFrame read from the CSV
df = pygdf.DataFrame.from_pandas(raw_data)

In [None]:
# Show the type of the object returned by from_pandas
type(df)

In [None]:
# Report the number of rows in the GPU DataFrame
print('nrows', len(df))

Inspect column types

In [None]:
# Data type of each column in the GPU DataFrame
df.dtypes

In [None]:
# Preview the first rows, converted to a pandas DataFrame for display
df.head().to_pandas()

## Groupby lat lon grid

We want to group the records by pickup location. To do so, we round the latitude and longitude with the ``round_latlon`` function. By applying it with ``.applymap``, the rounding function will be compiled into GPU code.

In [None]:
def round_latlon(x):
    """Round a coordinate down to a multiple of 1/5.

    The value is multiplied by 5, floored (toward negative infinity), and
    divided by 5 again, so nearby coordinates collapse onto the same value.
    """
    cells_per_unit = 5
    cell_index = floor(x * cells_per_unit)
    return cell_index / cells_per_unit

In [None]:
# Select just the columns needed for the per-location statistics
stat_columns = ['pickup_longitude', 'pickup_latitude', 'tip_amount', 'fare_amount']
group_df = df.loc[:, stat_columns]

# Round both pickup coordinates with round_latlon so records can be grouped by grid cell
for coord in ('pickup_longitude', 'pickup_latitude'):
    group_df[coord] = group_df[coord].applymap(round_latlon)

# Tip as a fraction of the fare
group_df['tip_ratio'] = group_df['tip_amount'] / group_df['fare_amount']


In [None]:
# Column types after rounding the coordinates and adding tip_ratio
group_df.dtypes

In [None]:
# Preview the frame sorted by tip_amount, largest first
group_df.sort_values('tip_amount',ascending=False).head().to_pandas()

Here, we run groupby and specify the aggregating methods on each column.

In [None]:
# Aggregating methods to apply to each column, kept in insertion order.
# (OrderedDict is imported in the notebook's import cell.)
aggs = OrderedDict([
    ('tip_amount', 'mean'),
    ('fare_amount', ['mean', 'std', 'count']),
    ('tip_ratio', 'mean'),
])

# Group the records by their rounded pickup coordinates and aggregate
grouped_stats = group_df.groupby(['pickup_longitude', 'pickup_latitude']).agg(aggs)
print('total groups', len(grouped_stats))
grouped_stats.head().to_pandas()

Reorder the grouped dataframe by `fare_amount_count`

In [None]:
# Preview the groups sorted by fare_amount_count, largest first
grouped_stats.sort_values('fare_amount_count', ascending=False).head().to_pandas()

## Groupby payment type

We can also group by categorical columns.

In [None]:
# Select the payment type together with the tip and fare amounts
group_pay = df.loc[:, ['payment_type', 'tip_amount', 'fare_amount']]
# Derive tip_ratio from this frame's own columns so the cell does not depend
# on group_df from the lat/lon section having been built first.
group_pay['tip_ratio'] = group_pay['tip_amount'] / group_pay['fare_amount']

# NOTE(review): the groupby below was left disabled in the original notebook —
# presumably it was not supported for this column at the time; confirm before
# re-enabling.
#groupby_payment = group_pay.groupby(['payment_type']).mean()
#groupby_payment.sort_values('tip_ratio', ascending=False).to_pandas()

In [None]:
# Preview the first rows, converted to pandas for display like the other
# preview cells in this notebook
group_pay.head().to_pandas()