## BlazingSQL + cuDF

Built on the Apache Arrow columnar memory format, cuDF is a GPU DataFrame library for loading, joining, aggregating, filtering, and otherwise manipulating data.

cuDF provides a pandas-like API that will be familiar to data engineers & data scientists, so they can use it to easily accelerate their workflows without going into the details of CUDA programming.
    
[GitHub](https://github.com/rapidsai/cudf) | [Welcome Notebook](../welcome.ipynb#cuDF---GPU-DataFrame-Library)

In [1]:
from blazingsql import BlazingContext

# Create the BlazingContext that launches this notebook's BlazingSQL (BSQL) session;
# `bc` is the handle the cells below use to create tables and run SQL queries.
bc = BlazingContext()

BlazingContext ready


In [2]:
import os

# Build the data directory path: keep the part of the current working directory
# that precedes "/intro_notebooks" (the whole cwd if that segment is absent) and append "/data".
data_dir = f'{os.getcwd().split("/intro_notebooks")[0]}/data'

# Path of the sample CSV inside the data directory.
taxi_csv = f'{data_dir}/sample_taxi.csv'

# Register the file as the BlazingSQL table 'taxi' with .create_table(table_name, file_path).
bc.create_table('taxi', taxi_csv, header=0)

<pyblazing.apiv2.context.BlazingTable at 0x7fcbcef9d510>

In [3]:
# SQL to run against the 'taxi' table: every column, rides with trip_distance below 10 only
query = 'select * from taxi where trip_distance < 10'

# run the query and keep the resulting DataFrame in `gdf` for the cells that follow
gdf = bc.sql(query)

In [4]:
# the query result supports DataFrame methods such as .head() or .tail();
# here .tail(2) displays the last 2 rows
gdf.tail(2)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,pickup_x,pickup_y,dropoff_x,dropoff_y
731204,1,2015-01-28 11:11:43,2015-01-28 11:35:15,1,9.1,1,N,1,28.5,0.0,0.5,6.92,5.33,0.3,41.55,-8223703.602,66.740356,-8233243.793,66.732021
731205,1,2015-01-28 11:11:44,2015-01-28 11:21:56,1,1.5,1,N,2,8.5,0.0,0.5,0.0,0.0,0.3,9.3,-8232741.006,66.735944,-8234567.002,66.711625


In [5]:
# summarize the taxi rides under 10 miles: .describe() reports count, mean, std,
# min, quartiles (25% / 50% / 75%) and max for each numeric column
gdf.describe()

Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,pickup_x,pickup_y,dropoff_x,dropoff_y
count,731206.0,731206.0,731206.0,731206.0,731206.0,731206.0,731206.0,731206.0,731206.0,731206.0,731206.0,731206.0,731206.0,731206.0,731206.0,731206.0
mean,1.525141,1.683695,2.110928,1.008933,1.388211,10.023943,0.315833,0.499003,1.300654,0.08512,0.283192,12.523875,-8235333.0,66.69961,-8235131.0,66.70019
std,0.499368,1.342609,1.735554,0.509256,0.497665,5.848819,0.366803,0.025225,1.621419,0.714763,0.068992,7.037828,2397.015,0.043212,2541.876,0.048194
min,1.0,0.0,0.0,1.0,1.0,-52.0,-1.0,-0.5,-81.0,-5.33,0.0,-116.22,-8254080.0,66.555036,-8254078.0,66.555036
25%,1.0,1.0,0.97,1.0,1.0,6.0,0.0,0.5,0.0,0.0,0.3,8.0,-8236762.0,66.670578,-8236706.0,66.669526
50%,2.0,1.0,1.57,1.0,1.0,8.5,0.0,0.5,1.0,0.0,0.3,10.8,-8235663.0,66.702066,-8235514.0,66.70275
75%,2.0,2.0,2.62,1.0,2.0,12.0,0.5,0.5,2.0,0.0,0.3,14.8,-8234246.0,66.728564,-8233885.0,66.730946
max,2.0,9.0,9.99,99.0,4.0,900.0,7.0,0.5,200.0,117.82,0.3,900.3,-8210362.0,66.870206,-8210008.0,66.870153


In [6]:
# or convert the cuDF DataFrame with .to_pandas() to use pandas methods like .sample();
# no random_state is passed, so the 3 sampled rows differ from run to run
gdf.to_pandas().sample(3)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,pickup_x,pickup_y,dropoff_x,dropoff_y
42801,1,2015-01-25 19:06:31,2015-01-25 19:09:10,1,0.7,1,N,2,4.5,0.0,0.5,0.0,0.0,0.3,5.3,-8232352.876,66.758359,-8233057.796,66.741597
169056,1,2015-01-03 18:40:54,2015-01-03 18:46:40,1,0.5,1,N,2,5.5,0.0,0.5,0.0,0.0,0.0,6.3,-8234464.237,66.685867,-8235027.323,66.696994
652904,2,2015-01-05 07:03:57,2015-01-05 07:09:07,1,0.87,1,N,1,5.5,0.0,0.5,1.1,0.0,0.3,7.4,-8235545.396,66.67604,-8235216.717,66.695078


In [7]:
# select a subset of columns by indexing with a list of column names
gdf[['passenger_count', 'total_amount']]

Unnamed: 0,passenger_count,total_amount
0,1,17.05
1,1,17.80
2,1,10.80
3,1,4.80
4,1,16.30
...,...,...
731201,1,14.80
731202,1,12.80
731203,1,8.50
731204,1,41.55


In [8]:
# filter rows with .loc and a boolean mask built from two column comparisons;
# the trip_distance condition is already guaranteed by the SQL query that produced gdf
has_multiple_passengers = gdf['passenger_count'] > 1
is_under_10_miles = gdf['trip_distance'] < 10
gdf.loc[has_multiple_passengers & is_under_10_miles]

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,pickup_x,pickup_y,dropoff_x,dropoff_y
7,1,2015-01-10 20:33:39,2015-01-10 20:42:20,3,0.80,1,N,1,7.0,0.5,0.5,1.66,0.00,0.3,9.96,-8237938.725,66.661609,-8237086.877,66.646218
8,1,2015-01-10 20:33:40,2015-01-10 20:40:44,2,0.90,1,N,1,6.5,0.5,0.5,1.55,0.00,0.3,9.35,-8236037.990,66.728196,-8236074.511,66.711286
15,2,2015-01-15 19:05:40,2015-01-15 19:21:00,5,2.83,1,N,2,12.5,1.0,0.5,0.00,0.00,0.3,14.30,-8234158.488,66.701202,-8232646.734,66.765456
16,2,2015-01-15 19:05:40,2015-01-15 19:28:18,5,8.33,1,N,1,26.0,1.0,0.5,8.08,5.33,0.3,41.21,-8222398.228,66.731412,-8232378.355,66.763336
18,2,2015-01-15 19:05:41,2015-01-15 19:20:22,2,7.13,1,N,1,21.5,1.0,0.5,4.50,0.00,0.3,27.80,-8223667.082,66.740138,-8232340.986,66.630986
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
731180,2,2015-01-26 08:56:35,2015-01-26 09:10:50,5,1.21,1,N,1,10.0,0.0,0.5,2.00,0.00,0.3,12.80,-8235459.617,66.712511,-8234224.735,66.719906
731183,2,2015-01-15 22:26:52,2015-01-15 22:40:52,5,3.92,1,N,1,14.5,0.5,0.5,3.75,0.00,0.3,19.55,-8236620.611,66.642372,-8235840.952,66.564779
731186,1,2015-01-07 23:07:01,2015-01-07 23:13:12,4,1.00,1,N,1,6.0,0.5,0.5,1.45,0.00,0.3,8.75,-8238085.653,66.656741,-8236739.512,66.648231
731193,1,2015-01-28 11:11:40,2015-01-28 11:19:31,2,1.10,1,N,1,7.0,0.0,0.5,1.56,0.00,0.3,9.36,-8235440.083,66.683297,-8236826.141,66.664516
