# BlazingSQL + cuML NYC Taxi Cab Fare Prediction

This demo uses [pubically availible NYC Taxi Cab Data](https://www.kaggle.com/c/new-york-city-taxi-fare-prediction) in order to predict the fare amount (inclusive of tolls) for a taxi ride in New York City given the pickup and dropoff locations. A linear regression model is used from the cuML library in [RAPIDS AI.](https://rapids.ai/) 

# Confirm T4 GPU 

In [0]:
!nvidia-smi

Thu Jul 25 19:48:53 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 410.79       CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   56C    P8    17W /  70W |      0MiB / 15079MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

# Install BlazingSQL + cuDF

In [1]:
# !bash -c "$(wget -q https://s3.amazonaws.com/blazingsql-colab/install.sh -O -)"

In [3]:
# !blazingsql status

## Clear Runtime

Before you continue, please go to the 'Runtime' Menu above, and select 'Reset Runtime'. 

# Import Packages

## Warning - New Bug Using BlazingSQL in Google Colab.
*The cell below will error on first run, please Restart Runtime and run again. We are are working to fix this very soon.*


In [0]:
# # Set Environment Variables
# import sys, os
# os.environ["NUMBAPRO_NVVM"] = "/usr/local/cuda/nvvm/lib64/libnvvm.so"
# os.environ["NUMBAPRO_LIBDEVICE"] = "/usr/local/cuda/nvvm/libdevice/"

In [1]:
# Import RAPIDS AI stack
from blazingsql import BlazingContext
import cudf

bc = BlazingContext()

BlazingContext ready


# Get Data

In [2]:
# !wget https://blazingsql-colab.s3.amazonaws.com/taxi_data/taxi_00.csv
# !wget https://blazingsql-colab.s3.amazonaws.com/taxi_data/taxi_01.csv
# !wget https://blazingsql-colab.s3.amazonaws.com/taxi_data/taxi_02.csv
# !wget https://blazingsql-colab.s3.amazonaws.com/taxi_data/taxi_03.csv

# ETL: Read and Join CSVs


In [3]:
column_names = ['key', 'fare_amount', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count']
column_types = ['date64', 'float32', 'float32', 'float32', 'float32', 'float32', 'float32']

gdf_00 = cudf.read_csv('taxi_00.csv', delimiter= ',', dtype = column_types, names = column_names)
gdf_01 = cudf.read_csv('taxi_01.csv', delimiter= ',', dtype = column_types, names = column_names)
gdf_02 = cudf.read_csv('taxi_01.csv', delimiter= ',', dtype = column_types, names = column_names)
gdf_03 = cudf.read_csv('taxi_01.csv', delimiter= ',', dtype = column_types, names = column_names)

gdf = cudf.concat([gdf_00,gdf_01, gdf_02, gdf_03])

In [4]:
gdf.head()

Unnamed: 0,key,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2012-02-02 22:30:19.002,8.9,-73.988708,40.758804,-73.986519,40.737202,1.0
1,2014-09-20 07:19:24.001,4.0,-73.990204,40.746708,-73.994728,40.750515,1.0
2,2013-02-23 07:18:05.001,5.5,-74.016762,40.709438,-74.009003,40.719498,3.0
3,2015-04-18 23:49:27.009,13.5,-74.002708,40.73373,-73.986099,40.734776,1.0
4,2010-03-04 08:15:59.001,10.5,-73.988365,40.737663,-74.012459,40.713932,1.0


# ETL: Create Table

In [5]:
bc.create_table('taxi', gdf)

<pyblazing.apiv2.sql.Table at 0x7fc86c4b20f0>

# ETL: Query Tables for Training Data

In [6]:
X_train = bc.sql('SELECT hour(key) as hours, month(key) as months, year(key) - 2000 as years,  dropoff_longitude - pickup_longitude as longitude_distance, dropoff_latitude - pickup_latitude as latitude_distance, passenger_count FROM main.taxi').get()
X_train_gdf = X_train.columns

In [7]:
# crashes the kernel
# X_train.columns

In [8]:
type(X_train.columns)

cudf.core.dataframe.DataFrame

In [9]:
type(X_train_gdf)

cudf.core.dataframe.DataFrame

In [10]:
X_train_gdf.head()

Unnamed: 0,$f0,$f1,$f2,$f3,$f4,passenger_count
0,22,2,12,0.00219,-0.021603,1.0
1,7,9,14,-0.004524,0.003807,1.0
2,7,2,13,0.007759,0.010059,3.0
3,23,4,15,0.016609,0.001045,1.0
4,8,3,10,-0.024094,-0.023731,1.0


In [11]:
type(X_train)

pyblazing.api.ResultSetHandle

In [12]:
X_train.columns.columns

Index(['$f0', '$f1', '$f2', '$f3', '$f4', 'passenger_count'], dtype='object')

In [13]:
# crashes the kernel
# X_train

In [14]:
X_train_gdf['longitude_distance'] = X_train_gdf['longitude_distance'].fillna(0).astype('float32')
X_train_gdf['latitude_distance'] = X_train_gdf['latitude_distance'].fillna(0).astype('float32')
X_train_gdf['passenger_count'] = X_train_gdf['passenger_count'].fillna(0).astype('float32')
X_train_gdf['months'] = X_train_gdf['months'].astype('float32') 
X_train_gdf['years'] = X_train_gdf['years'].astype('float32') 
X_train_gdf['hours'] = X_train_gdf['hours'].astype('float32')
        
print(X_train_gdf)

#Query dependent variable y
y_train = bc.sql('SELECT fare_amount FROM main.taxi').get()
y_train_gdf = y_train.columns
y_train_gdf = y_train_gdf['fare_amount']


print(y_train_gdf)

KeyError: 'longitude_distance'

# Install cuML on Colab

In [None]:
# !wget -c https://repo.continuum.io/miniconda/Miniconda3-4.5.4-Linux-x86_64.sh
# !chmod +x Miniconda3-4.5.4-Linux-x86_64.sh
# !bash ./Miniconda3-4.5.4-Linux-x86_64.sh -b -f -p /usr/local

In [None]:
# !conda config --append channels conda-forge
# !conda install -q -y --prefix /usr/local -c rapidsai cuml

In [None]:
# import sys
# sys.path.append('/usr/local/lib/python3.6/site-packages/')

# Linear Regression: Train Model

In [None]:
%%time

import cuml
from cuml import LinearRegression

#create model
lr = LinearRegression(fit_intercept = True, normalize = False, algorithm = "eig")
#train model
reg = lr.fit(X_train_gdf,y_train_gdf)
#print results
print("Coefficients:")
print(reg.coef_)
print(" ")
print(" Y intercept:")
print(reg.intercept_)

# Linear Regression: Use Model to Predict Future Taxi Fares 

For this we are using a second dataset with data but no fare amount. We will predict fare_amounts. 

Here is a public link to that file: https://drive.google.com/file/d/1UG5-dXNPsAWZb0bJgquEg1qZsm12jzZW/view?usp=sharing

You will need to download that file and upload it to the colab. 

In [None]:
#Create Test Data Table

column_names = ['key', 'fare_amount', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count']
column_types = ['date64', 'float32', 'float32', 'float32', 'float32', 'float32', 'float32']

!wget 'https://blazingsql-demos.s3-us-west-1.amazonaws.com/test.csv'

gdf2 = cudf.read_csv('test.csv', delimiter= ',', dtype = column_types, names = column_names)

bc.create_table('test', gdf2)


In [None]:
#Query Test Data Table to Create GDF
X_test = bc.sql('SELECT hour(key) as hours, month(key) as months, year(key) - 2000 as years,  dropoff_longitude - pickup_longitude as longitude_distance, dropoff_latitude - pickup_latitude as latitude_distance , passenger_count FROM main.test').get()
X_test_gdf = X_test.columns

X_test_gdf['longitude_distance'] = X_test_gdf['longitude_distance'].fillna(0).astype('float32')
X_test_gdf['latitude_distance'] = X_test_gdf['latitude_distance'].fillna(0).astype('float32')
X_test_gdf['passenger_count'] = X_test_gdf['passenger_count'].fillna(0).astype('float32')
X_test_gdf['months'] = X_test_gdf['months'].astype('float32') 
X_test_gdf['years'] = X_test_gdf['years'].astype('float32') 
X_test_gdf['hours'] = X_test_gdf['hours'].astype('float32')

print(X_test_gdf) #this is the data we will use to predict future ride costs

In [None]:
# Predict Fare Amounts 
predictions = lr.predict(X_test_gdf)
print(predictions)

In [None]:
#Combine into a table of table points and predictions
X_test_gdf['predicted_fare'] = predictions
print(X_test_gdf)

 ##  Predict Cost from Grand Central Station to Samsung Next NYC at 7:00 AM on May 15th, 2020

In [None]:
samsung_ride = cudf.DataFrame([('hours', 7.0), ('days',15.0), ('months', 5.0), ('years', 20.0), ('longitude_distance', 0.012727), ('latitude_distance', 0.008484), ('passenger_count', 1.0)])

samsung_ride['hours'] = samsung_ride['hours'].astype('float32')
samsung_ride['days'] = samsung_ride['days'].astype('float32')
samsung_ride['months'] = samsung_ride['months'].astype('float32')
samsung_ride['years'] = samsung_ride['years'].astype('float32')
samsung_ride['longitude_distance'] = samsung_ride['longitude_distance'].astype('float32')
samsung_ride['latitude_distance'] = samsung_ride['latitude_distance'].astype('float32')
samsung_ride['passenger_count'] = samsung_ride['passenger_count'].astype('float32')

samsung_prediction = lr.predict(samsung_ride)
print(samsung_prediction)