In [1]:
# Work With Files
from google.cloud import storage
import os

# Useful libraries:
from time import time
import numpy as np
import pandas as pd

# To Plot:
import matplotlib.pyplot as plt
import seaborn as sns

# Pyspark Lib:
import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

# Preprocess:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import MinMaxScaler

# Pysparl ML:
from pyspark.ml.regression import LinearRegression

In [3]:
! pip install -q kaggle

In [4]:
PATH_BUCKET = 'gs://nyc_comp_bk/'
PATH_DATA = '/home/ubuntu/NYC_Taxi/data/'

In [5]:
os.chdir('/home/ubuntu/NYC_Taxi/')

In [6]:
class Work_On_Bucket():
    
    def __init__(self, bucket_name):
        # Get access to the bucket:
        storage_client = storage.Client()
        self.bucket = storage_client.get_bucket(bucket_name)
        
    def get_file_from_bucket(self, file_name, save_path):
        # Download the file:
        blob = self.bucket.blob(file_name)
        blob.download_to_filename(''.join([save_path, file_name]))
            
    def upload_file_to_bucket(self, file_name, folder_path):
        # Upload the File
        object_to_save = self.bucket.blob(file_name)
        object_to_save.upload_from_filename(folder_path + file_name)


### Get the Dataset:

In [7]:
Bucket = Work_On_Bucket('nyc_comp_bk')

In [8]:
# Set kaggle:
! mkdir ~/.kaggle
Bucket.get_file_from_bucket('kaggle.json', '/home/ubuntu/NYC_Taxi/')
! cp /home/ubuntu/NYC_Taxi/kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [8]:
# Download The Dataset
!kaggle competitions download -c new-york-city-taxi-fare-prediction

# Unzip the Files
! unzip new-york-city-taxi-fare-prediction.zip -d /home/ubuntu/NYC_Taxi/data/
! rm new-york-city-taxi-fare-prediction.zip

# Upload databses to bucket:
print('Start Uploding!')
Bucket.upload_file_to_bucket('train.csv', PATH_DATA)
Bucket.upload_file_to_bucket('test.csv', PATH_DATA)
print('Succesfully Uploaded!')

mkdir: cannot create directory ‘/root/.kaggle’: File exists
Downloading new-york-city-taxi-fare-prediction.zip to /home/ubuntu/NYC_Taxi
 99%|█████████████████████████████████████▋| 1.55G/1.56G [00:25<00:00, 56.1MB/s]
100%|██████████████████████████████████████| 1.56G/1.56G [00:25<00:00, 66.2MB/s]
Archive:  new-york-city-taxi-fare-prediction.zip
  inflating: /home/ubuntu/NYC_Taxi/data/GCP-Coupons-Instructions.rtf  
  inflating: /home/ubuntu/NYC_Taxi/data/sample_submission.csv  
  inflating: /home/ubuntu/NYC_Taxi/data/test.csv  
  inflating: /home/ubuntu/NYC_Taxi/data/train.csv  


### Preliminary Steps (Load + Checks):

In [9]:
# Load Data:
train = spark.read.load(PATH_BUCKET+"train.csv", format="csv", inferSchema="true", header="true")
test = spark.read.load(PATH_BUCKET+"test.csv", format="csv", inferSchema="true", header="true")
# Load Test (Because pyspark changes the timestamp):
Bucket.get_file_from_bucket('test.csv', '')
original_test = pd.read_csv('test.csv')

In [10]:
# Get DB shape:
ncol = len(train.columns)
nrow = train.count()
print("The shape of the dataset is {:d} rows by {:d} columns".format(nrow, ncol))

The shape of the dataset is 55423856 rows by 8 columns


In [8]:
# Get the schema:
train.printSchema()

root
 |-- key: timestamp (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- pickup_datetime: string (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- passenger_count: integer (nullable = true)



In [25]:
# Show some basic Statistics:
stats = train.select(train.columns[1:]).describe()
stats.toPandas()

Unnamed: 0,summary,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,count,55423856.0,55423856,55423856.0,55423856.0,55423480.0,55423480.0,55423856.0
1,mean,11.345045601663852,,-72.50968444358728,39.919791786888176,-72.5112097297181,39.92068144482884,1.6853799201556816
2,stddev,20.7108321982325,,12.848883381402652,9.642353041994932,12.782196517830773,9.633345796415126,1.327664357095968
3,min,-300.0,2009-01-01 00:00:27 UTC,-3442.059565,-3492.263768,-3442.024565,-3547.886698,0.0
4,max,93963.36,2015-06-30 23:59:54 UTC,3457.625683,3408.789565,3457.62235,3537.132528,208.0


In [12]:
# Check Nulls:
for c in train.columns[2:]:
    nans = train.where(col(c).isNull()).count()
    print('{:s}: {:d}'.format(c, nans))

pickup_datetime: 0
pickup_longitude: 0
pickup_latitude: 0
dropoff_longitude: 376
dropoff_latitude: 376
passenger_count: 0


In [11]:
# Remove Rows with Missing Values:
train = train.na.drop(how='any')

In [9]:
# Check Duplicates:
print('The Duplicates are: {:d}'.format(train.count()-train.distinct().count()))

The Duplicates are: 1650


In [12]:
# Remove Duplicates:
train = train.distinct()

### Create My Base Line:

As Base Line I am going to use a Multiple Linear Regression that takes as input all the scaled (mean=0, sd=1) numerical variables.
As result I get an RMSE = 9.40712 on the Test.

In [13]:
NUMERICAL = ['pickup_longitude',
            'pickup_latitude',
            'dropoff_longitude',
            'dropoff_latitude',
            'passenger_count']
TARGET = 'fare_amount'

In [14]:
# Get the feature Vector:
assembler = VectorAssembler(inputCols=NUMERICAL, outputCol="features")
train_df = assembler.transform(train)
train_df = train_df.select('features', TARGET)

In [15]:
# Scale Data:
scaler = StandardScaler(inputCol="features", outputCol="ScaledFeatures")
scalerModel = scaler.fit(train_df)
train_df = scalerModel.transform(train_df)
train_df = train_df.select('ScaledFeatures', TARGET)

In [18]:
train_df.show(3, truncate=False)

+--------------------------------------------------------------------------------+-----------+
|features                                                                        |fare_amount|
+--------------------------------------------------------------------------------+-----------+
|[-5.7615460522670014,4.226292720830354,-5.791278433779798,4.229979120682077,6.0]|5.0        |
|[-5.758855893096745,4.2236575825491585,-5.788790661013991,4.223286023907211,2.0]|14.5       |
|[-5.760342838673427,4.226309006966249,-5.789770038098104,4.232012218134044,1.0] |6.5        |
+--------------------------------------------------------------------------------+-----------+
only showing top 3 rows



In [19]:
# Run the Linear Regression:
lr = LinearRegression(featuresCol="ScaledFeatures", labelCol=TARGET, maxIter=10)
lr_model = lr.fit(train_df)

# Print Stats:
training_result = lr_model.summary
print("***** Training Set *****")
print("RMSE: {:.3f}".format(training_result.rootMeanSquaredError))
print("MAE: {:.3f}".format(training_result.meanAbsoluteError))
print("R2: {:.3f}".format(training_result.r2))
print("***** Training Set *****")

***** Training Set *****
RMSE: 20.710
MAE: 6.035
R2: 0.000
***** Training Set *****


In [92]:
# Prepare the test:
test_df = assembler.transform(test)
test_df = test_df.select('key', 'features')

test_df = scalerModel.transform(test_df)
test_df = test_df.select(col('key'),col('ScaledFeatures'))

# Make Predictions:
predictions = lr_model.transform(test_df).select('key', 'prediction').withColumnRenamed('prediction','fare_amount').toPandas()

# Prepare the Submission:
submission = pd.concat([original_test['key'], predictions['fare_amount']], axis=1)
submission.to_csv('submission.csv', index=False)

# Submit:
!kaggle competitions submit -c new-york-city-taxi-fare-prediction -f submission.csv -m "First Submission"
!kaggle competitions submissions -c new-york-city-taxi-fare-prediction

100%|█████████████████████████████████████████| 459k/459k [00:01<00:00, 243kB/s]
Successfully submitted to New York City Taxi Fare PredictionfileName        date                 description       status    publicScore  privateScore  
--------------  -------------------  ----------------  --------  -----------  ------------  
submission.csv  2021-06-23 15:15:57  First Submission  complete  9.40712      9.40712       
submission.csv  2021-06-23 15:10:56  First Submission  complete  9.40712      9.40712       
submission.csv  2021-06-23 14:58:58  None              error     None         None          
submission.csv  2021-06-23 14:58:06  First Submission  error     None         None          
submission.csv  2021-06-23 14:55:46  None              error     None         None          
submission.csv  2021-06-23 14:54:33  First Submission  error     None         None          
submission.csv  2021-06-23 14:52:36  None              error     None         None          
submission.csv  2021-0