In [2]:
# Work With Files
from google.cloud import storage
import os

# Useful libraries:
from time import time()
import numpy as np
import pandas as pd

# To Plot:
import matplotlip.pyplot as plt
import seaborn as sns

# Pyspark Lib:
import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

In [3]:
! pip install -q kaggle

In [4]:
PATH_BUCKET = 'gs://nyc_comp_bk/'
PATH_DATA = '/home/ubuntu/NYC_Taxi/data/'

In [5]:
os.chdir('/home/ubuntu/NYC_Taxi/')

In [6]:
class Work_On_Bucket():
    
    def __init__(self, bucket_name):
        # Get access to the bucket:
        storage_client = storage.Client()
        self.bucket = storage_client.get_bucket(bucket_name)
        
    def get_file_from_bucket(self, file_name, save_path):
        # Download the file:
        blob = self.bucket.blob(file_name)
        blob.download_to_filename(''.join([save_path, file_name]))
            
    def upload_file_to_bucket(self, file_name, folder_path):
        # Upload the File
        object_to_save = self.bucket.blob(file_name)
        object_to_save.upload_from_filename(folder_path + file_name)


### Get the Dataset:

In [7]:
Bucket = Work_On_Bucket('nyc_comp_bk')

In [8]:
# Set kaggle:
! mkdir ~/.kaggle
Bucket.get_file_from_bucket('kaggle.json', '/home/ubuntu/NYC_Taxi/')
! cp /home/ubuntu/NYC_Taxi/kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

# Download The Dataset
!kaggle competitions download -c new-york-city-taxi-fare-prediction

# Unzip the Files
! unzip new-york-city-taxi-fare-prediction.zip -d /home/ubuntu/NYC_Taxi/data/
! rm new-york-city-taxi-fare-prediction.zip

# Upload databses to bucket:
print('Start Uploding!')
Bucket.upload_file_to_bucket('train.csv', PATH_DATA)
Bucket.upload_file_to_bucket('test.csv', PATH_DATA)
print('Succesfully Uploaded!')

mkdir: cannot create directory ‘/root/.kaggle’: File exists
Downloading new-york-city-taxi-fare-prediction.zip to /home/ubuntu/NYC_Taxi
 99%|█████████████████████████████████████▋| 1.55G/1.56G [00:25<00:00, 56.1MB/s]
100%|██████████████████████████████████████| 1.56G/1.56G [00:25<00:00, 66.2MB/s]
Archive:  new-york-city-taxi-fare-prediction.zip
  inflating: /home/ubuntu/NYC_Taxi/data/GCP-Coupons-Instructions.rtf  
  inflating: /home/ubuntu/NYC_Taxi/data/sample_submission.csv  
  inflating: /home/ubuntu/NYC_Taxi/data/test.csv  
  inflating: /home/ubuntu/NYC_Taxi/data/train.csv  


### Preliminary Steps (Load + Checks):

In [9]:
# Load Data:
train = spark.read.load(PATH_BUCKET+"train.csv", format="csv", inferSchema="true", header="true")
test = spark.read.load(PATH_BUCKET+"test.csv", format="csv", inferSchema="true", header="true")

In [11]:
# Get the schema:
train.printSchema()

root
 |-- key: timestamp (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- pickup_datetime: string (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- passenger_count: integer (nullable = true)



In [12]:
# Get DB shape:
ncol = len(train.columns)
nrow = train.count()
print("The shape of the dataset is {:d} rows by {:d} columns".format(nrow, ncol))

The shape of the dataset is 55423856 rows by 8 columns


In [14]:
# Show some basic Statistics:
stats = train.select(train.columns[1:]).describe()
stats.toPandas()

Unnamed: 0,summary,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,count,55423856.0,55423856,55423856.0,55423856.0,55423480.0,55423480.0,55423856.0
1,mean,11.345045601663852,,-72.50968444358728,39.91979178688818,-72.51120972971809,39.92068144482885,1.6853799201556816
2,stddev,20.7108321982325,,12.848883381402656,9.642353041994935,12.782196517830773,9.633345796415126,1.327664357095968
3,min,-300.0,2009-01-01 00:00:27 UTC,-3442.059565,-3492.263768,-3442.024565,-3547.886698,0.0
4,max,93963.36,2015-06-30 23:59:54 UTC,3457.625683,3408.789565,3457.62235,3537.132528,208.0


In [39]:
# Check Nulls:

train.select(train.columns[1:]).where(col().isNull()).count()

Unnamed: 0,summary,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,count,55423856.0,55423856,55423856.0,55423856.0,55423480.0,55423480.0,55423856.0
1,mean,11.345045601663854,,-72.50968444358726,39.91979178688818,-72.51120972971812,39.92068144482884,1.6853799201556816
2,stddev,20.7108321982325,,12.84888338140265,9.642353041994934,12.782196517830776,9.633345796415126,1.3276643570959683
3,min,-300.0,2009-01-01 00:00:27 UTC,-3442.059565,-3492.263768,-3442.024565,-3547.886698,0.0
4,max,93963.36,2015-06-30 23:59:54 UTC,3457.625683,3408.789565,3457.62235,3537.132528,208.0


In [41]:
col(train.fare_amount)

NameError: name 'col' is not defined