In [1]:
import pyspark
import pandas as pd
import time
from datetime import date
import math

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
3051,application_1513605045578_0286,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.


In [2]:
import geohash2 as gh2

# Global configuration shared by the cleaning functions below.
g = 7   # geohash length (number of characters)
b = 48  # number of time bins per day

# Width of one time bin in minutes, computed with integer arithmetic so no
# float rounding is involved.
# Note: b must evenly divide the 1440 minutes of a day, and the resulting
# minutes_per_bin must evenly divide 60 so that every bin starts and ends
# inside a single hour (e.g. b = 24, 48, 96, 144, ...).
minutes_per_bin = (24 * 60) // b
assert (24 * 60) % b == 0 and 60 % minutes_per_bin == 0, \
    "b must divide 1440 and minutes_per_bin must divide 60"

In [3]:
def date_extractor(date_str,b,minutes_per_bin):
    """Turn a "YYYY-MM-DD HH:MM:SS" string into calendar and time-bin features.

    Parameters
    ----------
    date_str : str
        Timestamp made of a date part and a time part separated by whitespace.
    b : int
        Number of time bins per day. Kept for interface compatibility; the
        computation only needs ``minutes_per_bin``.
    minutes_per_bin : int
        Width of one time bin in minutes.

    Returns
    -------
    tuple
        ``(year, month, day, time_cat, time_num, time_cos, time_sin,
        day_cat, day_num, day_cos, day_sin, weekend)`` where

        * ``time_cat`` is the start of the bin as a zero-padded "HH:MM" string,
        * ``time_num`` is the centre of the bin as a fraction of the day,
        * ``day_cat`` is the English weekday name,
        * ``day_num`` is the position within the week as a fraction,
        * ``*_cos`` / ``*_sin`` are the cyclic encodings of the two fractions,
        * ``weekend`` is 1 for Saturday/Sunday and 0 otherwise.

        If the string does not split into a date and a three-field time, the
        1-tuple ``(None,)`` is returned instead.

    Raises
    ------
    ValueError
        If a time or date component is not an integer, or the date is invalid.
    """
    # Split date string into [date, time]
    d = date_str.split()

    # safety check
    if len(d) != 2:
        return (None,)

    # TIME (e.g. 16:56:20 with 15 minutes per bin)
    # list of hour, min, sec (e.g. [16, 56, 20]); seconds are not used
    time_list = [int(t) for t in d[1].split(':')]

    # safety check
    if len(time_list) != 3:
        return (None,)

    # number of minutes into the day (e.g. 1016)
    num_minutes = time_list[0] * 60 + time_list[1]

    # Floor division is spelled out with // so the result is an int on both
    # Python 2 and Python 3 (plain / yields floats on Python 3).
    time_bin = num_minutes // minutes_per_bin     # index of the bin, e.g. 67
    bin_start = time_bin * minutes_per_bin        # minute the bin starts, e.g. 1005

    # Derive hour and minute from the bin start itself so they stay
    # consistent even when a bin would straddle an hour boundary.
    hour_bin = bin_start // 60                    # e.g. 16
    min_bin = bin_start % 60                      # e.g. 45

    # zero-padded label of the bin start, e.g. "16:45"
    time_cat = "%02d:%02d" % (hour_bin, min_bin)

    # Centre of the time bin as a fraction of the day, e.g. 0.703125
    time_num = (bin_start + minutes_per_bin / 2.0) / (60 * 24)

    time_cos = math.cos(time_num * 2 * math.pi)
    time_sin = math.sin(time_num * 2 * math.pi)

    # DATE
    # Parse year, month, day
    date_list = d[0].split('-')
    d_obj = date(int(date_list[0]), int(date_list[1]), int(date_list[2]))
    day_names = ("Monday", "Tuesday", "Wednesday", "Thursday",
                 "Friday", "Saturday", "Sunday")
    day_of_week = d_obj.weekday()                 # Monday == 0 ... Sunday == 6
    day_cat = day_names[day_of_week]
    day_num = (day_of_week + time_num) / 7.0
    day_cos = math.cos(day_num * 2 * math.pi)
    day_sin = math.sin(day_num * 2 * math.pi)

    # Saturday (5) and Sunday (6) count as the weekend
    weekend = 1 if day_of_week in (5, 6) else 0

    return (d_obj.year, d_obj.month, d_obj.day,
            time_cat, time_num, time_cos, time_sin,
            day_cat, day_num, day_cos, day_sin, weekend)

def data_cleaner(row):
    """Convert one split CSV row into a feature tuple, or None if unusable.

    Reads the module-level ``b`` and ``minutes_per_bin`` settings and uses
    these fields of ``row``: index 1 (pickup timestamp), 5/6 (pickup
    longitude/latitude) and 7/8 (dropoff longitude/latitude).

    Returns
    -------
    tuple or None
        The tuple produced by ``date_extractor`` followed by the pickup
        geohash and the dropoff geohash (the label for task 3). ``None`` is
        returned when the row is too short, the timestamp cannot be split,
        a coordinate is not numeric, or the pickup point lies outside the
        NYC bounding box.
    """
    # safety check: the highest index read below is 8, so 9 fields are needed
    if len(row) < 9:
        return None

    # extract calendar / time-bin features from the pickup timestamp
    clean_date = date_extractor(row[1], b, minutes_per_bin)

    # date_extractor signals an unparsable timestamp with (None,); drop the
    # row instead of emitting a tuple with the wrong number of fields
    if clean_date[0] is None:
        return None

    # beware the order: longitude comes before latitude in the file
    try:
        pickup_longitude = float(row[5])
        pickup_latitude = float(row[6])
        dropoff_longitude = float(row[7])
        dropoff_latitude = float(row[8])
    except ValueError:
        # non-numeric coordinate: treat like any other invalid row
        return None

    # safety check: make sure latitude and longitude are valid, i.e. inside NYC
    if not (40.5 < pickup_latitude < 41.1 and -74.1 < pickup_longitude < -73.6):
        return None

    # Geohash pickup and dropoff locations; encode() takes (latitude, longitude).
    # NOTE(review): the module-level geohash length `g` is not passed here, so
    # the library's default precision is used - confirm whether that is intended.
    pickup_location = gh2.encode(pickup_latitude, pickup_longitude)
    # label for task 3
    dropoff_location = gh2.encode(dropoff_latitude, dropoff_longitude)

    return tuple(list(clean_date) + [pickup_location] + [dropoff_location])

In [21]:
from pyspark.sql.types import (DoubleType, IntegerType, StringType,
                               StructField, StructType)

# Load the raw green-taxi trips as lines of text.
raw_green = sc.textFile("hdfs:///Projects/labs/nyc_taxi_data/data/green_tripdata_2013-10.csv")
#raw_yellow = sc.textFile("yellow_tripdata_2017-06.csv")

# The first line is the CSV header; drop it and split the rest on commas.
headers = raw_green.first()

raw2_green = raw_green.filter(lambda row : row != headers).map(lambda line: tuple(line.split(',')))

print(raw2_green.take(20))

# Clean every row; data_cleaner returns None for rows that must be dropped.
gclean_rdd = raw2_green.map(data_cleaner).filter(lambda row: row is not None).cache()
#print(gclean_rdd.take(20))

# Append a constant 1 so trips can be counted by summing the "count" column.
gclean_extracol = gclean_rdd.map(lambda row: row + (1,))
#print(gclean_extracol.take(20))

#save to file
#gclean_rdd.saveAsTextFile("hdfs:///Projects/demo_tensorflow_abarose0/Jupyter/small5")

# Column types must match the Python values in the cleaned tuples: the rows
# hold ints and floats, which an all-string schema would not accept, and the
# "count" column has to be numeric for sum() to be meaningful.
schema_columns = [
    ("year", IntegerType()),
    ("month", IntegerType()),
    ("day", IntegerType()),
    ("time_cat", StringType()),
    ("time_num", DoubleType()),
    ("time_cos", DoubleType()),
    ("time_sin", DoubleType()),
    ("day_cat", StringType()),
    ("day_num", DoubleType()),
    ("day_cos", DoubleType()),
    ("day_sin", DoubleType()),
    ("weekend", IntegerType()),
    ("pickup_location", StringType()),
    ("dropoff_location", StringType()),
    ("count", IntegerType()),
]
fields = [StructField(field_name, field_type, True) for field_name, field_type in schema_columns]
schema = StructType(fields)
init_df = spark.createDataFrame(gclean_extracol, schema)

# Total number of trips per pickup geohash, attached back onto every trip and
# shown with the busiest pickup locations first.
df_summed = init_df.groupBy("pickup_location").agg({"count": "sum"})
init_df.join(df_summed, "pickup_location").sort("sum(count)", ascending=False).show(100)

#init_df.write.format("com.databricks.spark.csv").option("header", "false").save("hdfs:///Projects/ID2223nyctaxi/prepared_taxirides/test2")


#other testing
#pgh.decode('161')
#pgh.encode(41,-74,7)
#print(pgh.encode(40,74,6))

[(u'',), (u'2', u'2013-10-01 00:00:00', u'2013-10-01 15:33:36', u'N', u'1', u'0', u'0', u'-73.903465270996094', u'40.845088958740234', u'1', u'.19', u'42', u'0', u'0.5', u'0', u'0', u'', u'42.5', u'2', u'1', u'', u''), (u'2', u'2013-10-01 00:00:00', u'2013-10-01 09:01:53', u'N', u'1', u'0', u'0', u'-73.937408447265625', u'40.758129119873047', u'1', u'.00', u'2.5', u'0', u'0.5', u'0', u'0', u'', u'3', u'2', u'1', u'', u''), (u'2', u'2013-10-01 00:00:00', u'2013-10-01 16:20:05', u'N', u'1', u'0', u'0', u'0', u'0', u'1', u'.00', u'5.5', u'1', u'0.5', u'0', u'0', u'', u'7', u'2', u'1', u'', u''), (u'2', u'2013-10-01 00:00:00', u'2013-10-01 13:26:24', u'N', u'1', u'0', u'0', u'-73.901992797851563', u'40.763801574707031', u'1', u'3.27', u'10.5', u'0', u'0.5', u'0', u'0', u'', u'11', u'2', u'1', u'', u''), (u'2', u'2013-10-01 00:00:00', u'2013-10-01 18:21:16', u'N', u'1', u'0', u'0', u'-73.937629699707031', u'40.758113861083984', u'2', u'.00', u'6.5', u'1', u'0.5', u'0', u'0', u'', u'8', u'2'

In [10]:
# Smoke test for the geohash2 package: encode latitude 41, longitude -73
# at the configured geohash length g and show the resulting hash.
sample_geohash = gh2.encode(41, -73, g)
print(sample_geohash)


[]
drk14zg

In [None]:
# Other: scratch cell for inspecting a previously saved text output.
# Column names of the cleaned feature tuples.
headers2 = [
    "Year", "month", "day",
    "time_cat", "time_num", "time_cos", "time_sin",
    "day_cat", "day_num", "day_cos", "day_sin",
    "weekend", "Location",
]

# Read the saved file back as lines of text and peek at the first record.
raw45 = sc.textFile("hdfs:///Projects/demo_tensorflow_abarose0/Jupyter/small2")
first_record = raw45.take(1)
print(first_record)
#raw46 = raw45.map(lambda x: (x, )).toDF()