In [1]:
import pyspark
import pandas as pd
import time
from datetime import date
import math




Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
2902,application_1513605045578_0131,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.


In [None]:

# Global configuration shared by the cleaning functions below
g = 7   # geohash length
b = 48  # number of time bins per day
# Width of one time bin in whole minutes (the 24 * 60 minutes of a day split into b bins).
# Note: pick b so that the resulting minutes_per_bin evenly divides 60
# (b = 24, 48, 96, ...), so that no bin straddles an hour boundary.
minutes_per_bin = (24 * 60) // b




In [None]:
def date_extractor(date_str,b,minutes_per_bin):
    """Convert a "YYYY-MM-DD HH:MM:SS" string into binned time and day features.

    Parameters:
        date_str: timestamp string; the date and the time are separated by
            whitespace, date fields by '-', time fields by ':'.
        b: number of time bins per day. Not used by this function; kept so
            existing callers keep working.
        minutes_per_bin: width of one time bin in minutes.

    Returns:
        On success a 12-tuple
        (year, month, day, time_cat, time_num, time_cos, time_sin,
         day_cat, day_num, day_cos, day_sin, weekend) where
          - time_cat is the "HH:MM" start of the bin containing the time,
          - time_num is the centre of that bin as a fraction of a day,
          - time_cos / time_sin are cos / sin of time_num * 2 * pi,
          - day_cat is the weekday name ("Monday" .. "Sunday"),
          - day_num is (weekday index + time_num) / 7, a fraction of a week,
          - day_cos / day_sin are cos / sin of day_num * 2 * pi,
          - weekend is 1 for Saturday/Sunday and 0 otherwise.
        If date_str cannot be parsed, the 1-tuple (None,) is returned.
    """
    malformed = (None,)

    # Split the string into [date, time]
    parts = date_str.split()
    if len(parts) != 2:
        return malformed

    date_fields = parts[0].split('-')
    time_fields = parts[1].split(':')
    # Check the field counts before converting anything
    if len(date_fields) < 3 or len(time_fields) != 3:
        return malformed

    try:
        hour, minute, _second = [int(t) for t in time_fields]
        d_obj = date(int(date_fields[0]), int(date_fields[1]), int(date_fields[2]))
    except ValueError:
        # Non-numeric fields or an impossible calendar date
        return malformed

    # TIME (eg. for 16:56:20 and 15 minutes per bin)
    # Minutes elapsed since midnight (eg. 1016)
    num_minutes = hour * 60 + minute

    # Floor division keeps these values integral on Python 2 and Python 3 alike
    time_bin = num_minutes // minutes_per_bin   # index of the bin, eg. 67
    bin_start = time_bin * minutes_per_bin      # minutes at bin start, eg. 1005
    # Take hour and minute from the bin start (not from the raw time) so the
    # label stays correct even when a bin spans an hour boundary
    hour_bin = bin_start // 60                  # eg. 16
    min_bin = bin_start % 60                    # eg. 45

    # Zero-padded label of the bin start, eg. "16:45"
    time_cat = "%02d:%02d" % (hour_bin, min_bin)

    # Centre of the time bin as a fraction of the day, eg. 0.703125
    time_num = (bin_start + minutes_per_bin / 2.0) / (60 * 24)

    time_cos = math.cos(time_num * 2 * math.pi)
    time_sin = math.sin(time_num * 2 * math.pi)

    # DATE
    day_names = ("Monday", "Tuesday", "Wednesday", "Thursday",
                 "Friday", "Saturday", "Sunday")
    day_of_week = d_obj.weekday()   # Monday == 0 ... Sunday == 6
    day_cat = day_names[day_of_week]
    # Position within the week as a fraction
    day_num = (day_of_week + time_num) / 7.0
    day_cos = math.cos(day_num * 2 * math.pi)
    day_sin = math.sin(day_num * 2 * math.pi)

    # Saturday (5) and Sunday (6) count as the weekend
    weekend = 1 if day_of_week in (5, 6) else 0

    return (d_obj.year, d_obj.month, d_obj.day,
            time_cat, time_num, time_cos, time_sin,
            day_cat, day_num, day_cos, day_sin, weekend)

def data_cleaner(row):
    """Build one cleaned record from a row of already-split CSV fields.

    Parameters:
        row: sequence of string fields. row[1] is passed to date_extractor as
            the timestamp and row[7] is appended as the location value.

    Returns:
        The 12 features produced by date_extractor followed by row[7]
        encoded as UTF-8, i.e.
        (year, month, day, time_cat, time_num, time_cos, time_sin,
         day_cat, day_num, day_cos, day_sin, weekend, location),
        or None when the row is too short or its timestamp cannot be parsed.

    Uses the module-level globals b and minutes_per_bin.
    """
    # row[7] is read below, so the row needs at least 8 fields
    if len(row) < 8:
        return None

    clean_date = date_extractor(row[1], b, minutes_per_bin)

    # date_extractor reports an unparseable timestamp as (None,); drop such rows
    # instead of emitting a truncated record that would not match the schema
    if clean_date == (None,):
        return None

    # NOTE(review): geohash decoding and latitude/longitude bounds checking are
    # not applied; column 7 appears to hold a location id (e.g. '161') rather
    # than coordinates -- confirm before re-enabling any bounds filter.
    return tuple(list(clean_date) + [row[7].encode('utf-8')])

In [53]:

# Load the input CSV from HDFS as text lines
raw = sc.textFile("hdfs:///Projects/demo_tensorflow_abarose0/Jupyter/small.csv")
# Alternative input kept for reference:
#raw = sc.textFile("yellow_tripdata_2017-06.csv")


# The first line of the file is treated as the header row
headers = raw.first()

# Drop every line equal to the header, then split each remaining line on commas
# into a tuple of string fields (plain split: quoted commas are not handled)
raw2 = raw.filter(lambda row : row != headers).map(lambda line: tuple(line.split(',')))


# Peek at two parsed rows
print(raw2.take(2))

# Turn each row into a cleaned feature tuple; data_cleaner returns None for
# rows it rejects, and those are filtered out here
gclean_rdd = raw2.map(data_cleaner).filter(lambda row: row != None)

# Peek at five cleaned records
print(gclean_rdd.take(5))
#print(headers)

# Alternative output path (plain text dump), currently disabled:
#save to file
#gclean_rdd.saveAsTextFile("hdfs:///Projects/demo_tensorflow_abarose0/Jupyter/small5")

# NOTE(review): star import in the middle of a cell; only StructField,
# StringType and StructType are used below.
from pyspark.sql.types import *

# One column name per element of the tuple returned by data_cleaner
schemaString = "Year month day time_cat time_num time_cos time_sin day_cat day_num day_cos day_sin weekend Location"
# Every column is declared as a nullable StringType.
# NOTE(review): the cleaned tuples also contain ints and floats (see the
# printed sample) -- confirm an all-string schema is intended.
fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
schema = StructType(fields)
raw47 = spark.createDataFrame(gclean_rdd, schema)
# Write the DataFrame as CSV without a header row.
# NOTE(review): no save mode is set; presumably this fails if the target path
# already exists, which would break re-running the cell -- confirm.
raw47.write.format("com.databricks.spark.csv").option("header", "false").save("hdfs:///Projects/demo_tensorflow_abarose0/Jupyter/small7")
# Number of lines in the raw input (raw is unfiltered, so the header line is included)
print(raw.count())

# Scratch experiments with the geohash helper (pgh is not imported in the visible cells):
#other testing
#pgh.decode('161')
#pgh.encode(41,-74,7)
#print(pgh.encode(40,74,6))


[(u'2', u'2017-06-08 07:52:31', u'2017-06-08 08:01:32', u'6', u'1.03', u'1', u'N', u'161', u'140', u'1', u'7.5', u'1', u'0.5', u'1.86', u'0', u'0.3', u'11.16'), (u'2', u'2017-06-08 08:08:18', u'2017-06-08 08:14:00', u'6', u'1.03', u'1', u'N', u'162', u'233', u'1', u'6', u'1', u'0.5', u'2.34', u'0', u'0.3', u'10.14')]
[(2017, 6, 8, '07:30', 0.3229166666666667, -0.44228869021900113, 0.8968727415326884, 'Thursday', 0.47470238095238093, -0.9873940820849713, 0.1582811633251951, 0, '161'), (2017, 6, 8, '08:00', 0.34375, -0.555570233019602, 0.8314696123025455, 'Thursday', 0.47767857142857145, -0.9901811253364456, 0.13979033953549938, 0, '162'), (2017, 6, 8, '08:00', 0.34375, -0.555570233019602, 0.8314696123025455, 'Thursday', 0.47767857142857145, -0.9901811253364456, 0.13979033953549938, 0, '137'), (2017, 6, 29, '15:30', 0.65625, -0.5555702330196022, -0.8314696123025452, 'Thursday', 0.5223214285714286, -0.9901811253364455, -0.13979033953549957, 0, '142'), (2017, 6, 1, '00:00', 0.0104166666666

In [45]:
# Column names of a cleaned record, in output order
headers2 = ("Year month day "
            "time_cat time_num time_cos time_sin "
            "day_cat day_num day_cos day_sin "
            "weekend Location").split()

# Read a previously saved text dump back from HDFS and show its first line
raw45 = sc.textFile("hdfs:///Projects/demo_tensorflow_abarose0/Jupyter/small2")
print(raw45.take(1))
# Disabled experiment: wrap each line in a 1-tuple and convert to a DataFrame
#raw46 = raw45.map(lambda x: (x, )).toDF()


[u"(2017, 6, 8, '07:30', 0.3229166666666667, -0.44228869021900113, 0.8968727415326884, 'Thursday', 0.47470238095238093, -0.9873940820849713, 0.1582811633251951, 0, u'161')"]