In [1]:
import pandas as pd

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import SQLContext
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import *

#Feel free to add other libraries from pyspark

# conf = SparkConf().setAppName(app_name)
sc = SparkContext.getOrCreate()
sc.setLogLevel("OFF")

ss = SparkSession.builder.getOrCreate()

In [2]:
sc

In [3]:
Parking_meters = 'Parking_meters.csv'
transcation_input = 'output1month.csv'

In [4]:
schema = StructType([StructField("POST_ID", StringType(), True), 
                    StructField("MS_ID", StringType(), True),
                    StructField("MS_SPACEID", StringType(), True),
                    StructField("CAP_COL", StringType(), True), 
                    StructField("METER_TYPE", StringType(), True),
                    StructField("SMART_METE", StringType(), True),
                    StructField("ACTIVESENS", StringType(), True),
                    StructField("JURISDICTI", StringType(), True),
                    StructField("ON_OFF_STR", StringType(), True),
                    StructField("OSP_ID", StringType(), True),
                    StructField("STREET_NUM", StringType(), True),
                    StructField("STREETNAME", StringType(), True),
                    StructField("STREET_SEG", StringType(), True),
                    StructField("RATEAREA", StringType(), True),
                    StructField("SFPARKAREA", StringType(), True),
                    StructField("LOCATION", StringType(), True)])

parking_meters = ss.read.csv(Parking_meters, schema = schema,header=True)
print(len(parking_meters.columns))
parking_meters.show(1)

16
+---------+-----+----------+-------+----------+----------+----------+----------+----------+------+----------+----------+----------+--------+----------+--------------------+
|  POST_ID|MS_ID|MS_SPACEID|CAP_COL|METER_TYPE|SMART_METE|ACTIVESENS|JURISDICTI|ON_OFF_STR|OSP_ID|STREET_NUM|STREETNAME|STREET_SEG|RATEAREA|SFPARKAREA|            LOCATION|
+---------+-----+----------+-------+----------+----------+----------+----------+----------+------+----------+----------+----------+--------+----------+--------------------+
|401-06340|    -|         0|   Grey|        SS|         N|         N|     SFMTA|        ON|     0|       634|  ELLIS ST|   5177000|  Area 3|          |(37.78436, -122.4...|
+---------+-----+----------+-------+----------+----------+----------+----------+----------+------+----------+----------+----------+--------+----------+--------------------+
only showing top 1 row



In [6]:
parking_meters_selected = parking_meters.select(['POST_ID','MS_ID','MS_SPACEID','CAP_COL','METER_TYPE','SMART_METE','ACTIVESENS','ON_OFF_STR'])
parking_meters_selected.show(3)

+---------+-----+----------+-------+----------+----------+----------+----------+
|  POST_ID|MS_ID|MS_SPACEID|CAP_COL|METER_TYPE|SMART_METE|ACTIVESENS|ON_OFF_STR|
+---------+-----+----------+-------+----------+----------+----------+----------+
|401-06340|    -|         0|   Grey|        SS|         N|         N|        ON|
|104-03190|    -|         0|   Grey|        SS|         N|         Y|        ON|
|352-04350|    -|         0|   Grey|        SS|         N|         N|        ON|
+---------+-----+----------+-------+----------+----------+----------+----------+
only showing top 3 rows



In [7]:
parking_meters_selected.filter(parking_meters_selected.METER_TYPE=='MS').show(10)

+---------+---------+----------+-------+----------+----------+----------+----------+
|  POST_ID|    MS_ID|MS_SPACEID|CAP_COL|METER_TYPE|SMART_METE|ACTIVESENS|ON_OFF_STR|
+---------+---------+----------+-------+----------+----------+----------+----------+
|818-01630|818-01001|        63|   Grey|        MS|         Y|         Y|        ON|
|324-09341|324-09341|         1|  Black|        MS|         N|         N|        ON|
|831-05460|831-05006|        46|   Grey|        MS|         Y|         Y|        ON|
|826-06290|826-06001|        29|   Grey|        MS|         Y|         N|        ON|
|440-08401|440-08400|         1|  Black|        MS|         N|         N|        ON|
|462-05131|462-05130|         1|  Black|        MS|         N|         N|        ON|
|836-00330|836-00003|        33|   Grey|        MS|         Y|         N|        ON|
|568-17441|568-17440|         1|  Black|        MS|         Y|         N|        ON|
|818-01590|818-01001|        59|   Grey|        MS|         Y|   

In [8]:
# one postid per pay station even for multi space parking, weird
parking_meters_selected.filter(parking_meters_selected.METER_TYPE=='MS')\
                        .groupBy('POST_ID')\
                        .count()\
                        .orderBy('count',ascending=False)\
                        .show(5)



+---------+-----+
|  POST_ID|count|
+---------+-----+
|855-00270|    1|
|869-00410|    1|
|869-00440|    1|
|681-00120|    1|
|568-02211|    1|
+---------+-----+
only showing top 5 rows



In [9]:
parking_meters_cars = parking_meters_selected.filter((parking_meters_selected.CAP_COL == 'Green') | (parking_meters_selected.CAP_COL == 'Grey'))
print(parking_meters_cars.count())
parking_meters_cars.show(5)



22426
+---------+-----+----------+-------+----------+----------+----------+----------+
|  POST_ID|MS_ID|MS_SPACEID|CAP_COL|METER_TYPE|SMART_METE|ACTIVESENS|ON_OFF_STR|
+---------+-----+----------+-------+----------+----------+----------+----------+
|401-06340|    -|         0|   Grey|        SS|         N|         N|        ON|
|104-03190|    -|         0|   Grey|        SS|         N|         Y|        ON|
|352-04350|    -|         0|   Grey|        SS|         N|         N|        ON|
|116-03980|    -|         0|   Grey|        SS|         N|         N|        ON|
|224-27570|    -|         0|   Grey|        SS|         N|         N|        ON|
+---------+-----+----------+-------+----------+----------+----------+----------+
only showing top 5 rows



In [10]:
parking_meters_tojoin = parking_meters_cars.select(['POST_ID','CAP_COL','METER_TYPE','SMART_METE','ACTIVESENS','ON_OFF_STR'])
parking_meters_tojoin.show(5)




+---------+-------+----------+----------+----------+----------+
|  POST_ID|CAP_COL|METER_TYPE|SMART_METE|ACTIVESENS|ON_OFF_STR|
+---------+-------+----------+----------+----------+----------+
|401-06340|   Grey|        SS|         N|         N|        ON|
|104-03190|   Grey|        SS|         N|         Y|        ON|
|352-04350|   Grey|        SS|         N|         N|        ON|
|116-03980|   Grey|        SS|         N|         N|        ON|
|224-27570|   Grey|        SS|         N|         N|        ON|
+---------+-------+----------+----------+----------+----------+
only showing top 5 rows



In [11]:
568-28390
parking_meters_cars.filter(parking_meters_cars.POST_ID=='568-28390').show()


+---------+-----+----------+-------+----------+----------+----------+----------+
|  POST_ID|MS_ID|MS_SPACEID|CAP_COL|METER_TYPE|SMART_METE|ACTIVESENS|ON_OFF_STR|
+---------+-----+----------+-------+----------+----------+----------+----------+
|568-28390|    -|         0|   Grey|        SS|         N|         N|        ON|
+---------+-----+----------+-------+----------+----------+----------+----------+



In [13]:
schema = StructType([StructField("POST_ID", StringType(), True), 
                    StructField("MS_ID", StringType(), True),
                    StructField("MS_SPACEID", StringType(), True),
                    StructField("CAP_COL", StringType(), True), 
                    StructField("METER_TYPE", StringType(), True),
                    StructField("SMART_METE", StringType(), True),
                    StructField("ACTIVESENS", StringType(), True),
                    StructField("JURISDICTI", StringType(), True),
                    StructField("ON_OFF_STR", StringType(), True),
                    StructField("OSP_ID", StringType(), True),
                    StructField("STREET_NUM", StringType(), True),
                    StructField("STREETNAME", StringType(), True),
                    StructField("STREET_SEG", StringType(), True),
                    StructField("RATEAREA", StringType(), True),
                    StructField("SFPARKAREA", StringType(), True),
                    StructField("LOCATION", StringType(), True)])

trans = ss.read.csv(transcation_input,header=True, inferSchema='true')
print(len(trans.columns))
trans.printSchema()
trans.show(1)

7
root
 |-- POST_ID: string (nullable = true)
 |-- STREET_BLOCK: string (nullable = true)
 |-- PAYMENT_TYPE: string (nullable = true)
 |-- SESSION_START_DT: string (nullable = true)
 |-- SESSION_END_DT: string (nullable = true)
 |-- METER_EVENT_TYPE: string (nullable = true)
 |-- GROSS_PAID_AMT: double (nullable = true)

+---------+--------------+------------+--------------------+--------------------+----------------+--------------+
|  POST_ID|  STREET_BLOCK|PAYMENT_TYPE|    SESSION_START_DT|      SESSION_END_DT|METER_EVENT_TYPE|GROSS_PAID_AMT|
+---------+--------------+------------+--------------------+--------------------+----------------+--------------+
|490-22190|IRVING ST 2200|        CASH|19-OCT-18 11.00.0...|19-OCT-18 11.09.2...|              NS|          0.35|
+---------+--------------+------------+--------------------+--------------------+----------------+--------------+
only showing top 1 row



In [14]:
## you can add after it expire 
## you can also add before it expires and time shows they overlap
trans.filter(trans.METER_EVENT_TYPE=='AT').filter(trans.POST_ID=='568-28390').show(10)

+---------+---------------+------------+--------------------+--------------------+----------------+--------------+
|  POST_ID|   STREET_BLOCK|PAYMENT_TYPE|    SESSION_START_DT|      SESSION_END_DT|METER_EVENT_TYPE|GROSS_PAID_AMT|
+---------+---------------+------------+--------------------+--------------------+----------------+--------------+
|568-28390|MISSION ST 2800|        CASH|22-OCT-18 10.55.0...|22-OCT-18 11.10.0...|              AT|           0.1|
|568-28390|MISSION ST 2800|        CASH|08-OCT-18 04.42.2...|08-OCT-18 04.49.0...|              AT|          0.25|
|568-28390|MISSION ST 2800|        CASH|25-OCT-18 10.33.0...|25-OCT-18 10.48.0...|              AT|          0.25|
|568-28390|MISSION ST 2800|        CASH|27-OCT-18 10.40.2...|27-OCT-18 10.55.2...|              AT|          0.05|
|568-28390|MISSION ST 2800|        CASH|12-OCT-18 11.53.5...|12-OCT-18 12.08.5...|              AT|          0.25|
|568-28390|MISSION ST 2800|        CASH|22-OCT-18 10.54.5...|22-OCT-18 11.09.5..

In [15]:
trans_1 = trans.join(parking_meters_tojoin,'POST_ID','left_outer')
trans_1.show(5)

+---------+---------------+------------+--------------------+--------------------+----------------+--------------+-------+----------+----------+----------+----------+
|  POST_ID|   STREET_BLOCK|PAYMENT_TYPE|    SESSION_START_DT|      SESSION_END_DT|METER_EVENT_TYPE|GROSS_PAID_AMT|CAP_COL|METER_TYPE|SMART_METE|ACTIVESENS|ON_OFF_STR|
+---------+---------------+------------+--------------------+--------------------+----------------+--------------+-------+----------+----------+----------+----------+
|490-22190| IRVING ST 2200|        CASH|19-OCT-18 11.00.0...|19-OCT-18 11.09.2...|              NS|          0.35|   Grey|        SS|         N|         N|        ON|
|823-00160|  CHESTNUT ST 0|        CASH|13-OCT-18 03.30.3...|13-OCT-18 04.33.2...|              AT|          0.05|   Grey|        MS|         Y|         N|        ON|
|490-21250| IRVING ST 2100|        CASH|29-OCT-18 12.40.3...|29-OCT-18 01.07.1...|              NS|           1.0|   Grey|        SS|         N|         N|        ON

## Failed parse

In [None]:
from pyspark.sql.functions import udf

# Use udf to define a row-at-a-time udf
# @udf('double')
# Input/output are both a single double value
def plus_one(v):
      return v + 1

df.withColumn('v2', plus_one(df.v))

In [89]:
trans_1.select('SESSION_START_DT').show(5)

+--------------------+
|    SESSION_START_DT|
+--------------------+
|19-OCT-18 11.00.0...|
|13-OCT-18 03.30.3...|
|29-OCT-18 12.40.3...|
|30-OCT-18 01.36.1...|
|05-OCT-18 09.42.0...|
+--------------------+
only showing top 5 rows



In [111]:
changeTime(trans_1.select('SESSION_START_DT')).show(1)

TypeError: Invalid argument, not a string or column: DataFrame[SESSION_START_DT: string] of type <class 'pyspark.sql.dataframe.DataFrame'>. For column literals, use 'lit', 'array', 'struct' or 'create_map' function.

In [113]:
from datetime import datetime
from pyspark.sql.functions import udf
# datetime_object = datetime.strptime('31-OCT-18 12.59.59 PM', '%d-%b-%y %I.%M.%S %p')
# change_time = udf(lambda x : datetime.strptime(x, '%d-%b-%y %I.%M.%S %p'))

def toTimeSafe(inval):
  try:
    return datetime.strptime(inval, '%d-%b-%y %I.%M.%S %p')
  except ValueError:
    print(inval)
    return None

changeTime = udf(toTimeSafe)

# changeTime(trans_1.select('SESSION_START_DT')).show(1)

# tran_tmp = trans_1.withColumn('SESSION_START_DT_',changeTime(trans_1['SESSION_START_DT']).cast(TimestampType()))\
#                 .withColumn('SESSION_END_DT_',changeTime(trans_1['SESSION_END_DT']).cast(TimestampType()))

tran_tmp = trans_1.withColumn('SESSION_START_DT_',changeTime(trans_1['SESSION_START_DT']).cast(TimestampType()))
tran_tmp.printSchema()
tran_tmp.show(5)

root
 |-- POST_ID: string (nullable = true)
 |-- STREET_BLOCK: string (nullable = true)
 |-- PAYMENT_TYPE: string (nullable = true)
 |-- SESSION_START_DT: string (nullable = true)
 |-- SESSION_END_DT: string (nullable = true)
 |-- METER_EVENT_TYPE: string (nullable = true)
 |-- GROSS_PAID_AMT: double (nullable = true)
 |-- CAP_COL: string (nullable = true)
 |-- METER_TYPE: string (nullable = true)
 |-- SMART_METE: string (nullable = true)
 |-- ACTIVESENS: string (nullable = true)
 |-- ON_OFF_STR: string (nullable = true)
 |-- SESSION_START_DT_: timestamp (nullable = true)

+---------+---------------+------------+--------------------+--------------------+----------------+--------------+-------+----------+----------+----------+----------+-----------------+
|  POST_ID|   STREET_BLOCK|PAYMENT_TYPE|    SESSION_START_DT|      SESSION_END_DT|METER_EVENT_TYPE|GROSS_PAID_AMT|CAP_COL|METER_TYPE|SMART_METE|ACTIVESENS|ON_OFF_STR|SESSION_START_DT_|
+---------+---------------+------------+----------

In [106]:
def toTimeSafe(inval):
  try:
    return datetime.strptime(inval, '%d-%b-%y %I.%M.%S %p')
  except ValueError:
    return None

changeTime = udf(toTimeSafe)



In [90]:
s = '31-OCT-18 12.59.59 PM'
toTimeSafe(s)

datetime.datetime(2018, 10, 31, 12, 59, 59)

## finally worked

In [16]:
def toTimeSafe(inval):
    try:
        return datetime.strptime(inval, "%d-%b-%y %I.%M.%S %p")
    except ValueError:
        return None

def toFloatSafe(inval):
    try:
        return float(inval)
    except ValueError:
        return None

In [17]:
def stringToPost(row):
    r = row.split(",")
    return Row(
    r[0].lstrip('\"').rstrip('\"'),\
      r[1].lstrip('\"').rstrip('\"'),\
      r[2].lstrip('\"').rstrip('\"'),\
      toTimeSafe(r[3].lstrip('\"').rstrip('\"')),\
      toTimeSafe(r[4].lstrip('\"').rstrip('\"')),\
      r[5].lstrip('\"').rstrip('\"'),\
      toFloatSafe(r[6].lstrip('\"').rstrip('\"'))
    )

In [19]:
trans = sc.textFile(transcation_input)
trans.take(5)

['POST_ID,STREET_BLOCK,PAYMENT_TYPE,SESSION_START_DT,SESSION_END_DT,METER_EVENT_TYPE,GROSS_PAID_AMT',
 '490-22190,IRVING ST 2200,CASH,19-OCT-18 11.00.01 AM,19-OCT-18 11.09.21 AM,NS,0.35',
 '823-00160,CHESTNUT ST 0,CASH,13-OCT-18 03.30.34 PM,13-OCT-18 04.33.25 PM,AT,0.05',
 '490-21250,IRVING ST 2100,CASH,29-OCT-18 12.40.30 PM,29-OCT-18 01.07.10 PM,NS,1.0',
 '440-37030,GEARY BLVD 3700,CREDIT CARD,30-OCT-18 01.36.13 PM,30-OCT-18 01.49.33 PM,NS,0.5']

In [20]:
trans = sc.textFile(transcation_input)
tmp = trans.first()
trans = trans.filter(lambda x: x!= tmp)
trans.take(2)

['490-22190,IRVING ST 2200,CASH,19-OCT-18 11.00.01 AM,19-OCT-18 11.09.21 AM,NS,0.35',
 '823-00160,CHESTNUT ST 0,CASH,13-OCT-18 03.30.34 PM,13-OCT-18 04.33.25 PM,AT,0.05']

In [21]:

trans = sc.textFile(transcation_input)
tmp = trans.first()
trans = trans.filter(lambda x: x!= tmp)
tran_df = trans.map(lambda x: stringToPost(x))

In [23]:
from datetime import datetime

schema = StructType([ StructField("POST_ID", StringType(), True),
                      StructField("STREET_BLOCK", StringType(), True),
                      StructField("PAYMENT_TYPE", StringType(), True),
                      StructField("SESSION_START_DT", TimestampType(), True),
                      StructField("SESSION_END_DT", TimestampType(), True),
                      StructField("METER_EVENT_TYPE", StringType(), True),
                      StructField("GROSS_PAID_AMT", DoubleType(), True),
                    ])
transaction_df = ss.createDataFrame(tran_df, schema)
transaction_df.show(10)

+---------+--------------------+------------+-------------------+-------------------+----------------+--------------+
|  POST_ID|        STREET_BLOCK|PAYMENT_TYPE|   SESSION_START_DT|     SESSION_END_DT|METER_EVENT_TYPE|GROSS_PAID_AMT|
+---------+--------------------+------------+-------------------+-------------------+----------------+--------------+
|490-22190|      IRVING ST 2200|        CASH|2018-10-19 11:00:01|2018-10-19 11:09:21|              NS|          0.35|
|823-00160|       CHESTNUT ST 0|        CASH|2018-10-13 15:30:34|2018-10-13 16:33:25|              AT|          0.05|
|490-21250|      IRVING ST 2100|        CASH|2018-10-29 12:40:30|2018-10-29 13:07:10|              NS|           1.0|
|440-37030|     GEARY BLVD 3700| CREDIT CARD|2018-10-30 13:36:13|2018-10-30 13:49:33|              NS|           0.5|
|540-00100|         LAGUNA ST 0| PAY BY CELL|2018-10-05 09:42:00|2018-10-05 10:12:00|              AT|          1.12|
|700-12080|    VALENCIA ST 1200| PAY BY CELL|2018-10-12 

In [24]:
trans_end = transaction_df.join(parking_meters_tojoin,'POST_ID','left_outer')
trans_end.show(5)

+---------+---------------+------------+-------------------+-------------------+----------------+--------------+-------+----------+----------+----------+----------+
|  POST_ID|   STREET_BLOCK|PAYMENT_TYPE|   SESSION_START_DT|     SESSION_END_DT|METER_EVENT_TYPE|GROSS_PAID_AMT|CAP_COL|METER_TYPE|SMART_METE|ACTIVESENS|ON_OFF_STR|
+---------+---------------+------------+-------------------+-------------------+----------------+--------------+-------+----------+----------+----------+----------+
|490-22190| IRVING ST 2200|        CASH|2018-10-19 11:00:01|2018-10-19 11:09:21|              NS|          0.35|   Grey|        SS|         N|         N|        ON|
|823-00160|  CHESTNUT ST 0|        CASH|2018-10-13 15:30:34|2018-10-13 16:33:25|              AT|          0.05|   Grey|        MS|         Y|         N|        ON|
|490-21250| IRVING ST 2100|        CASH|2018-10-29 12:40:30|2018-10-29 13:07:10|              NS|           1.0|   Grey|        SS|         N|         N|        ON|
|440-37030

## EDA

In [25]:
early_time = datetime.strptime('01-OCT-18 12.59.59 PM', '%d-%b-%y %I.%M.%S %p')
late_time = datetime.strptime('02-OCT-18 12.59.59 PM', '%d-%b-%y %I.%M.%S %p')

#     .filter((trans_end.SESSION_START_DT<late_time) & (trans_end.SESSION_START_DT<early_time))\

trans_end.filter((trans_end.POST_ID=='490-22190') & (trans_end.SESSION_START_DT<late_time))\
    .select(['STREET_BLOCK','SESSION_START_DT','SESSION_END_DT'])\
    .show()

KeyboardInterrupt: 

In [None]:
trans_end.filter(trans_end.POST_ID=='490-22190')\
    .select(['STREET_BLOCK','SESSION_START_DT','SESSION_END_DT'])\
    .show()

In [193]:
transaction_df.show(5)

+---------+---------------+------------+-------------------+-------------------+----------------+--------------+
|  POST_ID|   STREET_BLOCK|PAYMENT_TYPE|   SESSION_START_DT|     SESSION_END_DT|METER_EVENT_TYPE|GROSS_PAID_AMT|
+---------+---------------+------------+-------------------+-------------------+----------------+--------------+
|490-22190| IRVING ST 2200|        CASH|2018-10-19 11:00:01|2018-10-19 11:09:21|              NS|          0.35|
|823-00160|  CHESTNUT ST 0|        CASH|2018-10-13 15:30:34|2018-10-13 16:33:25|              AT|          0.05|
|490-21250| IRVING ST 2100|        CASH|2018-10-29 12:40:30|2018-10-29 13:07:10|              NS|           1.0|
|440-37030|GEARY BLVD 3700| CREDIT CARD|2018-10-30 13:36:13|2018-10-30 13:49:33|              NS|           0.5|
|540-00100|    LAGUNA ST 0| PAY BY CELL|2018-10-05 09:42:00|2018-10-05 10:12:00|              AT|          1.12|
+---------+---------------+------------+-------------------+-------------------+----------------

In [194]:
transaction_df.printSchema()

root
 |-- POST_ID: string (nullable = true)
 |-- STREET_BLOCK: string (nullable = true)
 |-- PAYMENT_TYPE: string (nullable = true)
 |-- SESSION_START_DT: timestamp (nullable = true)
 |-- SESSION_END_DT: timestamp (nullable = true)
 |-- METER_EVENT_TYPE: string (nullable = true)
 |-- GROSS_PAID_AMT: double (nullable = true)



In [155]:
post_id_count =  transaction_df.select(['POST_ID','STREET_BLOCK']).distinct()\
            .groupBy('STREET_BLOCK').count()\
            .sort('count',ascending=False)
post_id_count.show(5)


+-------------+-----+
| STREET_BLOCK|count|
+-------------+-----+
| BLUXOME ST 0|  110|
|    MAIN ST 0|  100|
|   SPEAR ST 0|   96|
|BRYANT ST 400|   77|
| DRUMM ST 200|   71|
+-------------+-----+
only showing top 5 rows



In [26]:
revenue_df = transaction_df

In [27]:
def toWeekday(x):
    try:
        return x.weekday()
    except ValueError:
        return None
    
def toHour(x):
    try:
        return hour(x)
    except ValueError:
        return None

    
from pyspark.sql.functions import dayofweek,hour,dayofyear

toWeekday = udf(toWeekday)
revenue_df = revenue_df.withColumn('SESSION_START_DT_dof',dayofweek(revenue_df['SESSION_START_DT']))
revenue_df = revenue_df.withColumn('SESSION_START_DT_h',hour(revenue_df['SESSION_START_DT']))
revenue_df = revenue_df.withColumn('SESSION_END_DT_dof',dayofweek(revenue_df['SESSION_END_DT']))
revenue_df = revenue_df.withColumn('SESSION_END_DT_h',hour(revenue_df['SESSION_END_DT']))
revenue_df = revenue_df.withColumn('doy',dayofyear(revenue_df['SESSION_END_DT']))


In [28]:
revenue_df.show(2)

+---------+--------------+------------+-------------------+-------------------+----------------+--------------+--------------------+------------------+------------------+----------------+---+
|  POST_ID|  STREET_BLOCK|PAYMENT_TYPE|   SESSION_START_DT|     SESSION_END_DT|METER_EVENT_TYPE|GROSS_PAID_AMT|SESSION_START_DT_dof|SESSION_START_DT_h|SESSION_END_DT_dof|SESSION_END_DT_h|doy|
+---------+--------------+------------+-------------------+-------------------+----------------+--------------+--------------------+------------------+------------------+----------------+---+
|490-22190|IRVING ST 2200|        CASH|2018-10-19 11:00:01|2018-10-19 11:09:21|              NS|          0.35|                   6|                11|                 6|              11|292|
|823-00160| CHESTNUT ST 0|        CASH|2018-10-13 15:30:34|2018-10-13 16:33:25|              AT|          0.05|                   7|                15|                 7|              16|286|
+---------+--------------+------------+-

In [None]:
revenue_df.groupBy('STREET_BLOCK').count()

In [30]:
revenue_df.select('PAYMENT_TYPE').distinct().show()

KeyboardInterrupt: 

In [31]:
revenue_df.printSchema()

root
 |-- POST_ID: string (nullable = true)
 |-- STREET_BLOCK: string (nullable = true)
 |-- PAYMENT_TYPE: string (nullable = true)
 |-- SESSION_START_DT: timestamp (nullable = true)
 |-- SESSION_END_DT: timestamp (nullable = true)
 |-- METER_EVENT_TYPE: string (nullable = true)
 |-- GROSS_PAID_AMT: double (nullable = true)
 |-- SESSION_START_DT_dof: integer (nullable = true)
 |-- SESSION_START_DT_h: integer (nullable = true)
 |-- SESSION_END_DT_dof: integer (nullable = true)
 |-- SESSION_END_DT_h: integer (nullable = true)
 |-- doy: integer (nullable = true)



In [201]:
'SESSION_START_DT_h','SESSION_START_DT_dof'
revenue_df.groupBy('STREET_BLOCK','doy').agg(sum("GROSS_PAID_AMT"))

TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [205]:
revenue_df.groupBy('STREET_BLOCK','doy').sum("GROSS_PAID_AMT").show(5)

KeyboardInterrupt: 

In [None]:
revenue_df.groupBy('STREET_BLOCK','doy','SESSION_START_DT_h','SESSION_START_DT_dof').sum("GROSS_PAID_AMT").show(5)

In [32]:
revenue_df.filter(revenue_df['STREET_BLOCK']=='IRVING ST 2200').groupBy('doy').sum("GROSS_PAID_AMT").show(5)

KeyboardInterrupt: 

In [None]:
revenue_df.filter(revenue_df['STREET_BLOCK']=='IRVING ST 2200').groupBy('doy').count().show(5)

In [33]:
from pyspark.sql.functions import unix_timestamp, from_unixtime, date_format, col

anotherdf = revenue_df.groupBy('STREET_BLOCK','SESSION_START_DT_h','SESSION_START_DT_dof')\
    .agg(avg("GROSS_PAID_AMT").alias('revenue'),\
         count("*").alias('numOfTransaction'),\
         count(when(col('METER_EVENT_TYPE')=='NS', True)).alias('numNS'),\
         count(when(col('METER_EVENT_TYPE')=='AT', True)).alias('numAT'),\
         count(when(col('PAYMENT_TYPE')=='CASH', True)).alias('numCASH'),\
         count(when(col('PAYMENT_TYPE')=='CREDIT CARD', True)).alias('numCC'),\
         count(when(col('PAYMENT_TYPE')=='PAY BY CELL', True)).alias('numPhone'),\
         count(when(col('PAYMENT_TYPE')=='SMART CARD', True)).alias('numSmartCard'))\
    .sort('revenue',ascending=False)

In [34]:
anotherdf.show(5)

+------------------+------------------+--------------------+------------------+----------------+-----+-----+-------+-----+--------+------------+
|      STREET_BLOCK|SESSION_START_DT_h|SESSION_START_DT_dof|           revenue|numOfTransaction|numNS|numAT|numCASH|numCC|numPhone|numSmartCard|
+------------------+------------------+--------------------+------------------+----------------+-----+-----+-------+-----+--------+------------+
|      SOUTH PARK 0|                 7|                   2|41.583333333333336|               3|    3|    0|      0|    3|       0|           0|
|      SOUTH PARK 0|                 7|                   3|             35.75|               1|    1|    0|      0|    1|       0|           0|
|WASHINGTON ST 2400|                 7|                   4|             32.25|               1|    1|    0|      0|    1|       0|           0|
|  FILLMORE ST 2100|                 6|                   4|             29.25|               1|    1|    0|      0|    1|       0

In [223]:
anotherdf.select('STREET_BLOCK').distinct().count()

1498

In [35]:
from pyspark.ml.feature import StringIndexer

def indexStringColumns(df, cols):
    #variable newdf will be updated several times
    newdf = df
    
    for c in cols:
        #For each given colum, fits StringIndexerModel.
        si = StringIndexer(inputCol=c, outputCol=c+"-num",stringOrderType="alphabetDesc")
        sm = si.fit(newdf)
        #Creates a DataFame by putting the transformed values in the new colum with suffix "-num" 
        #and then drops the original columns.
        #and drop the "-num" suffix. 
        newdf = sm.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(c+"-num", c)
    return newdf

stringindex_anotherdf = indexStringColumns(anotherdf, ['STREET_BLOCK'])

In [36]:
from pyspark.ml.feature import OneHotEncoder
def oneHotEncodeColumns(df, cols):
    newdf = df
    for c in cols:
        #For each given colum, create OneHotEncoder. 
        #dropLast : Whether to drop the last category in the encoded vector (default: true)
        onehotenc = OneHotEncoder(inputCol=c, outputCol=c+"-onehot", dropLast=False)
        #Creates a DataFame by putting the transformed values in the new colum with suffix "-onehot" 
        #and then drops the original columns.
        #and drop the "-onehot" suffix. 
        newdf = onehotenc.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(c+"-onehot", c)
    return newdf

onehot_anotherdf = oneHotEncodeColumns(stringindex_anotherdf, ['STREET_BLOCK'])

In [37]:
onehot_anotherdf.show(1)

+------------------+--------------------+------------------+----------------+-----+-----+-------+-----+--------+------------+------------------+
|SESSION_START_DT_h|SESSION_START_DT_dof|           revenue|numOfTransaction|numNS|numAT|numCASH|numCC|numPhone|numSmartCard|      STREET_BLOCK|
+------------------+--------------------+------------------+----------------+-----+-----+-------+-----+--------+------------+------------------+
|                 7|                   2|41.583333333333336|               3|    3|    0|      0|    3|       0|           0|(1498,[192],[1.0])|
+------------------+--------------------+------------------+----------------+-----+-----+-------+-----+--------+------------+------------------+
only showing top 1 row



In [39]:
# Merging the data with Vector Assembler.
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(
    inputCols=["SESSION_START_DT_h", "SESSION_START_DT_dof", "revenue","numNS","numAT","numCASH","numCC","numPhone","numSmartCard","STREET_BLOCK"],
    outputCol="features")

output = assembler.transform(onehot_anotherdf)

# va = VectorAssembler(outputCol="features", inputCols=anotherdf.columns[1:]) #except the last col.
penlpoints = output.select("features", "numOfTransaction")
penlpoints.show(3)

+--------------------+----------------+
|            features|numOfTransaction|
+--------------------+----------------+
|(1507,[0,1,2,3,6,...|               3|
|(1507,[0,1,2,3,6,...|               1|
|(1507,[0,1,2,3,6,...|               1|
+--------------------+----------------+
only showing top 3 rows



In [None]:
# endpenlpoints = penlpoints.select("features", $"numOfTransaction".alias('label'))
endpenlpoints = penlpoints.withColumnRenamed("numOfTransaction","label")
endpenlpoints.show(3)

In [40]:
splits = penlpoints.randomSplit([0.8, 0.2])

#cache() : the algorithm is interative and training and data sets are going to be reused many times.
train = splits[0].cache()
test = splits[1].cache()

In [238]:
?DecisionTreeClassifier

In [41]:
# Train the data.
# from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.regression import RandomForestRegressor
# Paramenters
#maxDepth : maximum tree depth (default : 5).
#maxBins : maximum number of bins when binning continuous features (default : 32).
#minInstancesPerNode : minimum number of dataset samples each branch needs to have after a split (default : 1).
#minInfoGain : minimum information gain for a split (default : 0).
dt = DecisionTreeClassifier(maxDepth=20, maxBins= 32, minInstancesPerNode=1, minInfoGain = 0, labelCol="numOfTransaction")
dtmodel = dt.fit(train)

KeyboardInterrupt: 

In [None]:
#Test data.
dtpredicts = dtmodel.transform(test)

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(labelCol="numOfTransaction", predictionCol="prediction", metricName="rmse")
accuracy = evaluator.evaluate(dtpredicts)
print("Test Error = %g" % (1.0 - accuracy))

In [None]:
# ml.evaluation.RegressionEvaluator
# val regEval = new RegressionEvaluator().
#   setMetricName("r2").
#   setPredictionCol("prediction").
#   setLabelCol("label")

In [42]:
# Train the model.
from pyspark.ml.regression import RandomForestRegressor
rf = RandomForestRegressor(maxDepth=10, labelCol="numOfTransaction")
rfmodel = rf.fit(train)

Py4JJavaError: An error occurred while calling o490.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 122.0 failed 1 times, most recent failure: Lost task 2.0 in stage 122.0 (TID 7218, localhost, executor driver): java.lang.OutOfMemoryError: GC overhead limit exceeded
	at java.lang.Integer.valueOf(Integer.java:832)
	at scala.runtime.BoxesRunTime.boxToInteger(BoxesRunTime.java:65)
	at scala.collection.mutable.ArrayOps$ofInt.apply(ArrayOps.scala:234)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofInt.foreach(ArrayOps.scala:234)
	at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
	at scala.collection.mutable.ArrayOps$ofInt.map(ArrayOps.scala:234)
	at org.apache.spark.ml.tree.impl.DTStatsAggregator.<init>(DTStatsAggregator.scala:54)
	at org.apache.spark.ml.tree.impl.RandomForest$$anonfun$12$$anonfun$13.apply(RandomForest.scala:541)
	at org.apache.spark.ml.tree.impl.RandomForest$$anonfun$12$$anonfun$13.apply(RandomForest.scala:537)
	at scala.Array$.tabulate(Array.scala:331)
	at org.apache.spark.ml.tree.impl.RandomForest$$anonfun$12.apply(RandomForest.scala:537)
	at org.apache.spark.ml.tree.impl.RandomForest$$anonfun$12.apply(RandomForest.scala:534)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1602)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1590)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1589)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1589)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1823)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1772)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1761)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:939)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:938)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$collectAsMap$1.apply(PairRDDFunctions.scala:743)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$collectAsMap$1.apply(PairRDDFunctions.scala:742)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.PairRDDFunctions.collectAsMap(PairRDDFunctions.scala:742)
	at org.apache.spark.ml.tree.impl.RandomForest$.findBestSplits(RandomForest.scala:563)
	at org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:198)
	at org.apache.spark.ml.regression.RandomForestRegressor.train(RandomForestRegressor.scala:130)
	at org.apache.spark.ml.regression.RandomForestRegressor.train(RandomForestRegressor.scala:45)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:82)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.OutOfMemoryError: GC overhead limit exceeded
	at java.lang.Integer.valueOf(Integer.java:832)
	at scala.runtime.BoxesRunTime.boxToInteger(BoxesRunTime.java:65)
	at scala.collection.mutable.ArrayOps$ofInt.apply(ArrayOps.scala:234)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofInt.foreach(ArrayOps.scala:234)
	at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
	at scala.collection.mutable.ArrayOps$ofInt.map(ArrayOps.scala:234)
	at org.apache.spark.ml.tree.impl.DTStatsAggregator.<init>(DTStatsAggregator.scala:54)
	at org.apache.spark.ml.tree.impl.RandomForest$$anonfun$12$$anonfun$13.apply(RandomForest.scala:541)
	at org.apache.spark.ml.tree.impl.RandomForest$$anonfun$12$$anonfun$13.apply(RandomForest.scala:537)
	at scala.Array$.tabulate(Array.scala:331)
	at org.apache.spark.ml.tree.impl.RandomForest$$anonfun$12.apply(RandomForest.scala:537)
	at org.apache.spark.ml.tree.impl.RandomForest$$anonfun$12.apply(RandomForest.scala:534)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [None]:
rfpred = rfmodel.transform(test)

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(labelCol="numOfTransaction", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(rfpred)
print(rmse)

In [None]:
wanted_df = revenue_df.groupBy("block","dow","starttime_hour")\
                      .agg(avg('revenue').alias('avgRevenue'), 
                      count('starttime_hour').alias('numNewTrans'),
                      avg('timeDiff').alias('avgDur'),
                      count(when(col('meterType')=='NS', True)).alias('numNS'),
                      count(when(col('meterType')=='AT', True)).alias('numAT'),
                      count(when(col('payType')=='CASH', True)).alias('numCASH'),
                      count(when(col('payType')=='CREDIT CARD', True)).alias('numCC'),
                      count(when(col('payType')=='PAY BY CELL', True)).alias('numPhone'),
                      count(when(col('payType')=='SMART CARD', True)).alias('numSmartCard'))

In [None]:
wanted_df.show(5)

In [None]:
wanted_df_derv = revenue_df.groupBy("block","dow","endtime_hour").count()
wanted_df_derv = wanted_df_derv.withColumnRenamed('block', 'block_derv')
wanted_df_derv = wanted_df_derv.withColumnRenamed('dow', 'dow_derv')
wanted_df_derv.sort("dow","endtime_hour", ascending=[True, True]).show()

In [None]:
wanted_df_derv.show(5)

In [None]:
wanted_df = wanted_df.join(wanted_df_derv, 
                           (wanted_df.block == wanted_df_derv.block_derv) &
                           (wanted_df.dow == wanted_df_derv.dow_derv) &
                           (wanted_df.starttime_hour == wanted_df_derv.endtime_hour)).drop('endtime_hour')


In [None]:
wanted_df = wanted_df.withColumnRenamed('count','numEndTrans')
wanted_df = wanted_df.withColumn('turnover', wanted_df['numNewTrans'] - wanted_df['numEndTrans'])

In [None]:
wanted_df = wanted_df.drop('block_derv','dow_derv').sort("block", "dow","starttime_hour", ascending=[True, True])
wanted_df.drop('block','avgRevenue','avgDur').show()

In [3]:
df = pd.read_csv('output1month.csv')
df.head()

Unnamed: 0,POST_ID,STREET_BLOCK,PAYMENT_TYPE,SESSION_START_DT,SESSION_END_DT,METER_EVENT_TYPE,GROSS_PAID_AMT
0,490-22190,IRVING ST 2200,CASH,19-OCT-18 11.00.01 AM,19-OCT-18 11.09.21 AM,NS,0.35
1,823-00160,CHESTNUT ST 0,CASH,13-OCT-18 03.30.34 PM,13-OCT-18 04.33.25 PM,AT,0.05
2,490-21250,IRVING ST 2100,CASH,29-OCT-18 12.40.30 PM,29-OCT-18 01.07.10 PM,NS,1.0
3,440-37030,GEARY BLVD 3700,CREDIT CARD,30-OCT-18 01.36.13 PM,30-OCT-18 01.49.33 PM,NS,0.5
4,540-00100,LAGUNA ST 0,PAY BY CELL,05-OCT-18 09.42.00 AM,05-OCT-18 10.12.00 AM,AT,1.12


In [27]:
parking_meters = pd.read_csv('Parking_meters.csv')
print(len(parking_meters.columns))
parking_meters.head()

16


Unnamed: 0,POST_ID,MS_ID,MS_SPACEID,CAP_COLOR,METER_TYPE,SMART_METE,ACTIVESENS,JURISDICTI,ON_OFF_STR,OSP_ID,STREET_NUM,STREETNAME,STREET_SEG,RATEAREA,SFPARKAREA,LOCATION
0,401-06340,-,0,Grey,SS,N,N,SFMTA,ON,0,634,ELLIS ST,5177000,Area 3,,"(37.78436, -122.41724)"
1,104-03190,-,0,Grey,SS,N,Y,SFMTA,ON,0,319,04TH AVE,241000,Area 3,,"(37.78263, -122.46232)"
2,352-04350,-,0,Grey,SS,N,N,SFMTA,ON,0,435,CASTRO ST,3790000,Area 3,,"(37.7614, -122.43495)"
3,116-03980,-,0,Grey,SS,N,N,SFMTA,ON,0,398,16TH AVE,672000,Area 3,,"(37.7808, -122.47505)"
4,224-27570,-,0,Grey,SS,N,N,SFMTA,ON,0,2757,24TH ST,1331000,Area 3,,"(37.75283, -122.40763)"


In [6]:
Meter_Rate_Schedules = pd.read_csv('Meter_Rate_Schedules.csv')
Meter_Rate_Schedules.head()

Unnamed: 0,Post ID,Schedule Priority,Days Applied,From Time,To Time,Rate Type,Rate
0,102-02990,1,,,,Base Rate,2.0
1,102-03890,1,,,,Base Rate,2.0
2,102-03900,1,,,,Base Rate,2.0
3,102-03910,1,,,,Base Rate,2.0
4,102-03920,1,,,,Base Rate,2.0


In [158]:
parking_meters.filter()

TypeError: filter() missing 1 required positional argument: 'condition'