# Muni Shaming: Machine Learning Component

The code in this notebook cleans the Muni stop-prediction data and then applies machine learning to it.

 ### Data Preprocessing

In [72]:
import pandas as pd
import json
import numpy as np

In [73]:
# Load the JSON file into a Python dictionary.
# The encoding is given explicitly so the read does not depend on the
# platform's default text encoding.

muni_file = "MUNI-Stop-prediction.json"
with open(muni_file, "r", encoding="utf-8") as f:
    muni_json = json.load(f)

In [74]:
# Pull the response-level metadata out of the "ServiceDelivery" envelope.
# These values apply to the whole response rather than to a single stop visit.

service_delivery = muni_json["ServiceDelivery"]

ResponseTimestamp = service_delivery["ResponseTimestamp"]
ProducerRef = service_delivery["ProducerRef"]
Status = service_delivery["Status"]
version = service_delivery["StopMonitoringDelivery"]["version"]

In [75]:
# Flatten the nested "MonitoredStopVisit" records into a DataFrame, one row per
# stop visit; nested keys become dot-separated column names.
#
# pd.json_normalize is the supported entry point: the old
# `from pandas.io.json import json_normalize` path was deprecated in pandas 1.0.
# It already returns a DataFrame, so no extra DataFrame.from_dict wrapper is needed.

MonitoredStopVisit_json = muni_json["ServiceDelivery"]["StopMonitoringDelivery"]["MonitoredStopVisit"]
muni_df = pd.json_normalize(MonitoredStopVisit_json)

muni_df.head()

Unnamed: 0,MonitoredVehicleJourney.Bearing,MonitoredVehicleJourney.DestinationName,MonitoredVehicleJourney.DestinationRef,MonitoredVehicleJourney.DirectionRef,MonitoredVehicleJourney.FramedVehicleJourneyRef.DataFrameRef,MonitoredVehicleJourney.FramedVehicleJourneyRef.DatedVehicleJourneyRef,MonitoredVehicleJourney.InCongestion,MonitoredVehicleJourney.LineRef,MonitoredVehicleJourney.Monitored,MonitoredVehicleJourney.MonitoredCall.AimedArrivalTime,...,MonitoredVehicleJourney.Occupancy,MonitoredVehicleJourney.OperatorRef,MonitoredVehicleJourney.OriginName,MonitoredVehicleJourney.OriginRef,MonitoredVehicleJourney.PublishedLineName,MonitoredVehicleJourney.VehicleLocation.Latitude,MonitoredVehicleJourney.VehicleLocation.Longitude,MonitoredVehicleJourney.VehicleRef,MonitoringRef,RecordedAtTime
0,,Paul + Third Street,14648,Outbound,2019-07-18,8775761,,29,True,2019-07-18T22:18:10Z,...,,SF,Bowley St & Lincoln Blvd,13706,SUNSET,37.7630653,-122.496071,8833,16531,2019-07-18T22:22:33Z
1,,Potrero + 25th Street,13511,Outbound,2019-07-18,8780948,,33,True,2019-07-18T22:25:38Z,...,,SF,Sacramento St & Cherry St,16293,ASHBURY-18TH ST,37.7611389,-122.430641,8157,13323,2019-07-18T22:22:33Z
2,,Munich + Geneva,15631,Outbound,2019-07-18,8789016,,43,True,2019-07-18T22:28:27Z,...,,SF,Marina Blvd & Laguna St,14729,MASONIC,37.769268,-122.450745,8960,14092,2019-07-18T22:22:33Z
3,,Munich + Geneva,15631,Outbound,2019-07-18,8788980,,43,True,2019-07-18T22:24:00Z,...,,SF,Marina Blvd & Laguna St,14729,MASONIC,37.7985458,-122.446899,8966,15293,2019-07-18T22:22:33Z
4,,Drumm + Clay,14015,Inbound,2019-07-18,8745972,,1,True,2019-07-18T22:22:48Z,...,,SF,Geary Blvd & 33rd Ave,14277,CALIFORNIA,37.7798538,-122.493126,5791,13555,2019-07-18T22:22:33Z


In [76]:
# append the metadata to the dataframe
muni_df["ResponseTimestamp"] = ResponseTimestamp
muni_df["ProducerRef"] = ProducerRef
muni_df["Status"] = Status
muni_df["version"] = version

In [77]:
# Preview only the first rows instead of rendering the entire frame.
muni_df.head(10)

Unnamed: 0,MonitoredVehicleJourney.Bearing,MonitoredVehicleJourney.DestinationName,MonitoredVehicleJourney.DestinationRef,MonitoredVehicleJourney.DirectionRef,MonitoredVehicleJourney.FramedVehicleJourneyRef.DataFrameRef,MonitoredVehicleJourney.FramedVehicleJourneyRef.DatedVehicleJourneyRef,MonitoredVehicleJourney.InCongestion,MonitoredVehicleJourney.LineRef,MonitoredVehicleJourney.Monitored,MonitoredVehicleJourney.MonitoredCall.AimedArrivalTime,...,MonitoredVehicleJourney.PublishedLineName,MonitoredVehicleJourney.VehicleLocation.Latitude,MonitoredVehicleJourney.VehicleLocation.Longitude,MonitoredVehicleJourney.VehicleRef,MonitoringRef,RecordedAtTime,ResponseTimestamp,ProducerRef,Status,version
0,,Paul + Third Street,14648,Outbound,2019-07-18,8775761,,29,True,2019-07-18T22:18:10Z,...,SUNSET,37.7630653,-122.496071,8833,16531,2019-07-18T22:22:33Z,2019-07-18T22:22:44Z,SF,True,1.4
1,,Potrero + 25th Street,13511,Outbound,2019-07-18,8780948,,33,True,2019-07-18T22:25:38Z,...,ASHBURY-18TH ST,37.7611389,-122.430641,8157,13323,2019-07-18T22:22:33Z,2019-07-18T22:22:44Z,SF,True,1.4
2,,Munich + Geneva,15631,Outbound,2019-07-18,8789016,,43,True,2019-07-18T22:28:27Z,...,MASONIC,37.769268,-122.450745,8960,14092,2019-07-18T22:22:33Z,2019-07-18T22:22:44Z,SF,True,1.4
3,,Munich + Geneva,15631,Outbound,2019-07-18,8788980,,43,True,2019-07-18T22:24:00Z,...,MASONIC,37.7985458,-122.446899,8966,15293,2019-07-18T22:22:33Z,2019-07-18T22:22:44Z,SF,True,1.4
4,,Drumm + Clay,14015,Inbound,2019-07-18,8745972,,1,True,2019-07-18T22:22:48Z,...,CALIFORNIA,37.7798538,-122.493126,5791,13555,2019-07-18T22:22:33Z,2019-07-18T22:22:44Z,SF,True,1.4
5,,Ferry Plaza,16497,Inbound,2019-07-18,8765671,,21,True,2019-07-18T22:20:00Z,...,HAYES,37.778595,-122.414963,5755,15650,2019-07-18T22:22:33Z,2019-07-18T22:22:44Z,SF,True,1.4
6,,Daly City BART,17925,Outbound,2019-07-18,8843178,,28,True,2019-07-18T22:19:16Z,...,19TH AVENUE,37.8011017,-122.426758,8921,15280,2019-07-18T22:22:33Z,2019-07-18T22:22:44Z,SF,True,1.4
7,,Temporary Transbay Terminal,17916,Inbound,2019-07-18,8782862,,38,True,2019-07-18T22:24:08Z,...,GEARY,37.7807846,-122.467186,6593,14258,2019-07-18T22:22:33Z,2019-07-18T22:22:44Z,SF,True,1.4
8,,Great Highway,14781,Outbound,2019-07-18,8793726,,48,True,2019-07-18T22:21:17Z,...,QUINTARA-24TH STREET,37.7512207,-122.435631,8661,13468,2019-07-18T22:22:33Z,2019-07-18T22:22:44Z,SF,True,1.4
9,,City College,15926,Outbound,2019-07-18,8794933,,49,True,2019-07-18T22:22:00Z,...,VAN NESS-MISSION,37.8048096,-122.425369,6615,16801,2019-07-18T22:22:33Z,2019-07-18T22:22:44Z,SF,True,1.4


In [78]:
import datetime

In [79]:
# Remove any arrival-time columns left over from a previous run of this cell.
# DataFrame.drop returns a new frame, so the result must be assigned back;
# errors="ignore" makes this a no-op when the columns do not exist yet.
muni_df = muni_df.drop(
    columns=["Scheduled Arrival Time", "Actual Arrival Time"],
    errors="ignore",
)

# Parse the timestamp strings into datetime columns.
# NOTE(review): "Actual Arrival Time" is filled from ExpectedArrivalTime, which
# looks like a predicted rather than an observed arrival — confirm this is intended.
muni_df["Scheduled Arrival Time"] = pd.to_datetime(muni_df["MonitoredVehicleJourney.MonitoredCall.AimedArrivalTime"])
muni_df["Actual Arrival Time"] = pd.to_datetime(muni_df["MonitoredVehicleJourney.MonitoredCall.ExpectedArrivalTime"])

In [80]:
# Remove any "time late" / "time early" columns left over from a previous run.
# DataFrame.drop returns a new frame, so the result must be assigned back;
# errors="ignore" makes this a no-op when the columns do not exist yet.
muni_df = muni_df.drop(columns=["time late", "time early"], errors="ignore")

In [81]:
# Signed differences between the scheduled and the actual arrival columns.
# "time early" is positive when the actual time precedes the scheduled time;
# "time late" is the same difference with the opposite sign.
scheduled_arrival = muni_df["Scheduled Arrival Time"]
actual_arrival = muni_df["Actual Arrival Time"]

muni_df["time early"] = scheduled_arrival - actual_arrival
muni_df["time late"] = actual_arrival - scheduled_arrival

In [82]:
# Express "time early" as a float number of minutes (negative values mean late).
# The same values are stored under two column names.
one_minute = np.timedelta64(1, "m")
minutes_early = muni_df["time early"] / one_minute

muni_df["minutes early/(late)"] = minutes_early
muni_df["minutes_early"] = minutes_early

In [83]:
# Calendar and time-of-day features taken from the scheduled arrival time.
# NOTE(review): the parsed timestamps display with a +00:00 offset, so these
# values appear to be UTC-based rather than local San Francisco time — confirm
# that this is what the model should see.
scheduled_parts = muni_df["Scheduled Arrival Time"].dt

muni_df["weekday"] = scheduled_parts.dayofweek
muni_df["hour"] = scheduled_parts.hour
muni_df["minute"] = scheduled_parts.minute

# Time of day as fractional hours, e.g. 22:18 -> 22.3
muni_df["time"] = muni_df["hour"] + muni_df["minute"] / 60

In [84]:
# Preview only the first rows instead of rendering the entire frame.
muni_df.head(10)

Unnamed: 0,MonitoredVehicleJourney.Bearing,MonitoredVehicleJourney.DestinationName,MonitoredVehicleJourney.DestinationRef,MonitoredVehicleJourney.DirectionRef,MonitoredVehicleJourney.FramedVehicleJourneyRef.DataFrameRef,MonitoredVehicleJourney.FramedVehicleJourneyRef.DatedVehicleJourneyRef,MonitoredVehicleJourney.InCongestion,MonitoredVehicleJourney.LineRef,MonitoredVehicleJourney.Monitored,MonitoredVehicleJourney.MonitoredCall.AimedArrivalTime,...,Scheduled Arrival Time,Actual Arrival Time,time early,time late,minutes early/(late),minutes_early,weekday,hour,minute,time
0,,Paul + Third Street,14648,Outbound,2019-07-18,8775761,,29,True,2019-07-18T22:18:10Z,...,2019-07-18 22:18:10+00:00,2019-07-18 22:22:45+00:00,-1 days +23:55:25,00:04:35,-4.583333,-4.583333,3,22,18,22.300000
1,,Potrero + 25th Street,13511,Outbound,2019-07-18,8780948,,33,True,2019-07-18T22:25:38Z,...,2019-07-18 22:25:38+00:00,2019-07-18 22:22:45+00:00,00:02:53,-1 days +23:57:07,2.883333,2.883333,3,22,25,22.416667
2,,Munich + Geneva,15631,Outbound,2019-07-18,8789016,,43,True,2019-07-18T22:28:27Z,...,2019-07-18 22:28:27+00:00,2019-07-18 22:22:45+00:00,00:05:42,-1 days +23:54:18,5.700000,5.700000,3,22,28,22.466667
3,,Munich + Geneva,15631,Outbound,2019-07-18,8788980,,43,True,2019-07-18T22:24:00Z,...,2019-07-18 22:24:00+00:00,2019-07-18 22:22:45+00:00,00:01:15,-1 days +23:58:45,1.250000,1.250000,3,22,24,22.400000
4,,Drumm + Clay,14015,Inbound,2019-07-18,8745972,,1,True,2019-07-18T22:22:48Z,...,2019-07-18 22:22:48+00:00,2019-07-18 22:22:46+00:00,00:00:02,-1 days +23:59:58,0.033333,0.033333,3,22,22,22.366667
5,,Ferry Plaza,16497,Inbound,2019-07-18,8765671,,21,True,2019-07-18T22:20:00Z,...,2019-07-18 22:20:00+00:00,2019-07-18 22:22:46+00:00,-1 days +23:57:14,00:02:46,-2.766667,-2.766667,3,22,20,22.333333
6,,Daly City BART,17925,Outbound,2019-07-18,8843178,,28,True,2019-07-18T22:19:16Z,...,2019-07-18 22:19:16+00:00,2019-07-18 22:22:46+00:00,-1 days +23:56:30,00:03:30,-3.500000,-3.500000,3,22,19,22.316667
7,,Temporary Transbay Terminal,17916,Inbound,2019-07-18,8782862,,38,True,2019-07-18T22:24:08Z,...,2019-07-18 22:24:08+00:00,2019-07-18 22:22:46+00:00,00:01:22,-1 days +23:58:38,1.366667,1.366667,3,22,24,22.400000
8,,Great Highway,14781,Outbound,2019-07-18,8793726,,48,True,2019-07-18T22:21:17Z,...,2019-07-18 22:21:17+00:00,2019-07-18 22:22:46+00:00,-1 days +23:58:31,00:01:29,-1.483333,-1.483333,3,22,21,22.350000
9,,City College,15926,Outbound,2019-07-18,8794933,,49,True,2019-07-18T22:22:00Z,...,2019-07-18 22:22:00+00:00,2019-07-18 22:22:46+00:00,-1 days +23:59:14,00:00:46,-0.766667,-0.766667,3,22,22,22.366667


In [85]:
# Show every column name as a plain Python list.
muni_df.columns.tolist()

['MonitoredVehicleJourney.Bearing',
 'MonitoredVehicleJourney.DestinationName',
 'MonitoredVehicleJourney.DestinationRef',
 'MonitoredVehicleJourney.DirectionRef',
 'MonitoredVehicleJourney.FramedVehicleJourneyRef.DataFrameRef',
 'MonitoredVehicleJourney.FramedVehicleJourneyRef.DatedVehicleJourneyRef',
 'MonitoredVehicleJourney.InCongestion',
 'MonitoredVehicleJourney.LineRef',
 'MonitoredVehicleJourney.Monitored',
 'MonitoredVehicleJourney.MonitoredCall.AimedArrivalTime',
 'MonitoredVehicleJourney.MonitoredCall.AimedDepartureTime',
 'MonitoredVehicleJourney.MonitoredCall.Distances',
 'MonitoredVehicleJourney.MonitoredCall.ExpectedArrivalTime',
 'MonitoredVehicleJourney.MonitoredCall.ExpectedDepartureTime',
 'MonitoredVehicleJourney.MonitoredCall.StopPointName',
 'MonitoredVehicleJourney.MonitoredCall.StopPointRef',
 'MonitoredVehicleJourney.MonitoredCall.VehicleAtStop',
 'MonitoredVehicleJourney.MonitoredCall.VehicleLocationAtStop',
 'MonitoredVehicleJourney.Occupancy',
 'MonitoredVeh

In [86]:
# Collect the distinct weekdays, stop references and line references in the data.
stop_column = "MonitoredVehicleJourney.MonitoredCall.StopPointRef"
line_column = "MonitoredVehicleJourney.LineRef"

weekday = set(muni_df["weekday"].values)
stop = set(muni_df[stop_column].values)
line = set(muni_df[line_column].values)

# Only the line set is printed; `weekday` and `stop` stay available for inspection.
print(line)

{'39', '33', '31BX', '2', 'N', '5R', '10', '18', '8', 'PH', '14', '31', '14R', '9R', '8BX', '12', '28', '57', '23', '21', '28R', '22', '38R', 'M', 'C', '1', '41', '45', 'NX', '25', '30X', 'L', '7', '48', '19', '3', '47', '52', 'F', '9', '55', 'PM', '36', '14X', '29', '38AX', 'J', '30', '35', '43', '66', '49', '67', '27', 'S', '7X', '38BX', '6', '38', '5', '1BX', '44', '56', '54', '31AX', '37', 'E', '24', 'KT', '1AX', '8AX'}


In [87]:
# Values chosen for the modelling step below.
# Weekday 3 is the value seen in the "weekday" column above; 'N' is one of the printed line refs.
selected_weekday = 3
# TODO: selected_stop has not been chosen yet
selected_line = 'N'

 ### Creating our Model
 Because the value we want to predict is numerical, we use a regression model.

### Defining our Model Architecture (the layers)

### Model Summary

### Compile the Model

### Training the Model

### Quantify the Data

### Making predictions with New Data