## Predicting traffic on the traffic tracker dataset

In [7]:
import findspark
findspark.init()
import pyspark
from pyspark import SQLContext
from pyspark import SparkContext

# Resource settings are passed through SparkConf instead of
# SparkContext.setSystemProperty(): the latter starts the driver JVM before it
# writes the property, which is too late for 'spark.driver.memory' to size
# that JVM's heap. A SparkConf handed to the context is forwarded when the JVM
# is launched.
# NOTE(review): confirm the effective values in the Spark UI "Environment" tab.
spark_conf = (
    pyspark.SparkConf()
    .setMaster('spark://192.168.11.239:7077')
    .setAppName('predict_trafic')
    .set('spark.executor.memory', '2400m')
    .set('spark.driver.cores', '2')
    .set('spark.driver.memory', '6g')
)

# getOrCreate() hands back the live context when one already exists, so
# re-running this cell no longer fails with
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(conf=spark_conf)
sqlContext = SQLContext(sc)

In [8]:
from pyspark.sql.types import StringType, IntegerType
from datetime import datetime
import pyspark.sql.functions as F #avoid conflicts with regular python functions
from pyspark.sql.functions import udf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
from fbprophet import Prophet
# import org.apache.spark.sql.types.IntegerType

In [None]:
# Read the traffic tracker CSV; header='true' takes column names from the first row.
traffic_csv = "/datasets/ttracker1309.csv"
df = sqlContext.read.csv(traffic_csv, header='true')

# Derived columns: a calendar date parsed from the "Time" string, and the
# SPEED column cast from string to float.
parsed_date = F.to_date("Time", 'MM/dd/yyyy hh:mm:ss a')
speed_as_float = F.col("SPEED").cast('float')
df = df.withColumn('Date', parsed_date).withColumn('SpeedValues', speed_as_float)

In [None]:
# df.groupBy("SEGMENTID").count()

In [None]:
# Average the speed per calendar day in Spark, then collect the result as a pandas frame.
daily_mean_sdf = df.groupBy("Date").mean("SpeedValues")
pd_df = daily_mean_sdf.toPandas()
# Order chronologically and renumber the rows 0..n-1.
pd_df = pd_df.sort_values(by="Date").reset_index(drop=True)

In [None]:
# Preview the first five daily averages.
pd_df.head(n=5)

In [None]:
# Prophet reads the timestamp from a column named 'ds' and the target from 'y'.
# The rename comes first and the conversion works on 'ds': rename() ignores
# labels that are absent and pd.to_datetime() leaves an already-converted
# column unchanged, so this cell can be re-run safely (indexing
# pd_df["Date"] after the rename raised KeyError on a second run).
pd_df = pd_df.rename(columns={'Date': 'ds', 'avg(SpeedValues)': 'y'})
pd_df["ds"] = pd.to_datetime(pd_df["ds"])

In [None]:
# Size of the evaluation window: the last `holdout_days` rows are withheld
# from training and only used to score the forecast.
holdout_days = 60

# mcmc_samples > 0 makes Prophet run full MCMC sampling instead of MAP estimation.
m = Prophet(mcmc_samples=300)
m.fit(pd_df[:-holdout_days])

# Forecast on the dates that are actually present in the data (training rows
# followed by the held-out rows). Appending `holdout_days` consecutive calendar
# days after the last training date only lines up with the held-out rows when
# the series has exactly one row per day with no gaps; predicting on the real
# dates keeps fcst row-for-row aligned with pd_df in either case.
future = pd_df[['ds']].copy()
fcst = m.predict(future)

In [None]:
# Put the last 60 forecasts next to the last 60 observed daily means.
# pandas aligns the two Series on their index labels when building the frame.
# NOTE(review): int() truncates each forecast toward zero before it is compared
# with the float observations — confirm that this is intended.
predicted_tail = fcst.yhat.iloc[-60:].map(int)
observed_tail = pd_df.y.iloc[-60:]
forecasted_data = {'predicted': predicted_tail, 'true': observed_tail}
df_pred = pd.DataFrame(forecasted_data)
df_pred.head(50)

In [None]:
# Show the first 50 rows of the predicted-vs-true table.
df_pred.head(n=50)

In [None]:
import numpy as np

In [None]:
# Signed error per row: positive means the forecast was above the observation.
residuals = df_pred["predicted"] - df_pred["true"]
forecast_error = residuals.to_numpy()

# Bias (mean signed error), MAE, MSE and RMSE over the compared rows.
mean_forecast_error = forecast_error.mean()
mean_absolute_error = np.absolute(forecast_error).mean()
mean_squared_error = np.square(forecast_error).mean()
rmse = np.sqrt(mean_squared_error)

In [None]:
# Report the raw error vector followed by the summary metrics, one per line.
metric_report = (
    ("Forecast error: ", forecast_error),
    ("Bias: ", mean_forecast_error),
    ("MAE: ", mean_absolute_error),
    ("MSE: ", mean_squared_error),
    ("RMSE: ", rmse),
)
for label, value in metric_report:
    print(label, value)

In [None]:
# Line plot of both df_pred columns (predicted and true) on shared axes.
plot_title = "Predicted traffic speed of segment 1309"
ax = df_pred.plot(title=plot_title, figsize=(10, 7))
# Trailing semicolon suppresses the Text(...) repr in the cell output.
ax.set_ylabel("Speed of traffic(mph)");

In [5]:
# Stop the SparkContext created at the top of the notebook.
sc.stop()

In [15]:
# Display the predicted-vs-true table (pandas only; no Spark context needed).
df_pred

Unnamed: 0,predicted,true
72,21,20.222222
73,21,21.363636
74,22,20.181818
75,21,24.857143
76,21,21.636364
77,21,20.375
78,21,20.857143
79,21,21.333333
80,21,24.142857
81,22,21.285714


In [6]:
# Second stop() call on the same context; an earlier cell already calls sc.stop().
sc.stop()