As of now, forecasting minute data seems like a binary time series prediction task which prophet doesnt support. Implement this later using other algorithms.

In [1]:
import os
import findspark
findspark.init()
import pyspark
from pyspark import SQLContext
sc = pyspark.SparkContext(master='spark://192.168.11.239:7077', appName='data_cleaner')

In [2]:
sqlContext = SQLContext(sc)

In [3]:
from datetime import datetime
import pyspark.sql.functions as F #avoid conflicts with regular python functions
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, StringType
import pandas as pd
from fbprophet import Prophet
import matplotlib.pyplot as plt
from fbprophet.plot import plot_plotly
import plotly.offline as py

In [4]:
df = sqlContext.read.csv("/datasets/district11.csv", header='true')

In [9]:
# sc.stop()

In [None]:
# dfX = df.select('X Coordinate', df["X Coordinate"].cast('float').alias('X'))
# dfX.select("X").rdd.max()[0]

In [None]:
df = (df
       .withColumn('Timestamps', F.to_timestamp("Date", 'MM/dd/yyyy hh:mm:ss a'))
      )

In [None]:
df.select("Timestamps").show(5) 

In [22]:
pd_df = df.groupBy("Timestamps").count().toPandas().sort_values(by = "Timestamps").reset_index(drop=True)

In [23]:
pd_df

Unnamed: 0,Timestamps,count
0,2001-01-01 00:00:00,11
1,2001-01-01 00:01:00,7
2,2001-01-01 00:05:00,1
3,2001-01-01 00:15:00,2
4,2001-01-01 00:55:00,1
...,...,...
390864,2020-01-30 18:57:00,1
390865,2020-01-30 19:17:00,1
390866,2020-01-30 19:21:00,1
390867,2020-01-30 19:30:00,1


In [24]:
pd_df.dtypes

Timestamps    datetime64[ns]
count                  int64
dtype: object

In [25]:
pd_df = pd_df.rename(columns={'Timestamps': 'ds', 'count': 'y'})
pd_df.head()

Unnamed: 0,ds,y
0,2001-01-01 00:00:00,11
1,2001-01-01 00:01:00,7
2,2001-01-01 00:05:00,1
3,2001-01-01 00:15:00,2
4,2001-01-01 00:55:00,1


In [26]:
#minute_freq takes a datatime as input and converts it to to closest freq minute
def minute_freq(x,freq):
    converted_minute = (freq * round(float(x.minute/freq)))
    if converted_minute > 0 and converted_minute%60 == 0: #Rounded up to next hour
        return x.replace(hour=(int(x.hour)+1)%24,minute=converted_minute%60,second=0)
    else:
        return x.replace(minute=converted_minute%60,second=0)

Convert data to every 10th minute

In [67]:
pd_train = pd_df.copy()
pd_train.ds = pd_train.ds.apply(lambda x: minute_freq(x,10))
pd_train.head(10)

Unnamed: 0,ds,y
0,2001-01-01 00:00:00,11
1,2001-01-01 00:00:00,7
2,2001-01-01 00:00:00,1
3,2001-01-01 00:20:00,2
4,2001-01-01 01:00:00,1
5,2001-01-01 01:00:00,1
6,2001-01-01 01:40:00,1
7,2001-01-01 02:00:00,1
8,2001-01-01 02:20:00,1
9,2001-01-01 02:30:00,1


Aggregate the count for evert 10th minute

In [68]:
pd_train = pd_train.groupby(["ds"],as_index=False).sum()
pd_train.head(10)

Unnamed: 0,ds,y
0,2001-01-01 00:00:00,19
1,2001-01-01 00:20:00,2
2,2001-01-01 01:00:00,2
3,2001-01-01 01:40:00,1
4,2001-01-01 02:00:00,1
5,2001-01-01 02:20:00,1
6,2001-01-01 02:30:00,1
7,2001-01-01 03:00:00,3
8,2001-01-01 03:40:00,1
9,2001-01-01 04:00:00,2


Upsample the time intervals. Some of them has no data, then fill them with 0

In [69]:
pd_train.head(10)

Unnamed: 0,ds,y
0,2001-01-01 00:00:00,19
1,2001-01-01 00:20:00,2
2,2001-01-01 01:00:00,2
3,2001-01-01 01:40:00,1
4,2001-01-01 02:00:00,1
5,2001-01-01 02:20:00,1
6,2001-01-01 02:30:00,1
7,2001-01-01 03:00:00,3
8,2001-01-01 03:40:00,1
9,2001-01-01 04:00:00,2


In [70]:
pd_train = pd_train.resample('10min', on='ds').sum().reset_index()

Train a model based on the new data

In [71]:
len(pd_train)

1003519

In [73]:
m = Prophet(interval_width=0.65)
m.fit(pd_train[:20000])
future = m.make_future_dataframe(periods=10000, freq='10min')
fcst = m.predict(future)

INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.

Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.



In [74]:
fcst.yhat.apply(lambda x: int(x)).value_counts()

0    30000
Name: yhat, dtype: int64

In [72]:
pd_train[:20000].y.value_counts()

0     12880
1      5018
2      1542
3       415
4       103
5        21
6        10
8         4
7         2
12        1
11        1
10        1
9         1
19        1
Name: y, dtype: int64

In [86]:
forecasted_data = {'time':fcst.ds[4000:],'predicted': fcst.yhat[4000:].apply(lambda x: int(x)), 'true': pd_train.y[4000:(len(fcst))]}
df_pred = pd.DataFrame(data=forecasted_data)
df_pred[:20]

Unnamed: 0,time,predicted,true
4000,2001-03-14 18:15:00,1,1
4001,2001-03-14 18:25:00,1,1
4002,2001-03-14 18:35:00,1,3
4003,2001-03-14 18:45:00,1,1
4004,2001-03-14 18:55:00,1,1
4005,2001-03-14 19:05:00,1,1
4006,2001-03-14 19:15:00,1,1
4007,2001-03-14 19:25:00,1,1
4008,2001-03-14 19:35:00,1,1
4009,2001-03-14 19:45:00,1,1
