In [4]:
import pandas as pd
from matplotlib import pyplot


In [5]:
# Load the training CSV; the trailing expression displays (rows, columns).
# NOTE(review): '/content/...' is a hard-coded absolute path — presumably a file
# uploaded to the Colab session; confirm it exists before a fresh Run All.
training_data = pd.read_csv('/content/train_E1GspfA.csv')
training_data.shape

(18247, 3)

In [6]:
# Mount Google Drive at /content/drive.
# NOTE(review): the CSV reads visible in this notebook use /content/, not
# /content/drive — confirm this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Preview the first rows to check the raw columns: date, hour, demand.
training_data.head()

Unnamed: 0,date,hour,demand
0,2018-08-18,9,91
1,2018-08-18,10,21
2,2018-08-18,13,23
3,2018-08-18,14,104
4,2018-08-18,15,81


In [8]:
# Column dtypes and non-null counts, used to spot missing values and columns needing conversion.
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18247 entries, 0 to 18246
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    18247 non-null  object
 1   hour    18247 non-null  int64 
 2   demand  18247 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 427.8+ KB


The training data has no null values: every data point has a value for every feature.

The _date_ feature has object as its data type, which needs to be converted to DateTime

In [9]:
# Parse the date strings so the `.dt` accessor can be used on this column.
training_data['date'] = pd.to_datetime(training_data['date'])

The training data covers the following date range:

In [10]:
# Earliest and latest calendar dates present in the training data.
observed_days = training_data['date'].dt.date
first_day = observed_days.min()
last_day = observed_days.max()
print(f"starting date : {first_day}")
print(f"end date : {last_day}")

starting date : 2018-08-18
end date : 2021-02-28


Instead of keeping _hour_ as a separate feature/column, _date_ and _hour_ can be combined to form a single timestamp.

In [11]:
def dataPreprocessing(dataFrame):
    """Merge the ``date`` and ``hour`` columns into a single hourly timestamp.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Frame with a ``date`` column (date strings or datetimes) and an
        integer ``hour`` column giving the hour of the day.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``date`` holds ``date + hour`` as a datetime and
        the ``hour`` column is removed. The input frame is left untouched, so
        the function can be called again on the same raw frame.
    """
    # pd.to_timedelta(unit='h') is the supported way to turn integer hours
    # into a time offset; casting ints with astype('timedelta64[h]') is
    # rejected by recent pandas versions.
    hour_offset = pd.to_timedelta(dataFrame['hour'], unit='h')
    timestamps = pd.to_datetime(dataFrame['date']) + hour_offset
    # assign/drop both return new frames, so the caller's data is not mutated.
    return dataFrame.assign(date=timestamps).drop(columns=['hour'])

Dropping the non required column : _hour_

In [12]:
# Replace the raw frame with its preprocessed version and preview the result.
# NOTE: training_data no longer has an `hour` column after this cell, so
# re-running it without re-loading the CSV fails.
training_data = dataPreprocessing(training_data)
training_data.head()

Unnamed: 0,date,demand
0,2018-08-18 09:00:00,91
1,2018-08-18 10:00:00,21
2,2018-08-18 13:00:00,23
3,2018-08-18 14:00:00,104
4,2018-08-18 15:00:00,81


# Exploratory Data Analysis

In [13]:
import plotly.express as px

In [14]:
# Line chart of demand against the combined timestamp.
fig = px.line(training_data, x='date', y='demand')

# Show a range slider on the x-axis so a time window can be selected.
fig.update_xaxes(rangeslider_visible=True)
fig.show()

# Splitting training data into train and validation sets

In [15]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt


In [16]:
# Rename to the `ds` (timestamp) / `y` (target) column names used by the
# model cells; reassigning instead of inplace=True keeps the step explicit.
training_data = training_data.rename(columns={'date': 'ds', 'demand': 'y'})

# Random 80/20 split with a fixed seed so the partition is reproducible.
# NOTE(review): rows are sampled at random rather than split by time, so
# validation timestamps are interleaved with training ones — confirm this is
# the intended evaluation for a forecasting task.
train_data = training_data.sample(frac=0.8, random_state=10)

# Every row not sampled into the training part goes to validation.
validation_data = training_data.drop(train_data.index)

print(f'training data size : {train_data.shape}')
print(f'validation data size : {validation_data.shape}')

# drop=True discards the old row labels; without it they are inserted as a
# stray `index` column in the frames that are later passed to the model.
train_data = train_data.reset_index(drop=True)
validation_data = validation_data.reset_index(drop=True)

training data size : (14598, 2)
validation data size : (3649, 2)


# Prediction Models

importing required libraries

In [17]:
from sklearn.metrics import mean_absolute_error
from fbprophet import Prophet

fitting the model on the training data

In [18]:
# Prophet with default settings, fitted on the training split, which carries
# the `ds`/`y` columns produced by the rename in the split cell.
model = Prophet()
model.fit(train_data)

<fbprophet.forecaster.Prophet at 0x7f091afcc4d0>

Performing prediction on the validation dataset

In [19]:
# Forecast only at the held-out timestamps.
validation_timestamps = pd.DataFrame({'ds': validation_data['ds']})
prediction = model.predict(validation_timestamps)

y_actual = validation_data['y']
# Keep the point forecast as whole numbers; astype(int) drops the fractional part.
y_predicted = prediction['yhat'].astype(int)

# NOTE(review): the error is computed position by position; this assumes
# predict() returns rows in the same order as validation_data — confirm.
mean_absolute_error(y_actual, y_predicted)

29.6821046862154

Plotting results of predictions on validation dataset

In [20]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# One shared time axis; actual values use the primary y-axis and predictions
# the secondary y-axis.
fig = make_subplots(specs=[[{"secondary_y": True}]])

actual_trace = go.Scatter(x=validation_data['ds'], y=y_actual, name="actual targets")
predicted_trace = go.Scatter(x=validation_data['ds'], y=y_predicted, name="predicted targets")

fig.add_trace(actual_trace, secondary_y=False)
fig.add_trace(predicted_trace, secondary_y=True)

# Figure title and axis labels.
fig.update_layout(title_text="Actual vs Predicted Targets")
fig.update_xaxes(title_text="Timeline")
fig.update_yaxes(title_text="<b>actual</b> targets", secondary_y=False)
fig.update_yaxes(title_text="<b>predicted</b> targets", secondary_y=True)

fig.show()

# Predictions on test dataset

In [21]:
# NOTE(review): this is the same path that the training cell reads
# ('/content/train_E1GspfA.csv'), and the loaded frame has the training shape
# and a `demand` column — it looks like the training file is being used as the
# test set. Confirm and point this at the real test CSV.
test_data = pd.read_csv('/content/train_E1GspfA.csv')
print(f'test dataset size : {test_data.shape}')
# Preprocess a copy so test_data keeps its original columns for the submission file.
testing_data = dataPreprocessing(test_data.copy())
testing_data.head()

test dataset size : (18247, 3)


Unnamed: 0,date,demand
0,2018-08-18 09:00:00,91
1,2018-08-18 10:00:00,21
2,2018-08-18 13:00:00,23
3,2018-08-18 14:00:00,104
4,2018-08-18 15:00:00,81


In [22]:
# Forecast demand at every timestamp of the preprocessed test frame.
test_prediction = model.predict(pd.DataFrame({'ds':testing_data['date']}))

In [23]:
# Take the point forecast as whole numbers (astype(int) drops the fractional
# part). A separate name is used instead of overwriting test_prediction, so
# this cell can be re-run without failing on a missing 'yhat' key.
test_demand = test_prediction['yhat'].astype(int)

# NOTE(review): the assignment aligns on the row index; this assumes
# predict() returns rows in the same order as test_data — confirm.
test_data['demand'] = test_demand
test_data.to_csv('submission.csv', index=False)

# The preview goes last so the cell actually displays it.
test_data.head()