REAL TIME PARKING PREDICTION SYSTEM

1. Import Modules

In [161]:
import os
import pickle
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import requests
from sklearn import linear_model
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder


2. Load Dataset

In [115]:
# Read the raw parking-session records from the CSV export
df = pd.read_csv(filepath_or_buffer="final_parking_dataset.csv")

# Preview the top of the table
df.head(n=5)

Unnamed: 0,date_time,parking_zone
0,17/7/2022 8:47,Zone 1
1,17/7/2022 9:02,Zone 1
2,17/7/2022 11:15,Zone 1
3,18/7/2022 8:57,Zone 2
4,18/7/2022 9:12,Zone 3


3. Feature Engineering

In [116]:
# Parse the day-first `date_time` strings (e.g. "17/7/2022 8:47") into datetime values
df = df.assign(date_time=pd.to_datetime(df['date_time'], format='%d/%m/%Y %H:%M'))

In [117]:
# Inspect column dtypes and non-null counts to confirm `date_time` is now a datetime column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 245 entries, 0 to 244
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date_time     245 non-null    datetime64[ns]
 1   parking_zone  245 non-null    object        
dtypes: datetime64[ns](1), object(1)
memory usage: 4.0+ KB


In [118]:
# Tally the sessions per zone once, then report the overall and per-zone counts
zone_counts = df['parking_zone'].value_counts()

print(f"Total number of records: {df.shape[0]}")
print("Breakdown of parking zones:\n")
print(zone_counts)

Total number of records: 245
Breakdown of parking zones:

Zone 1    114
Zone 2     73
Zone 3     58
Name: parking_zone, dtype: int64


In [119]:
# Break `date_time` into its calendar/clock components via the `.dt` accessor
stamps = df['date_time'].dt

df['year'] = stamps.year
df['month'] = stamps.month
df['day'] = stamps.day
df['day_of_week'] = stamps.weekday
df['hour'] = stamps.hour
df['minute'] = stamps.minute
df['date'] = stamps.date
df['time'] = stamps.strftime('%H:%M')

# Time of day as a decimal hour, rounded to one decimal place (e.g. 08:47 -> 8.8)
df['hour_min'] = (df['hour'] + (df['minute'] / 60)).round(1)

# Preview the enriched table
df.head()

Unnamed: 0,date_time,parking_zone,year,month,day,day_of_week,hour,minute,date,time,hour_min
0,2022-07-17 08:47:00,Zone 1,2022,7,17,6,8,47,2022-07-17,08:47,8.8
1,2022-07-17 09:02:00,Zone 1,2022,7,17,6,9,2,2022-07-17,09:02,9.0
2,2022-07-17 11:15:00,Zone 1,2022,7,17,6,11,15,2022-07-17,11:15,11.2
3,2022-07-18 08:57:00,Zone 2,2022,7,18,0,8,57,2022-07-18,08:57,9.0
4,2022-07-18 09:12:00,Zone 3,2022,7,18,0,9,12,2022-07-18,09:12,9.2


In [120]:
# Build ordered categoricals without `inplace=True`: the `inplace` argument of
# `Series.cat.set_categories` was removed in pandas 2.0, and calling
# `df[col].replace(..., inplace=True)` mutates a temporary under copy-on-write.
# Deriving each label from `date_time` also keeps this cell safe to re-run.
month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
weekday_labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']  # position == Series.dt.weekday value
weekday_order = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat']   # category order starts on Sunday

# Convert `month` to an ordered categorical (Jan < ... < Dec)
df['month'] = pd.Categorical(
    df['date_time'].dt.month.map(dict(enumerate(month_labels, start=1))),
    categories=month_labels, ordered=True)

# Convert `day_of_week` to an ordered categorical (Sun < ... < Sat)
df['day_of_week'] = pd.Categorical(
    df['date_time'].dt.weekday.map(dict(enumerate(weekday_labels))),
    categories=weekday_order, ordered=True)

# Convert `hour` to an ordered categorical covering all 24 hours
df['hour'] = pd.Categorical(df['date_time'].dt.hour, categories=list(range(24)), ordered=True)

# Convert `parking_zone` to an ordered categorical
df['parking_zone'] = pd.Categorical(
    df['parking_zone'], categories=['Zone 1', 'Zone 2', 'Zone 3'], ordered=True)

# Show first 5 rows
df.head()

Unnamed: 0,date_time,parking_zone,year,month,day,day_of_week,hour,minute,date,time,hour_min
0,2022-07-17 08:47:00,Zone 1,2022,Jul,17,Sun,8,47,2022-07-17,08:47,8.8
1,2022-07-17 09:02:00,Zone 1,2022,Jul,17,Sun,9,2,2022-07-17,09:02,9.0
2,2022-07-17 11:15:00,Zone 1,2022,Jul,17,Sun,11,15,2022-07-17,11:15,11.2
3,2022-07-18 08:57:00,Zone 2,2022,Jul,18,Mon,8,57,2022-07-18,08:57,9.0
4,2022-07-18 09:12:00,Zone 3,2022,Jul,18,Mon,9,12,2022-07-18,09:12,9.2


In [121]:
def parse_date(date_string):
    """Convert a `YYYY-MM-DD` string into a `datetime.date`."""

    parsed = datetime.strptime(date_string, '%Y-%m-%d')
    return parsed.date()

In [122]:
def get_ph_and_eve(year='2021'):
    """Return lists of parsed dates for Singapore's public holidays and public holiday eves.

    Parameters
    ----------
    year : str or int, default '2021'
        Calendar year to fetch. Only 2021 and 2022 have a data.gov.sg
        resource configured here.

    Returns
    -------
    tuple of (list of datetime.date, list of datetime.date)
        The public holiday dates, and the day before each of them, in the
        order the API returns the records.

    Raises
    ------
    ValueError
        If `year` has no configured resource (previously this surfaced as an
        `UnboundLocalError` on `url`).
    requests.HTTPError
        If the API responds with an error status.
    """

    resource_ids = {
        '2021': '550f6e9e-034e-45a7-a003-cf7f7e252c9a',
        '2022': '04a78f5b-2d12-4695-a6cd-d2b072bc93fe',
    }

    year_key = str(year)
    if year_key not in resource_ids:
        supported = ', '.join(sorted(resource_ids))
        raise ValueError(f"Unsupported year {year!r}; supported years: {supported}")

    url = ('https://data.gov.sg/api/action/datastore_search?resource_id='
           + resource_ids[year_key] + '&')

    # Bound the wait and fail loudly on HTTP errors instead of on a confusing JSON/KeyError
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    records = response.json()['result']['records']

    ph_parsed = [parse_date(record['date']) for record in records]
    # An "eve" is simply the calendar day before each holiday
    eve_parsed = [holiday - timedelta(days=1) for holiday in ph_parsed]

    return ph_parsed, eve_parsed

In [123]:
# Fetch the 2021 calendar as a demonstration and print both lists, separated by a blank line
ph_parsed, eve_parsed = get_ph_and_eve(year='2021')
print(f'2021 Public Holidays: {ph_parsed}', end='\n\n')
print(f'2021 Public Holiday Eves: {eve_parsed}')

2021 Public Holidays: [datetime.date(2021, 1, 1), datetime.date(2021, 2, 12), datetime.date(2021, 2, 13), datetime.date(2021, 4, 2), datetime.date(2021, 5, 1), datetime.date(2021, 5, 13), datetime.date(2021, 5, 26), datetime.date(2021, 7, 20), datetime.date(2021, 8, 9), datetime.date(2021, 11, 4), datetime.date(2021, 12, 25)]

2021 Public Holiday Eves: [datetime.date(2020, 12, 31), datetime.date(2021, 2, 11), datetime.date(2021, 2, 12), datetime.date(2021, 4, 1), datetime.date(2021, 4, 30), datetime.date(2021, 5, 12), datetime.date(2021, 5, 25), datetime.date(2021, 7, 19), datetime.date(2021, 8, 8), datetime.date(2021, 11, 3), datetime.date(2021, 12, 24)]


In [124]:
# Create features for dates of public holidays and public holiday eves.
# The sessions shown above are dated 2022, so the 2022 calendar is the one to
# match against (the 2021 calendar could never flag any of these dates).
ph, eve = get_ph_and_eve(year='2022')
df['ph'] = np.where(df['date'].isin(ph), 'ph', '')
df['eve'] = np.where(df['date'].isin(eve), 'eve', '')

# 'ph', 'eve', 'pheve' (a holiday that is also the eve of another) or 'nil' for neither.
# Assign the result back instead of using `inplace=True` on a column selection.
df['ph_eve'] = (df['ph'] + df['eve']).replace('', 'nil')

# Select and rearrange columns
df = df[['date', 'time', 'month', 'day', 'day_of_week', 'hour',
         'minute', 'hour_min', 'ph_eve', 'parking_zone']]


In [125]:
# Show first 5 rows of the reduced table to check the new `ph_eve` column
df.head()

Unnamed: 0,date,time,month,day,day_of_week,hour,minute,hour_min,ph_eve,parking_zone
0,2022-07-17,08:47,Jul,17,Sun,8,47,8.8,nil,Zone 1
1,2022-07-17,09:02,Jul,17,Sun,9,2,9.0,nil,Zone 1
2,2022-07-17,11:15,Jul,17,Sun,11,15,11.2,nil,Zone 1
3,2022-07-18,08:57,Jul,18,Mon,8,57,9.0,nil,Zone 2
4,2022-07-18,09:12,Jul,18,Mon,9,12,9.2,nil,Zone 3


In [126]:
def extract_all_features(data, year='2021'):
    """Derive calendar and public-holiday features from a `date_time` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a `date_time` column holding either datetime values or
        strings in ``DD/MM/YYYY HH:MM`` format. A `parking_zone` column is
        optional and is carried through unchanged when present.
    year : str, default '2021'
        Forwarded to `get_ph_and_eve` to select the public-holiday calendar.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is copied, not modified) with the columns
        `date`, `time`, `month`, `day`, `day_of_week`, `hour`, `minute`,
        `hour_min`, `ph_eve`, plus `parking_zone` if it was supplied.
        `month`, `day_of_week` and `hour` are ordered categoricals.
    """

    month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    weekday_labels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']  # position == Series.dt.weekday value
    weekday_order = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat']   # category order starts on Sunday

    df = data.copy()

    # Only parse when needed: callers may pass an already-converted datetime column
    if not pd.api.types.is_datetime64_any_dtype(df['date_time']):
        df['date_time'] = pd.to_datetime(df['date_time'], format='%d/%m/%Y %H:%M')
    stamps = df['date_time'].dt

    # Create new features from `date_time`. Categoricals are built directly with
    # `pd.Categorical` because `set_categories(..., inplace=True)` no longer exists
    # in pandas 2.x and in-place `replace` on a column selection is unreliable.
    df['year'] = stamps.year
    df['month'] = pd.Categorical(
        stamps.month.map(dict(enumerate(month_labels, start=1))),
        categories=month_labels, ordered=True)
    df['day'] = stamps.day
    df['day_of_week'] = pd.Categorical(
        stamps.weekday.map(dict(enumerate(weekday_labels))),
        categories=weekday_order, ordered=True)
    df['hour'] = pd.Categorical(stamps.hour, categories=list(range(24)), ordered=True)
    df['minute'] = stamps.minute
    df['date'] = stamps.date
    df['time'] = stamps.strftime('%H:%M')
    # Time of day as a decimal hour, rounded to one decimal place
    df['hour_min'] = (stamps.hour + stamps.minute / 60).round(1)

    # Create features for dates of public holidays and public holiday eves
    ph, eve = get_ph_and_eve(year=year)
    df['ph'] = np.where(df['date'].isin(ph), 'ph', '')
    df['eve'] = np.where(df['date'].isin(eve), 'eve', '')
    df['ph_eve'] = (df['ph'] + df['eve']).replace('', 'nil')

    # Select and rearrange columns; the target is only present for labelled data
    columns = ['date', 'time', 'month', 'day', 'day_of_week', 'hour', 'minute', 'hour_min', 'ph_eve']
    if 'parking_zone' in df.columns:
        columns.append('parking_zone')

    return df[columns]

In [127]:
# Map the zone labels to the integers 1-3.
# NOTE(review): `df` is rebuilt from the CSV two cells below, so this encoding
# does not appear to reach the model (which is trained on the 'Zone N' strings)
# — confirm whether this cell is still needed.
df['parking_zone'] = df['parking_zone'].replace({'Zone 1':1, 'Zone 2' :2, 'Zone 3' :3})

In [128]:
# Display the frame to check the integer-encoded `parking_zone` column
df

Unnamed: 0,date,time,month,day,day_of_week,hour,minute,hour_min,ph_eve,parking_zone
0,2022-07-17,08:47,Jul,17,Sun,8,47,8.8,nil,1
1,2022-07-17,09:02,Jul,17,Sun,9,2,9.0,nil,1
2,2022-07-17,11:15,Jul,17,Sun,11,15,11.2,nil,1
3,2022-07-18,08:57,Jul,18,Mon,8,57,9.0,nil,2
4,2022-07-18,09:12,Jul,18,Mon,9,12,9.2,nil,3
...,...,...,...,...,...,...,...,...,...,...
240,2022-09-21,08:45,Sep,21,Wed,8,45,8.8,nil,2
241,2022-09-21,10:45,Sep,21,Wed,10,45,10.8,nil,3
242,2022-09-22,08:47,Sep,22,Thu,8,47,8.8,nil,2
243,2022-09-22,10:55,Sep,22,Thu,10,55,10.9,nil,3


In [129]:
# Rebuild `df` from the raw CSV in a single step, using the 2022 public-holiday calendar
df = extract_all_features(pd.read_csv("final_parking_dataset.csv"), year='2022')

4. Exploratory Data Analysis

In [130]:
import chart_studio.plotly as py
import chart_studio
import plotly.graph_objects as go

In [131]:
# Chart Studio credentials are read from the environment so that no secret is
# stored in the notebook. Set CHART_STUDIO_API_KEY (and optionally
# CHART_STUDIO_USERNAME) before starting the kernel.
username = os.environ.get('CHART_STUDIO_USERNAME', 'zeyalt')
api_key = os.environ.get('CHART_STUDIO_API_KEY')

if not api_key:
    raise RuntimeError(
        "CHART_STUDIO_API_KEY is not set; export it before running the plotting cells."
    )

# Set up connection to chart studio
chart_studio.tools.set_credentials_file(username=username, api_key=api_key)

In [132]:
# Plot-friendly copy of `df` with the holiday codes spelled out
holiday_labels = {'eve': 'Public Holiday Eve', 'nil': 'Neither', 'ph': 'Public Holiday'}
df1 = df.replace(to_replace=holiday_labels)
df1.head()

Unnamed: 0,date,time,month,day,day_of_week,hour,minute,hour_min,ph_eve,parking_zone
0,2022-07-17,08:47,Jul,17,Sun,8,47,8.8,Neither,Zone 1
1,2022-07-17,09:02,Jul,17,Sun,9,2,9.0,Neither,Zone 1
2,2022-07-17,11:15,Jul,17,Sun,11,15,11.2,Neither,Zone 1
3,2022-07-18,08:57,Jul,18,Mon,8,57,9.0,Neither,Zone 2
4,2022-07-18,09:12,Jul,18,Mon,9,12,9.2,Neither,Zone 3


In [133]:
def plot_bar(data, variable, colour='cadetblue'):
    """Draw a bar chart of the number of rows per value of `variable`.

    The figure is shown inline and also passed to `py.plot` under the
    filename ``"by_" + variable``.
    """

    axis_titles = {'month': 'Month', 'day_of_week': 'Day of the Week',
                   'hour': 'Hour', 'parking_zone': 'Parking Zone'}

    # One row per distinct value, with its frequency in a "Count" column
    counts = data.groupby(by=variable).size().reset_index(name="Count")

    bars = go.Bar(x=counts[variable], y=counts['Count'], marker_color=colour)

    fig = go.Figure()
    fig.add_trace(bars)
    fig.update_layout(title="",
                      xaxis_title=axis_titles.get(variable),
                      yaxis_title="Number of Parking Sessions")

    fig.show()
    py.plot(fig, filename="by_" + variable, auto_open=True)

4.1 Distribution of Parking Sessions by month

In [134]:
# Session counts per month (plot_bar also sends the figure to py.plot as "by_month")
plot_bar(df1, "month", colour='chocolate')

4.2 Distribution of Parking Sessions by day of the week

In [135]:
# Session counts per day of the week (sent to py.plot as "by_day_of_week")
plot_bar(data=df1, variable="day_of_week", colour='forestgreen')

4.3 Distribution of Parking Sessions by hour of the day

In [136]:
# Session counts per hour of the day (sent to py.plot as "by_hour")
plot_bar(data=df1, variable="hour", colour='maroon')

4.4 Distribution of Parking Sessions by parking zones (Class Distribution)

In [137]:
# Session counts per parking zone, i.e. the class balance of the target (sent to py.plot as "by_parking_zone")
plot_bar(data=df1, variable="parking_zone", colour='crimson')

4.5 Distribution by hour of the day and parking zones

In [138]:
def plot_stacked_bar(data, primary_var, stacking_var, by_percentage=False):
    """Plot a stacked bar chart of `primary_var`, with bars split by `stacking_var`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both `primary_var` and `stacking_var` columns.
    primary_var : str
        Column placed on the x-axis. 'hour', 'day_of_week' and 'ph_eve' have
        dedicated colour palettes; any other column uses a default palette.
    stacking_var : str
        Column whose distinct values become the stacked segments.
    by_percentage : bool, default False
        If True, each x-axis group is normalised so its segments show the
        percentage (rounded to 1 decimal) of that group's sessions.

    The figure is shown inline and also passed to `py.plot` under the
    filename ``primary_var + "_stacked_by_" + stacking_var``.
    """

    # `observed=False` spells out the behaviour the categorical columns already relied on
    counts = data.groupby(by=[primary_var, stacking_var], observed=False).size()

    if by_percentage:
        # `transform` keeps the original index, unlike `groupby(level=0).apply`,
        # which on pandas 2.x prepends the group key and breaks `reset_index`.
        totals = counts.groupby(level=0, observed=False).transform('sum')
        grouped_data = (100 * counts / totals).round(1).reset_index(name="Count")
        yaxis_label = "Percentage of Parking Sessions (%)"

    else:
        grouped_data = counts.reset_index(name="Count")
        yaxis_label = "Number of Parking Sessions"

    mapping = {'month': 'Month', 'day_of_week': 'Day of the Week',
               'hour': 'Hour', 'parking_zone': 'Parking Zone'}

    palettes = {
        'hour': ["#6ed2b0", "#3abf91", "#2a8867", "#19513e"],
        'day_of_week': ["#e7d1a1", "#d9b568", "#ab832a", "#554115"],
        'ph_eve': ["#aea4e4", "#7d6dd3", "#4231a5", "#211852"],
    }
    # Fall back to a neutral palette instead of failing on an unbound variable
    default_palette = ["#9ecae1", "#6baed6", "#3182bd", "#08519c"]
    marker_colors = palettes.get(primary_var, default_palette)

    traces = []
    for position, level in enumerate(sorted(data[stacking_var].unique())):
        level_rows = grouped_data[grouped_data[stacking_var] == level]
        # Cycle through the palette so that no stacking level is silently dropped
        color = marker_colors[position % len(marker_colors)]
        traces.append(go.Bar(name=level, x=level_rows[primary_var],
                             y=level_rows['Count'], marker_color=color))

    fig = go.Figure(data=traces)
    fig.update_layout(barmode='stack',
                      title="",
                      xaxis_title=mapping.get(primary_var),
                      yaxis_title=yaxis_label,
                      xaxis_nticks=36)

    fig.show()
    py.plot(fig, filename=primary_var + "_stacked_by_" + stacking_var, auto_open=True)

In [139]:
def plot_heatmap(data, variable1, variable2, colorscale="Purples"):
    """Show a heatmap of row counts, with `variable1` on the y-axis and `variable2` on the x-axis."""

    axis_titles = {'month': 'Month', 'day_of_week': 'Day of the Week',
                   'hour': 'Hour', 'parking_zone': 'Parking Zone'}

    # Count the `date` entries per (variable1, variable2) pair, then spread into a grid
    pair_counts = data.groupby(by=[variable1, variable2])["date"].count()
    long_table = pair_counts.to_frame('Count').reset_index()
    grid = long_table.pivot(index=variable1, columns=variable2, values="Count")

    heatmap = go.Heatmap(z=grid, x=grid.columns, y=grid.index, colorscale=colorscale)
    fig = go.Figure(data=heatmap)

    fig.update_layout(title="",
                      xaxis_title=axis_titles.get(variable2),
                      yaxis_title=axis_titles.get(variable1),
                      xaxis_nticks=36)

    fig.show()
    # The py.plot call is intentionally left disabled for heatmaps:
    # py.plot(fig, filename=variable1 + "_by_" + variable2, auto_open=True)

In [140]:
# Zone vs. hour: a count heatmap, then a stacked bar where each hour is normalised to percentages
plot_heatmap(df1, 'parking_zone', 'hour')
plot_stacked_bar(df1, 'hour', 'parking_zone', by_percentage=True)

4.6 Distribution by the day of the week and parking zone

In [141]:
# Zone vs. day of the week: a count heatmap, then a stacked bar where each day is normalised to percentages
plot_heatmap(df1, 'parking_zone', 'day_of_week')
plot_stacked_bar(df1, 'day_of_week', 'parking_zone', by_percentage=True)

4.7 Heatmap of day of the week by month

In [142]:
# Session counts for each (day of week, month) pair
plot_heatmap(data=df1, variable1="day_of_week", variable2="month", colorscale="Blues")

4.8 Heatmap of day of the week by hour

In [143]:
# Session counts for each (day of week, hour) pair
plot_heatmap(data=df1, variable1="day_of_week", variable2="hour", colorscale="Oranges")

6. Building a baseline model

In [149]:
# Independent copy of `df` restricted to the three model inputs plus the target
train_df = df[["hour_min", "day_of_week", "ph_eve", "parking_zone"]].copy()

# Preview the training table
train_df.head()

Unnamed: 0,hour_min,day_of_week,ph_eve,parking_zone
0,8.8,Sun,nil,Zone 1
1,9.0,Sun,nil,Zone 1
2,11.2,Sun,nil,Zone 1
3,9.0,Mon,nil,Zone 2
4,9.2,Mon,nil,Zone 3


In [163]:
# Columns handled by each preprocessing branch
numeric_features = ["hour_min"]
categorical_features = ["day_of_week", "ph_eve"]

# Numerical branch: standardise with `StandardScaler`
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# Categorical branch: one-hot encode, ignoring categories unseen during fitting
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Route each group of columns through its branch with a `ColumnTransformer`
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Full prediction pipeline: preprocessing followed by multinomial Logistic Regression
classifier = LogisticRegression(multi_class='multinomial')
clf = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", classifier)])

In [164]:
# Separate the inputs from the `parking_zone` target
X, y = train_df.drop(columns='parking_zone'), train_df['parking_zone']

# Hold out 20% of the rows for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit the full pipeline on the training rows, then predict the held-out rows
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the held-out rows
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      Zone 1       0.47      0.71      0.57        21
      Zone 2       0.75      0.16      0.26        19
      Zone 3       0.31      0.44      0.36         9

    accuracy                           0.45        49
   macro avg       0.51      0.44      0.40        49
weighted avg       0.55      0.45      0.41        49



In [166]:
# Save the fitted pipeline; the context manager guarantees the file is flushed and closed
filename = 'modelfinal.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

7. Generating inferences

In [167]:
# Sample date and time inputs
# date, time = '2022-01-12', '16:05'
date, time = '2023-7-04', '11:30'

# Save input as a single-row DataFrame.
# The string is converted to datetime here because extract_all_features parses
# strings with the day-first '%d/%m/%Y %H:%M' format, which this input does not follow.
# NOTE(review): the holiday calendar requested is '2022' while the sample date is
# in 2023, so `ph_eve` is presumably always 'nil' for this input — confirm whether
# a 2023 calendar should be looked up instead.
predict_df = pd.DataFrame({'date_time': date + ' ' + time}, index=[0])
predict_df['date_time'] = pd.to_datetime(predict_df['date_time'])
predict_df = extract_all_features(predict_df, '2022')
# Keep only the columns the pipeline was fitted on
predict_df = predict_df[['hour_min', 'day_of_week', 'ph_eve']]

# Generate predictions: the most likely zone and the per-class probabilities
print(f"Predicted parking zone: {clf.predict(predict_df)[0]}")
print(f"Predicted probabilities: {clf.predict_proba(predict_df)[0]}")

Predicted parking zone: Zone 1
Predicted probabilities: [0.52213597 0.28811372 0.18975031]
