## TTC Streetcar Statisical Analysis: 
### K-Nearest Neighbours Predictive Modelling on 'Min Delay' Groups

### Importing packages

In [1]:
#Importing packages

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as st

# Import machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Allows us to look at the full dataset without truncating data
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

### TTC Dataset Download

In [3]:
import requests

# Toronto Open Data is stored in a CKAN instance. It's APIs are documented here:
# https://docs.ckan.org/en/latest/api/

# To hit our API, you'll be making requests to:
base_url = "https://ckan0.cf.opendata.inter.prod-toronto.ca"

# Datasets are called "packages". Each package can contain many "resources"
# To retrieve the metadata for this package and its resources, use the package name in this page's URL:
url = base_url + "/api/3/action/package_show"
params = { "id": "ttc-streetcar-delay-data"}
package = requests.get(url, params = params).json()

# To get resource data:
for idx, resource in enumerate(package["result"]["resources"]):
    print('{idx}, {name}, {resource}'.format(idx=idx, name=resource['name'], resource=resource))

    # To get metadata for non datastore_active resources:
    if not resource["datastore_active"]:
        url = base_url + "/api/3/action/resource_show?id=" + resource["id"]
        resource_metadata = requests.get(url).json()
        print(resource_metadata)
        # From here, you can use the "url" attribute to download this file

0, ttc-streetcar-delay-data-readme, {'cache_last_updated': None, 'cache_url': None, 'created': '2019-07-23T18:12:22.590326', 'datastore_active': False, 'format': 'XLSX', 'hash': '', 'id': '0fe61851-c67b-49bc-8c27-3a89b33b43af', 'is_datastore_cache_file': False, 'is_preview': 'False', 'last_modified': '2022-04-05T19:14:38', 'metadata_modified': '2022-04-12T18:45:32.386401', 'mimetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'mimetype_inner': None, 'name': 'ttc-streetcar-delay-data-readme', 'package_id': 'b68cb71b-44a7-4394-97e2-5d2f41462a5d', 'position': 0, 'resource_type': None, 'revision_id': '3cfbbfd6-02f1-4f06-ae0c-df11d4485b90', 'size': 12886, 'state': 'active', 'url': 'https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/b68cb71b-44a7-4394-97e2-5d2f41462a5d/resource/0fe61851-c67b-49bc-8c27-3a89b33b43af/download/ttc-streetcar-delay-data-readme.xlsx', 'url_type': 'upload'}
{'help': 'https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/help_sh

{'help': 'https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/help_show?name=resource_show', 'success': True, 'result': {'cache_last_updated': None, 'cache_url': None, 'created': '2019-07-23T18:12:21.771553', 'datastore_active': False, 'format': 'XLSX', 'hash': '', 'id': '437a5889-2bc9-4bd3-9561-0ae4a9722655', 'is_datastore_cache_file': False, 'is_preview': 'True', 'last_modified': '2022-04-05T19:14:00', 'metadata_modified': '2022-04-12T18:45:49.226782', 'mimetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'mimetype_inner': None, 'name': 'ttc-streetcar-delay-data-2019', 'package_id': 'b68cb71b-44a7-4394-97e2-5d2f41462a5d', 'position': 6, 'resource_type': None, 'revision_id': '3cfbbfd6-02f1-4f06-ae0c-df11d4485b90', 'size': 759867, 'state': 'active', 'url': 'https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/b68cb71b-44a7-4394-97e2-5d2f41462a5d/resource/437a5889-2bc9-4bd3-9561-0ae4a9722655/download/ttc-streetcar-delay-data-2019.xlsx', 'url_type':

{'help': 'https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/help_show?name=resource_show', 'success': True, 'result': {'cache_last_updated': None, 'cache_url': None, 'created': '2024-02-12T21:10:56.622203', 'datastore_active': False, 'extract_job': 'Airflow: upload_remote_files', 'format': 'XLSX', 'hash': '', 'id': '5f527714-2284-437b-958b-c02b6f21eb9d', 'is_preview': 'False', 'last_modified': '2024-02-12T20:43:55', 'metadata_modified': '2024-02-12T21:10:58.160033', 'mimetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'mimetype_inner': None, 'name': 'ttc-streetcar-delay-data-2024', 'package_id': 'b68cb71b-44a7-4394-97e2-5d2f41462a5d', 'position': 12, 'resource_type': None, 'size': 91628, 'state': 'active', 'url': 'https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/b68cb71b-44a7-4394-97e2-5d2f41462a5d/resource/5f527714-2284-437b-958b-c02b6f21eb9d/download/ttc-streetcar-delay-data-2024.xlsx', 'url_type': 'upload'}}


In [4]:
for idx, resource in enumerate(package["result"]["resources"]):
    xlsx_url = resource["url"]
    response = requests.get(xlsx_url)
    
    if response.status_code == 200:
        resourcename = resource['name'].replace("\t","") # This code is to take into account a naming error at datasource
        
        # Save streetcar data as .xlsx file
        xlsx_filename = f"{resourcename}.xlsx"
        with open(xlsx_filename, "wb") as file:
            file.write(response.content)
        print(f"XLSX resource downloaded successfully. Check '{xlsx_filename}' file.")
    else:
        print("Failed to download the XLSX resource.")
        break  # Exit the loop if there was an error

XLSX resource downloaded successfully. Check 'ttc-streetcar-delay-data-readme.xlsx' file.
XLSX resource downloaded successfully. Check 'ttc-streetcar-delay-data-2014.xlsx' file.
XLSX resource downloaded successfully. Check 'ttc-streetcar-delay-data-2015.xlsx' file.
XLSX resource downloaded successfully. Check 'ttc-streetcar-delay-data-2016.xlsx' file.
XLSX resource downloaded successfully. Check 'ttc-streetcar-delay-data-2017.xlsx' file.
XLSX resource downloaded successfully. Check 'ttc-streetcar-delay-data-2018.xlsx' file.
XLSX resource downloaded successfully. Check 'ttc-streetcar-delay-data-2019.xlsx' file.
XLSX resource downloaded successfully. Check 'ttc-streetcar-delay-data-2020.xlsx' file.
XLSX resource downloaded successfully. Check 'ttc-streetcar-delay-data-2021.xlsx' file.
XLSX resource downloaded successfully. Check 'ttc-streetcar-delay-data-2020.xlsx' file.
XLSX resource downloaded successfully. Check 'ttc-streetcar-delay-data-2022.xlsx' file.
XLSX resource downloaded succe

### Dataset Contents

In [5]:
df_readme = pd.read_excel('ttc-streetcar-delay-data-readme.xlsx')
df_readme

Unnamed: 0,Field Name,Description,Example
0,Report Date,The date (YYYY/MM/DD) when the delay-causing incident occurred,2017-06-20 00:00:00
1,Route,The number of the streetcar route,51
2,Time,The time (hh:mm:ss AM/PM) when the delay-causing incident occurred,00:35:00
3,Day,The name of the day,Monday
4,Location,The location of the delay-causing incident,York Mills Station
5,Incident,The description of the delay-causing incident,Mechanical
6,Min Delay,"The delay, in minutes, to the schedule for the following streetcar",10
7,Min Gap,"The total scheduled time, in minutes, from the streetcar ahead of the following streetcar",20
8,Direction,"The direction of the bus route where B,b or BW indicates both ways. (On an east west route, it includes both east and west) NB - northbound, \nSB - southbound, \nEB - eastbound, \nWB - westbound\n\nThe direction is not case sensitive",N
9,Vehicle,Vehicle number,1057


In [6]:
# Import 2022 Streetcar Dataset
df = pd.read_excel('ttc-streetcar-delay-data-2022.xlsx')

In [7]:
# Create groups for Min Delay
maxDelay = df['Min Delay'].max() + 1

df['MinDelayGroup'] = pd.cut(df['Min Delay'], bins=[0,1,5,10,15,20,maxDelay], 
    right=False, labels=['< 1 minute', '1-4 minutes', '5-9 minutes', '10-14 minutes', '15-19 minutes','20+ minutes'])

# Remove 'Min Delay' column
df.drop('Min Delay',inplace=True,axis=1)

In [8]:
df['MinDelayGroup'].value_counts()

5-9 minutes      9588
10-14 minutes    3489
20+ minutes      2157
< 1 minute        915
15-19 minutes     878
1-4 minutes       628
Name: MinDelayGroup, dtype: int64

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17655 entries, 0 to 17654
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Date           17655 non-null  datetime64[ns]
 1   Line           17612 non-null  object        
 2   Time           17655 non-null  object        
 3   Day            17655 non-null  object        
 4   Location       17655 non-null  object        
 5   Incident       17655 non-null  object        
 6   Min Gap        17655 non-null  int64         
 7   Bound          14606 non-null  object        
 8   Vehicle        17655 non-null  int64         
 9   MinDelayGroup  17655 non-null  category      
dtypes: category(1), datetime64[ns](1), int64(2), object(6)
memory usage: 1.2+ MB


### K-Nearest Neighbours Predictive Modelling on 'Min Delay' Groups

### 'Incident' Data

In [10]:
# Assemble dataset for 'Incident' Data
MLdata = df.loc[:, ['MinDelayGroup', 'Incident']]

MLdata.head(5)

Unnamed: 0,MinDelayGroup,Incident
0,20+ minutes,Collision - TTC Involved
1,15-19 minutes,Operations
2,15-19 minutes,Operations
3,20+ minutes,Operations
4,5-9 minutes,Security


In [11]:
# One-hot encoding for 'Incident' Data
encoder = OneHotEncoder()
OneHotArray = encoder.fit_transform(MLdata[['Incident']]).toarray()
incident = [f'Incident_{item}' for item in encoder.categories_[0]]
MLdata[incident] = OneHotArray

# Remove 'Incident' (object)
MLdata.drop('Incident',inplace=True,axis=1)
MLdata.head()

Unnamed: 0,MinDelayGroup,Incident_Cleaning - Disinfection,Incident_Cleaning - Unsanitary,Incident_Collision - TTC Involved,Incident_Diversion,Incident_Emergency Services,Incident_General Delay,Incident_Held By,Incident_Investigation,Incident_Late Entering Service,Incident_Mechanical,Incident_Operations,Incident_Overhead,Incident_Rail/Switches,Incident_Security,Incident_Utilized Off Route
0,20+ minutes,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,15-19 minutes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,15-19 minutes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,20+ minutes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,5-9 minutes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [12]:
# Check that data are numeric
MLdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17655 entries, 0 to 17654
Data columns (total 16 columns):
 #   Column                             Non-Null Count  Dtype   
---  ------                             --------------  -----   
 0   MinDelayGroup                      17655 non-null  category
 1   Incident_Cleaning - Disinfection   17655 non-null  float64 
 2   Incident_Cleaning - Unsanitary     17655 non-null  float64 
 3   Incident_Collision - TTC Involved  17655 non-null  float64 
 4   Incident_Diversion                 17655 non-null  float64 
 5   Incident_Emergency Services        17655 non-null  float64 
 6   Incident_General Delay             17655 non-null  float64 
 7   Incident_Held By                   17655 non-null  float64 
 8   Incident_Investigation             17655 non-null  float64 
 9   Incident_Late Entering Service     17655 non-null  float64 
 10  Incident_Mechanical                17655 non-null  float64 
 11  Incident_Operations                17655 

In [13]:
# Convert dataframe to arrays for machine learning
MLarray = MLdata.values

In [14]:
# Create arrays for predictor and response variables
predictors = MLarray[:,1:15]
response = MLarray[:,0]

In [15]:
# Create training and test data subsets
predictors_train, predictors_test, response_train, response_test = train_test_split(predictors, response, 
    test_size=0.2, random_state=8)

In [16]:
# Create data frame for KNN
MLmodels = []
MLmodels.append(('KNN', KNeighborsClassifier(n_neighbors=133)))

# Run KNN and print accuracy score (and standard deviation)
MLresults = []
MLnames = []

for name, model in MLmodels:
    kfold = KFold(n_splits=10, shuffle=True, random_state=8)
    crossVal = cross_val_score(model, predictors_train, response_train, cv=kfold, scoring='balanced_accuracy')
    MLresults.append(crossVal)
    MLnames.append(name)
    resultsSummary = "%s: %f (%f)" % (name, crossVal.mean(), crossVal.std())
    print(resultsSummary)

KNN: 0.247433 (0.023489)


### 'Line' Data

In [17]:
# Assemble dataset for 'Line' Data
MLdata = df.loc[:, ['MinDelayGroup', 'Line']]

# Set column as a 'string'
MLdata['Line'] = MLdata['Line'].astype(str)

# Include only valid streetcar lines
MLdata = MLdata[(MLdata['Line'] == '301') | (MLdata['Line'] == '304') | (MLdata['Line'] == '306') |\
    (MLdata['Line'] == '310') | (MLdata['Line'] == '500') | (MLdata['Line'] == '501') | (MLdata['Line'] == '503') |\
    (MLdata['Line'] == '504') | (MLdata['Line'] == '505') | (MLdata['Line'] == '506') | (MLdata['Line'] == '508') |\
    (MLdata['Line'] == '509') | (MLdata['Line'] == '510') | (MLdata['Line'] == '511') | (MLdata['Line'] == '512')]

In [18]:
MLdata['Line'].unique()

array(['504', '501', '510', '301', '505', '512', '511', '506', '509',
       '503', '304', '306', '310', '500', '508'], dtype=object)

In [19]:
# One-hot encoding for 'Line' Data
encoder = OneHotEncoder()
OneHotArray = encoder.fit_transform(MLdata[['Line']]).toarray()
line = [f'Line_{item}' for item in encoder.categories_[0]]
MLdata[line] = OneHotArray

# Remove 'Line' (object)
MLdata.drop('Line',inplace=True,axis=1)
MLdata.head()

Unnamed: 0,MinDelayGroup,Line_301,Line_304,Line_306,Line_310,Line_500,Line_501,Line_503,Line_504,Line_505,Line_506,Line_508,Line_509,Line_510,Line_511,Line_512
0,20+ minutes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,15-19 minutes,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,15-19 minutes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20+ minutes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,5-9 minutes,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Check that data are numeric
MLdata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17558 entries, 0 to 17654
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   MinDelayGroup  17558 non-null  category
 1   Line_301       17558 non-null  float64 
 2   Line_304       17558 non-null  float64 
 3   Line_306       17558 non-null  float64 
 4   Line_310       17558 non-null  float64 
 5   Line_500       17558 non-null  float64 
 6   Line_501       17558 non-null  float64 
 7   Line_503       17558 non-null  float64 
 8   Line_504       17558 non-null  float64 
 9   Line_505       17558 non-null  float64 
 10  Line_506       17558 non-null  float64 
 11  Line_508       17558 non-null  float64 
 12  Line_509       17558 non-null  float64 
 13  Line_510       17558 non-null  float64 
 14  Line_511       17558 non-null  float64 
 15  Line_512       17558 non-null  float64 
dtypes: category(1), float64(15)
memory usage: 2.2 MB


In [21]:
# Convert dataframe to arrays for machine learning
MLarray = MLdata.values

In [22]:
# Create arrays for predictor and response variables
predictors = MLarray[:,1:15]
response = MLarray[:,0]

In [23]:
# Create training and test data subsets
predictors_train, predictors_test, response_train, response_test = train_test_split(predictors, response, 
    test_size=0.2, random_state=8)

In [24]:
# Create data frame for KNN
MLmodels = []
MLmodels.append(('KNN', KNeighborsClassifier(n_neighbors=133)))

# Run KNN and print accuracy score (and standard deviation)
MLresults = []
MLnames = []

for name, model in MLmodels:
    kfold = KFold(n_splits=10, shuffle=True, random_state=8)
    crossVal = cross_val_score(model, predictors_train, response_train, cv=kfold, scoring='balanced_accuracy')
    MLresults.append(crossVal)
    MLnames.append(name)
    resultsSummary = "%s: %f (%f)" % (name, crossVal.mean(), crossVal.std())
    print(resultsSummary)

KNN: 0.182894 (0.003631)


### 'Month' Data

In [25]:
# Assemble dataset for 'Month' Data
df['Month'] = df['Date'].dt.month
MLdata = df.loc[:, ['MinDelayGroup', 'Month']]

# Set 'Month' column as a 'string'
MLdata['Month'] = MLdata['Month'].astype(str)

In [26]:
# One-hot encoding for 'Month' Data
encoder = OneHotEncoder()
OneHotArray = encoder.fit_transform(MLdata[['Month']]).toarray()
month = [f'Month_{item}' for item in encoder.categories_[0]]
MLdata[month] = OneHotArray

# Remove 'Month' (object)
MLdata.drop('Month',inplace=True,axis=1)
MLdata.head()

Unnamed: 0,MinDelayGroup,Month_1,Month_10,Month_11,Month_12,Month_2,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9
0,20+ minutes,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,15-19 minutes,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,15-19 minutes,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20+ minutes,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5-9 minutes,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Check that data are numeric
MLdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17655 entries, 0 to 17654
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   MinDelayGroup  17655 non-null  category
 1   Month_1        17655 non-null  float64 
 2   Month_10       17655 non-null  float64 
 3   Month_11       17655 non-null  float64 
 4   Month_12       17655 non-null  float64 
 5   Month_2        17655 non-null  float64 
 6   Month_3        17655 non-null  float64 
 7   Month_4        17655 non-null  float64 
 8   Month_5        17655 non-null  float64 
 9   Month_6        17655 non-null  float64 
 10  Month_7        17655 non-null  float64 
 11  Month_8        17655 non-null  float64 
 12  Month_9        17655 non-null  float64 
dtypes: category(1), float64(12)
memory usage: 1.6 MB


In [28]:
# Convert dataframe to arrays for machine learning
MLarray = MLdata.values

In [29]:
# Create arrays for predictor and response variables
predictors = MLarray[:,1:12]
response = MLarray[:,0]

In [30]:
# Create training and test data subsets
predictors_train, predictors_test, response_train, response_test = train_test_split(predictors, response, 
    test_size=0.2, random_state=8)

In [31]:
# Create data frame for KNN
MLmodels = []
MLmodels.append(('KNN', KNeighborsClassifier(n_neighbors=133)))

# Run KNN and print accuracy score (and standard deviation)
MLresults = []
MLnames = []

for name, model in MLmodels:
    kfold = KFold(n_splits=10, shuffle=True, random_state=8)
    crossVal = cross_val_score(model, predictors_train, response_train, cv=kfold, scoring='balanced_accuracy')
    MLresults.append(crossVal)
    MLnames.append(name)
    resultsSummary = "%s: %f (%f)" % (name, crossVal.mean(), crossVal.std())
    print(resultsSummary)

KNN: 0.166667 (0.000000)


### 'Day of Week' Data

In [32]:
# Assemble dataset for 'Day' Data
MLdata = df.loc[:, ['MinDelayGroup', 'Day']]

In [33]:
# One-hot encoding for 'Day' Data
encoder = OneHotEncoder()
OneHotArray = encoder.fit_transform(MLdata[['Day']]).toarray()
day = [f'Day_{item}' for item in encoder.categories_[0]]
MLdata[day] = OneHotArray

# Remove 'Day' (object)
MLdata.drop('Day',inplace=True,axis=1)
MLdata.head()

Unnamed: 0,MinDelayGroup,Day_Friday,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday
0,20+ minutes,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,15-19 minutes,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,15-19 minutes,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,20+ minutes,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,5-9 minutes,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [34]:
# Check that data are numeric
MLdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17655 entries, 0 to 17654
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   MinDelayGroup  17655 non-null  category
 1   Day_Friday     17655 non-null  float64 
 2   Day_Monday     17655 non-null  float64 
 3   Day_Saturday   17655 non-null  float64 
 4   Day_Sunday     17655 non-null  float64 
 5   Day_Thursday   17655 non-null  float64 
 6   Day_Tuesday    17655 non-null  float64 
 7   Day_Wednesday  17655 non-null  float64 
dtypes: category(1), float64(7)
memory usage: 983.1 KB


In [35]:
# Convert dataframe to arrays for machine learning
MLarray = MLdata.values

In [36]:
# Create arrays for predictor and response variables
predictors = MLarray[:,1:7]
response = MLarray[:,0]

In [37]:
# Create training and test data subsets
predictors_train, predictors_test, response_train, response_test = train_test_split(predictors, response, 
    test_size=0.2, random_state=8)

In [38]:
# Create data frame for KNN
MLmodels = []
MLmodels.append(('KNN', KNeighborsClassifier(n_neighbors=133)))

# Run KNN and print accuracy score (and standard deviation)
MLresults = []
MLnames = []

for name, model in MLmodels:
    kfold = KFold(n_splits=10, shuffle=True, random_state=8)
    crossVal = cross_val_score(model, predictors_train, response_train, cv=kfold, scoring='balanced_accuracy')
    MLresults.append(crossVal)
    MLnames.append(name)
    resultsSummary = "%s: %f (%f)" % (name, crossVal.mean(), crossVal.std())
    print(resultsSummary)

KNN: 0.166667 (0.000000)


### 'Hour' Data

In [39]:
# Assemble dataset for 'Hour' Data
df['Time'] = pd.to_datetime(df['Time'])
df['Hour'] = df['Time'].dt.hour
MLdata = df.loc[:, ['MinDelayGroup', 'Hour']]

In [40]:
# Check that data are numeric
MLdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17655 entries, 0 to 17654
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   MinDelayGroup  17655 non-null  category
 1   Hour           17655 non-null  int64   
dtypes: category(1), int64(1)
memory usage: 155.5 KB


In [41]:
# Convert dataframe to arrays for machine learning
MLarray = MLdata.values

In [42]:
# Create arrays for predictor and response variables
predictors = MLarray[:,1]
response = MLarray[:,0]

In [43]:
# Create training and test data subsets
predictors_train, predictors_test, response_train, response_test = train_test_split(predictors, response, 
    test_size=0.2, random_state=8)

In [44]:
# Create data frame for KNN
MLmodels = []
MLmodels.append(('KNN', KNeighborsClassifier(n_neighbors=133)))

# Run KNN and print accuracy score (and standard deviation)
MLresults = []
MLnames = []

for name, model in MLmodels:
    kfold = KFold(n_splits=10, shuffle=True, random_state=8)
    crossVal = cross_val_score(model, predictors_train, response_train, cv=kfold, scoring='balanced_accuracy')
    MLresults.append(crossVal)
    MLnames.append(name)
    resultsSummary = "%s: %f (%f)" % (name, crossVal.mean(), crossVal.std())
    print(resultsSummary)

KNN: nan (nan)


10 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\chukatri\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\chukatri\anaconda3\lib\site-packages\sklearn\neighbors\_classification.py", line 198, in fit
    return self._fit(X, y)
  File "C:\Users\chukatri\anaconda3\lib\site-packages\sklearn\neighbors\_base.py", line 400, in _fit
    X, y = self._validate_data(X, y, accept_sparse="csr", multi_output=True)
  File "C:\Users\chukatri\anaconda3\lib\site-packages\sklearn\base.py", line 581, in _validate_data

### Time-related Predictors (Hour, Day, Month)

In [45]:
# Assemble dataset for Time-related Data
MLdata = df.loc[:, ['MinDelayGroup', 'Hour', 'Day', 'Month']]

# Set all predictor columns as a 'string'
MLdata['Day'] = MLdata['Day'].astype(str)
MLdata['Month'] = MLdata['Month'].astype(str)

In [46]:
# One-hot encoding for 'Day' Data
encoder = OneHotEncoder()
OneHotArray = encoder.fit_transform(MLdata[['Day']]).toarray()
day = [f'Day_{item}' for item in encoder.categories_[0]]
MLdata[day] = OneHotArray

# Remove 'Day' (object)
MLdata.drop('Day',inplace=True,axis=1)
#MLdata.head()

In [47]:
# One-hot encoding for 'Month' Data
encoder = OneHotEncoder()
OneHotArray = encoder.fit_transform(MLdata[['Month']]).toarray()
month = [f'Month_{item}' for item in encoder.categories_[0]]
MLdata[month] = OneHotArray

# Remove 'Month' (object)
MLdata.drop('Month',inplace=True,axis=1)
MLdata.head()

Unnamed: 0,MinDelayGroup,Hour,Day_Friday,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday,Month_1,...,Month_11,Month_12,Month_2,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9
0,20+ minutes,2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,15-19 minutes,3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,15-19 minutes,3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20+ minutes,3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5-9 minutes,3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
# Check that data are numeric
MLdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17655 entries, 0 to 17654
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   MinDelayGroup  17655 non-null  category
 1   Hour           17655 non-null  int64   
 2   Day_Friday     17655 non-null  float64 
 3   Day_Monday     17655 non-null  float64 
 4   Day_Saturday   17655 non-null  float64 
 5   Day_Sunday     17655 non-null  float64 
 6   Day_Thursday   17655 non-null  float64 
 7   Day_Tuesday    17655 non-null  float64 
 8   Day_Wednesday  17655 non-null  float64 
 9   Month_1        17655 non-null  float64 
 10  Month_10       17655 non-null  float64 
 11  Month_11       17655 non-null  float64 
 12  Month_12       17655 non-null  float64 
 13  Month_2        17655 non-null  float64 
 14  Month_3        17655 non-null  float64 
 15  Month_4        17655 non-null  float64 
 16  Month_5        17655 non-null  float64 
 17  Month_6        17655 non-null  

In [49]:
# Convert dataframe to arrays for machine learning
MLarray = MLdata.values

In [50]:
# Create arrays for predictor and response variables
predictors = MLarray[:,1:20]
response = MLarray[:,0]

In [51]:
# Create training and test data subsets
predictors_train, predictors_test, response_train, response_test = train_test_split(predictors, response, 
    test_size=0.2, random_state=8)

In [52]:
# Create data frame for KNN
MLmodels = []
MLmodels.append(('KNN', KNeighborsClassifier(n_neighbors=133)))

# Run KNN and print accuracy score (and standard deviation)
MLresults = []
MLnames = []

for name, model in MLmodels:
    kfold = KFold(n_splits=10, shuffle=True, random_state=8)
    crossVal = cross_val_score(model, predictors_train, response_train, cv=kfold, scoring='balanced_accuracy')
    MLresults.append(crossVal)
    MLnames.append(name)
    resultsSummary = "%s: %f (%f)" % (name, crossVal.mean(), crossVal.std())
    print(resultsSummary)

KNN: 0.199163 (0.004595)


### Time-related Predictors & Line

In [53]:
# Assemble dataset for Time-related and 'Line' Data
MLdata = df.loc[:, ['MinDelayGroup', 'Hour', 'Day', 'Month', 'Line']]

# Set all predictor columns as a 'string'
MLdata['Day'] = MLdata['Day'].astype(str)
MLdata['Month'] = MLdata['Month'].astype(str)
MLdata['Line'] = MLdata['Line'].astype(str)

# Include only valid streetcar lines
MLdata = MLdata[(MLdata['Line'] == '301') | (MLdata['Line'] == '304') | (MLdata['Line'] == '306') |\
    (MLdata['Line'] == '310') | (MLdata['Line'] == '500') | (MLdata['Line'] == '501') | (MLdata['Line'] == '503') |\
    (MLdata['Line'] == '504') | (MLdata['Line'] == '505') | (MLdata['Line'] == '506') | (MLdata['Line'] == '508') |\
    (MLdata['Line'] == '509') | (MLdata['Line'] == '510') | (MLdata['Line'] == '511') | (MLdata['Line'] == '512')]

In [54]:
# One-hot encoding for 'Day' Data
encoder = OneHotEncoder()
OneHotArray = encoder.fit_transform(MLdata[['Day']]).toarray()
day = [f'Day_{item}' for item in encoder.categories_[0]]
MLdata[day] = OneHotArray

# Remove 'Day' (object)
MLdata.drop('Day',inplace=True,axis=1)
#MLdata.head()

In [55]:
# One-hot encoding for 'Month' Data
encoder = OneHotEncoder()
OneHotArray = encoder.fit_transform(MLdata[['Month']]).toarray()
month = [f'Month_{item}' for item in encoder.categories_[0]]
MLdata[month] = OneHotArray

# Remove 'Month' (object)
MLdata.drop('Month',inplace=True,axis=1)
#MLdata.head()

In [56]:
# One-hot encoding for 'Line' Data
encoder = OneHotEncoder()
OneHotArray = encoder.fit_transform(MLdata[['Line']]).toarray()
line = [f'Line_{item}' for item in encoder.categories_[0]]
MLdata[line] = OneHotArray

# Remove 'Line' (object)
MLdata.drop('Line',inplace=True,axis=1)
MLdata.head()

Unnamed: 0,MinDelayGroup,Hour,Day_Friday,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday,Month_1,...,Line_501,Line_503,Line_504,Line_505,Line_506,Line_508,Line_509,Line_510,Line_511,Line_512
0,20+ minutes,2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,15-19 minutes,3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,15-19 minutes,3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20+ minutes,3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,5-9 minutes,3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
# Check that data are numeric
MLdata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17558 entries, 0 to 17654
Data columns (total 36 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   MinDelayGroup  17558 non-null  category
 1   Hour           17558 non-null  int64   
 2   Day_Friday     17558 non-null  float64 
 3   Day_Monday     17558 non-null  float64 
 4   Day_Saturday   17558 non-null  float64 
 5   Day_Sunday     17558 non-null  float64 
 6   Day_Thursday   17558 non-null  float64 
 7   Day_Tuesday    17558 non-null  float64 
 8   Day_Wednesday  17558 non-null  float64 
 9   Month_1        17558 non-null  float64 
 10  Month_10       17558 non-null  float64 
 11  Month_11       17558 non-null  float64 
 12  Month_12       17558 non-null  float64 
 13  Month_2        17558 non-null  float64 
 14  Month_3        17558 non-null  float64 
 15  Month_4        17558 non-null  float64 
 16  Month_5        17558 non-null  float64 
 17  Month_6        17558 non-null  

In [58]:
# Convert dataframe to arrays for machine learning
MLarray = MLdata.values

In [59]:
# Create arrays for predictor and response variables
predictors = MLarray[:,1:35]
response = MLarray[:,0]

In [60]:
# Create training and test data subsets
predictors_train, predictors_test, response_train, response_test = train_test_split(predictors, response, 
    test_size=0.2, random_state=8)

In [61]:
# Create data frame for KNN
MLmodels = []
MLmodels.append(('KNN', KNeighborsClassifier(n_neighbors=133)))

# Run KNN and print accuracy score (and standard deviation)
MLresults = []
MLnames = []

for name, model in MLmodels:
    kfold = KFold(n_splits=10, shuffle=True, random_state=8)
    crossVal = cross_val_score(model, predictors_train, response_train, cv=kfold, scoring='balanced_accuracy')
    MLresults.append(crossVal)
    MLnames.append(name)
    resultsSummary = "%s: %f (%f)" % (name, crossVal.mean(), crossVal.std())
    print(resultsSummary)

KNN: 0.201431 (0.004650)


### Time-related Predictors & 'Incident'

In [62]:
# Assemble dataset for Time-related and 'Incident' Data
MLdata = df.loc[:, ['MinDelayGroup', 'Hour', 'Day', 'Month', 'Incident']]

# Set all predictor columns as a 'string'
MLdata['Day'] = MLdata['Day'].astype(str)
MLdata['Month'] = MLdata['Month'].astype(str)
MLdata['Incident'] = MLdata['Incident'].astype(str)

In [63]:
# One-hot encoding for 'Day' Data
encoder = OneHotEncoder()
OneHotArray = encoder.fit_transform(MLdata[['Day']]).toarray()
day = [f'Day_{item}' for item in encoder.categories_[0]]
MLdata[day] = OneHotArray

# Remove 'Day' (object)
MLdata.drop('Day',inplace=True,axis=1)
#MLdata.head()

In [64]:
# One-hot encoding for 'Month' Data
encoder = OneHotEncoder()
OneHotArray = encoder.fit_transform(MLdata[['Month']]).toarray()
month = [f'Month_{item}' for item in encoder.categories_[0]]
MLdata[month] = OneHotArray

# Remove 'Month' (object)
MLdata.drop('Month',inplace=True,axis=1)
#MLdata.head()

In [65]:
# One-hot encoding for 'Incident' Data
encoder = OneHotEncoder()
OneHotArray = encoder.fit_transform(MLdata[['Incident']]).toarray()
incident = [f'Incident_{item}' for item in encoder.categories_[0]]
MLdata[incident] = OneHotArray

# Remove 'Incident' (object)
MLdata.drop('Incident',inplace=True,axis=1)
MLdata.head()

Unnamed: 0,MinDelayGroup,Hour,Day_Friday,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday,Month_1,...,Incident_General Delay,Incident_Held By,Incident_Investigation,Incident_Late Entering Service,Incident_Mechanical,Incident_Operations,Incident_Overhead,Incident_Rail/Switches,Incident_Security,Incident_Utilized Off Route
0,20+ minutes,2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,15-19 minutes,3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,15-19 minutes,3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,20+ minutes,3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,5-9 minutes,3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [66]:
# Check that data are numeric
MLdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17655 entries, 0 to 17654
Data columns (total 36 columns):
 #   Column                             Non-Null Count  Dtype   
---  ------                             --------------  -----   
 0   MinDelayGroup                      17655 non-null  category
 1   Hour                               17655 non-null  int64   
 2   Day_Friday                         17655 non-null  float64 
 3   Day_Monday                         17655 non-null  float64 
 4   Day_Saturday                       17655 non-null  float64 
 5   Day_Sunday                         17655 non-null  float64 
 6   Day_Thursday                       17655 non-null  float64 
 7   Day_Tuesday                        17655 non-null  float64 
 8   Day_Wednesday                      17655 non-null  float64 
 9   Month_1                            17655 non-null  float64 
 10  Month_10                           17655 non-null  float64 
 11  Month_11                           17655 

In [67]:
# Convert dataframe to arrays for machine learning
MLarray = MLdata.values

In [68]:
# Create arrays for predictor and response variables
predictors = MLarray[:,1:35]
response = MLarray[:,0]

In [69]:
# Create training and test data subsets
predictors_train, predictors_test, response_train, response_test = train_test_split(predictors, response, 
    test_size=0.2, random_state=8)

In [70]:
# Create data frame for KNN
MLmodels = []
MLmodels.append(('KNN', KNeighborsClassifier(n_neighbors=133)))

# Run KNN and print accuracy score (and standard deviation)
MLresults = []
MLnames = []

for name, model in MLmodels:
    kfold = KFold(n_splits=10, shuffle=True, random_state=8)
    crossVal = cross_val_score(model, predictors_train, response_train, cv=kfold, scoring='balanced_accuracy')
    MLresults.append(crossVal)
    MLnames.append(name)
    resultsSummary = "%s: %f (%f)" % (name, crossVal.mean(), crossVal.std())
    print(resultsSummary)

KNN: 0.208021 (0.005575)


### 'Line' & 'Incident'

In [71]:
# Assemble dataset for 'Line' and 'Incident' Data
MLdata = df.loc[:, ['MinDelayGroup', 'Line', 'Incident']]

# Set all predictor columns as a 'string'
MLdata['Line'] = MLdata['Line'].astype(str)
MLdata['Incident'] = MLdata['Incident'].astype(str)

# Include only valid streetcar lines
MLdata = MLdata[(MLdata['Line'] == '301') | (MLdata['Line'] == '304') | (MLdata['Line'] == '306') |\
    (MLdata['Line'] == '310') | (MLdata['Line'] == '500') | (MLdata['Line'] == '501') | (MLdata['Line'] == '503') |\
    (MLdata['Line'] == '504') | (MLdata['Line'] == '505') | (MLdata['Line'] == '506') | (MLdata['Line'] == '508') |\
    (MLdata['Line'] == '509') | (MLdata['Line'] == '510') | (MLdata['Line'] == '511') | (MLdata['Line'] == '512')]

In [72]:
# One-hot encoding for 'Line' Data
encoder = OneHotEncoder()
OneHotArray = encoder.fit_transform(MLdata[['Line']]).toarray()
line = [f'Line_{item}' for item in encoder.categories_[0]]
MLdata[line] = OneHotArray

# Remove 'Line' (object)
MLdata.drop('Line',inplace=True,axis=1)
#MLdata.head()

In [73]:
# One-hot encoding for 'Incident' Data
encoder = OneHotEncoder()
OneHotArray = encoder.fit_transform(MLdata[['Incident']]).toarray()
incident = [f'Incident_{item}' for item in encoder.categories_[0]]
MLdata[incident] = OneHotArray

# Remove 'Incident' (object)
MLdata.drop('Incident',inplace=True,axis=1)
MLdata.head()

Unnamed: 0,MinDelayGroup,Line_301,Line_304,Line_306,Line_310,Line_500,Line_501,Line_503,Line_504,Line_505,...,Incident_General Delay,Incident_Held By,Incident_Investigation,Incident_Late Entering Service,Incident_Mechanical,Incident_Operations,Incident_Overhead,Incident_Rail/Switches,Incident_Security,Incident_Utilized Off Route
0,20+ minutes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,15-19 minutes,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,15-19 minutes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,20+ minutes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,5-9 minutes,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [74]:
# Check that data are numeric
MLdata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17558 entries, 0 to 17654
Data columns (total 31 columns):
 #   Column                             Non-Null Count  Dtype   
---  ------                             --------------  -----   
 0   MinDelayGroup                      17558 non-null  category
 1   Line_301                           17558 non-null  float64 
 2   Line_304                           17558 non-null  float64 
 3   Line_306                           17558 non-null  float64 
 4   Line_310                           17558 non-null  float64 
 5   Line_500                           17558 non-null  float64 
 6   Line_501                           17558 non-null  float64 
 7   Line_503                           17558 non-null  float64 
 8   Line_504                           17558 non-null  float64 
 9   Line_505                           17558 non-null  float64 
 10  Line_506                           17558 non-null  float64 
 11  Line_508                           17558 

In [75]:
# Convert dataframe to arrays for machine learning
MLarray = MLdata.values

In [76]:
# Create arrays for predictor and response variables
predictors = MLarray[:,1:30]
response = MLarray[:,0]

In [77]:
# Create training and test data subsets
predictors_train, predictors_test, response_train, response_test = train_test_split(predictors, response, 
    test_size=0.2, random_state=8)

In [78]:
# Create data frame for KNN
MLmodels = []
MLmodels.append(('KNN', KNeighborsClassifier(n_neighbors=133)))

# Run KNN and print accuracy score (and standard deviation)
MLresults = []
MLnames = []

for name, model in MLmodels:
    kfold = KFold(n_splits=10, shuffle=True, random_state=8)
    crossVal = cross_val_score(model, predictors_train, response_train, cv=kfold, scoring='balanced_accuracy')
    MLresults.append(crossVal)
    MLnames.append(name)
    resultsSummary = "%s: %f (%f)" % (name, crossVal.mean(), crossVal.std())
    print(resultsSummary)

KNN: 0.257211 (0.010041)


### Time-related Predictors, 'Line' and 'Incident'

In [79]:
# Assemble dataset for Time-related and 'Line' Data
MLdata = df.loc[:, ['MinDelayGroup', 'Hour', 'Day', 'Month', 'Line', 'Incident']]

# Set all predictor columns as a 'string'
MLdata['Day'] = MLdata['Day'].astype(str)
MLdata['Month'] = MLdata['Month'].astype(str)
MLdata['Line'] = MLdata['Line'].astype(str)
MLdata['Incident'] = MLdata['Incident'].astype(str)

# Include only valid streetcar lines
MLdata = MLdata[(MLdata['Line'] == '301') | (MLdata['Line'] == '304') | (MLdata['Line'] == '306') |\
    (MLdata['Line'] == '310') | (MLdata['Line'] == '500') | (MLdata['Line'] == '501') | (MLdata['Line'] == '503') |\
    (MLdata['Line'] == '504') | (MLdata['Line'] == '505') | (MLdata['Line'] == '506') | (MLdata['Line'] == '508') |\
    (MLdata['Line'] == '509') | (MLdata['Line'] == '510') | (MLdata['Line'] == '511') | (MLdata['Line'] == '512')]

In [80]:
# One-hot encoding for 'Day' Data
encoder = OneHotEncoder()
OneHotArray = encoder.fit_transform(MLdata[['Day']]).toarray()
day = [f'Day_{item}' for item in encoder.categories_[0]]
MLdata[day] = OneHotArray

# Remove 'Day' (object)
MLdata.drop('Day',inplace=True,axis=1)
#MLdata.head()

In [81]:
# One-hot encoding for 'Month' Data
encoder = OneHotEncoder()
OneHotArray = encoder.fit_transform(MLdata[['Month']]).toarray()
month = [f'Month_{item}' for item in encoder.categories_[0]]
MLdata[month] = OneHotArray

# Remove 'Month' (object)
MLdata.drop('Month',inplace=True,axis=1)
#MLdata.head()

In [82]:
# One-hot encoding for 'Line' Data
encoder = OneHotEncoder()
OneHotArray = encoder.fit_transform(MLdata[['Line']]).toarray()
line = [f'Line_{item}' for item in encoder.categories_[0]]
MLdata[line] = OneHotArray

# Remove 'Line' (object)
MLdata.drop('Line',inplace=True,axis=1)
#MLdata.head()

In [83]:
# One-hot encoding for 'Incident' Data
encoder = OneHotEncoder()
OneHotArray = encoder.fit_transform(MLdata[['Incident']]).toarray()
incident = [f'Incident_{item}' for item in encoder.categories_[0]]
MLdata[incident] = OneHotArray

# Remove 'Incident' (object)
MLdata.drop('Incident',inplace=True,axis=1)
#MLdata.head()

In [84]:
# Check that data are numeric
MLdata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17558 entries, 0 to 17654
Data columns (total 51 columns):
 #   Column                             Non-Null Count  Dtype   
---  ------                             --------------  -----   
 0   MinDelayGroup                      17558 non-null  category
 1   Hour                               17558 non-null  int64   
 2   Day_Friday                         17558 non-null  float64 
 3   Day_Monday                         17558 non-null  float64 
 4   Day_Saturday                       17558 non-null  float64 
 5   Day_Sunday                         17558 non-null  float64 
 6   Day_Thursday                       17558 non-null  float64 
 7   Day_Tuesday                        17558 non-null  float64 
 8   Day_Wednesday                      17558 non-null  float64 
 9   Month_1                            17558 non-null  float64 
 10  Month_10                           17558 non-null  float64 
 11  Month_11                           17558 

In [85]:
# Convert dataframe to arrays for machine learning
MLarray = MLdata.values

In [86]:
# Create arrays for predictor and response variables
predictors = MLarray[:,1:50]
response = MLarray[:,0]

In [87]:
# Create training and test data subsets
predictors_train, predictors_test, response_train, response_test = train_test_split(predictors, response, 
    test_size=0.2, random_state=8)

In [88]:
# Create data frame for KNN
MLmodels = []
MLmodels.append(('KNN', KNeighborsClassifier(n_neighbors=133)))

# Run KNN and print accuracy score (and standard deviation)
MLresults = []
MLnames = []

for name, model in MLmodels:
    kfold = KFold(n_splits=10, shuffle=True, random_state=8)
    crossVal = cross_val_score(model, predictors_train, response_train, cv=kfold, scoring='balanced_accuracy')
    MLresults.append(crossVal)
    MLnames.append(name)
    resultsSummary = "%s: %f (%f)" % (name, crossVal.mean(), crossVal.std())
    print(resultsSummary)

KNN: 0.206845 (0.004754)


### And the best model is...

The best model to predict 'Min Delay' group was: 'Streetcar Line' and 'Incident Type'


However, the predictive power (by balanced accuracy score for unbalanced datasets) was poor (25.7%).

### Running best predictor model with test data

In [89]:
# Assemble dataset for 'Line' and 'Incident' Data
MLdata = df.loc[:, ['MinDelayGroup', 'Line', 'Incident']]

# Set all predictor columns as a 'string'
MLdata['Line'] = MLdata['Line'].astype(str)
MLdata['Incident'] = MLdata['Incident'].astype(str)

# Include only valid streetcar lines
MLdata = MLdata[(MLdata['Line'] == '301') | (MLdata['Line'] == '304') | (MLdata['Line'] == '306') |\
    (MLdata['Line'] == '310') | (MLdata['Line'] == '500') | (MLdata['Line'] == '501') | (MLdata['Line'] == '503') |\
    (MLdata['Line'] == '504') | (MLdata['Line'] == '505') | (MLdata['Line'] == '506') | (MLdata['Line'] == '508') |\
    (MLdata['Line'] == '509') | (MLdata['Line'] == '510') | (MLdata['Line'] == '511') | (MLdata['Line'] == '512')]

# One-hot encoding for 'Line' Data
encoder = OneHotEncoder()
OneHotArray = encoder.fit_transform(MLdata[['Line']]).toarray()
line = [f'Line_{item}' for item in encoder.categories_[0]]
MLdata[line] = OneHotArray

# Remove 'Line' (object)
MLdata.drop('Line',inplace=True,axis=1)

# One-hot encoding for 'Incident' Data
encoder = OneHotEncoder()
OneHotArray = encoder.fit_transform(MLdata[['Incident']]).toarray()
incident = [f'Incident_{item}' for item in encoder.categories_[0]]
MLdata[incident] = OneHotArray

# Remove 'Incident' (object)
MLdata.drop('Incident',inplace=True,axis=1)
MLdata.head()

# Convert dataframe to arrays for machine learning
MLarray = MLdata.values

# Create arrays for predictor and response variables
predictors = MLarray[:,1:30]
response = MLarray[:,0]

# Create training and test data subsets
predictors_train, predictors_test, response_train, response_test = train_test_split(predictors, response, 
    test_size=0.2, random_state=8)

# Create data frame for KNN
MLmodels = []
MLmodels.append(('KNN', KNeighborsClassifier(n_neighbors=133)))

# Run KNN and print accuracy score (and standard deviation)
MLresults = []
MLnames = []

for name, model in MLmodels:
    kfold = KFold(n_splits=10, shuffle=True, random_state=8)
    crossVal = cross_val_score(model, predictors_train, response_train, cv=kfold, scoring='balanced_accuracy')
    MLresults.append(crossVal)
    MLnames.append(name)
    resultsSummary = "%s: %f (%f)" % (name, crossVal.mean(), crossVal.std())
    print(resultsSummary)

KNN: 0.257211 (0.010041)


In [90]:
# Inputting the training data into the best machine learning model
bestAlg = KNeighborsClassifier(n_neighbors=133)
bestAlg.fit(predictors_train, response_train)

KNeighborsClassifier(n_neighbors=133)

In [91]:
# Inputting the test data into the best model
predictions = bestAlg.predict(predictors_test)

In [92]:
# Evaluating the model using the test data
print(balanced_accuracy_score(response_test, predictions))

0.25936616168555976


- The balanced accuracy score to predict 'Min Delay' group on the test dataset was 26%.