In [1]:
# Import ignore warnings

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Importing libraries

import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
# Prep data

data = Path('../Resources/train_data_dec_day.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,Actual.Stop.Station,New.Trip.Name,Trip_date,Actual.Station.Dprt.Time,Segment.Direction,Trip.Name,Service.Line,Orig..Station,Dest..Station,Leading.Set.Type,Actual.Station.Dprt.Time.Band,Occupancy_Status,Occupancy_Range
0,Aberdeen,V607-JOINED,1-Dec-18,48:00.0,Down,V607,Hunter Valley,Broadmeadow,Scone,,05:45-05:59,Many Seats Available,Low: 0-399
1,Aberdeen,V622-JOINED,1-Dec-18,47:12.0,Up,V622,Hunter Valley,Scone,Broadmeadow,,06:45-06:59,Many Seats Available,Low: 0-399
2,Aberdeen,V669-JOINED,1-Dec-18,37:48.0,Down,V669,Hunter Valley,Broadmeadow,Scone,,19:30-19:44,Many Seats Available,Low: 0-399
3,Aberdeen,V682-JOINED,1-Dec-18,56:18.0,Up,V682,Hunter Valley,Scone,Broadmeadow,,20:45-20:59,Many Seats Available,Low: 0-399
4,Adamstown,200G-JOINED,1-Dec-18,37:10.0,Up,200G,North,Broadmeadow,Central,H,20:30-20:44,Many Seats Available,Low: 0-399


In [4]:
# Fill NaNs

df.fillna("", inplace=True)

In [5]:
# Create our features

df_features = df.drop(columns="Occupancy_Status")
X = pd.get_dummies(df_features)

# Create our target

y = df.Occupancy_Status

In [6]:
X.describe()

Unnamed: 0,Actual.Stop.Station_Aberdeen,Actual.Stop.Station_Adamstown,Actual.Stop.Station_Albion Park,Actual.Stop.Station_Allawah,Actual.Stop.Station_Arncliffe,Actual.Stop.Station_Artarmon,Actual.Stop.Station_Ashfield,Actual.Stop.Station_Asquith,Actual.Stop.Station_Auburn,Actual.Stop.Station_Austinmer,...,Actual.Station.Dprt.Time.Band_22:15-22:29,Actual.Station.Dprt.Time.Band_22:30-22:44,Actual.Station.Dprt.Time.Band_22:45-22:59,Actual.Station.Dprt.Time.Band_23:00-23:14,Actual.Station.Dprt.Time.Band_23:15-23:29,Actual.Station.Dprt.Time.Band_23:30-23:44,Actual.Station.Dprt.Time.Band_23:45-23:59,Occupancy_Range_High: Standing Room Only,Occupancy_Range_Low: 0-399,Occupancy_Range_Medium: 400-799
count,38222.0,38222.0,38222.0,38222.0,38222.0,38222.0,38222.0,38222.0,38222.0,38222.0,...,38222.0,38222.0,38222.0,38222.0,38222.0,38222.0,38222.0,38222.0,38222.0,38222.0
mean,0.000105,0.00068,0.00102,0.002015,0.002015,0.007352,0.006986,0.001962,0.006096,0.000628,...,0.011407,0.011302,0.010544,0.01057,0.010884,0.010727,0.009837,0.005337,0.938125,0.056538
std,0.01023,0.026073,0.031927,0.044839,0.044839,0.085428,0.083288,0.044254,0.077839,0.025051,...,0.106194,0.105712,0.102141,0.102266,0.103757,0.103015,0.098695,0.072862,0.240932,0.230961
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# Check the balance of our target values

y.value_counts()

Many Seats Available    37391
Few Seats Available       764
Standing Room Only         63
                            4
Name: Occupancy_Status, dtype: int64

In [8]:
# Importing sklearn library
## Create X_train, X_test, y_train, y_test

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(28666, 8431)

In [9]:
# Create the StandardScaler instance

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [10]:
# Fit the Standard Scaler with the training data
# When fitting scaling functions, only train on the training dataset

X_scaler = scaler.fit(X_train)

In [11]:
# Scale the training and testing data

X_train_scale = X_scaler.transform(X_train)
X_test_scale = X_scaler.transform(X_test)

In [12]:
# Importing Logistic regression library

from sklearn.linear_model import LogisticRegression

## Fitting logisitic Regression to training data

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_train, y_train)

LogisticRegression(random_state=1)

In [13]:
# Importing Balanced Accuracy Score library

from sklearn.metrics import balanced_accuracy_score

## Calculated the balanced accuracy score

y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5264328806228955

In [14]:
# Importing confusion matrix library

from sklearn.metrics import confusion_matrix

## Display the confusion matrix

confusion_matrix(y_test, y_pred)

array([[   0,    0,    1,    0],
       [   0,  128,   62,    1],
       [   0,   18, 9330,    0],
       [   0,    9,    0,    7]], dtype=int64)

In [15]:
# Importing Classification Report (imbalanced)

from imblearn.metrics import classification_report_imbalanced

## Print the imbalanced classification report
## Based on y_test and y_pred data

print(classification_report_imbalanced(y_test, y_pred))

                            pre       rec       spe        f1       geo       iba       sup

                           0.00      0.00      1.00      0.00      0.00      0.00         1
 Few Seats Available       0.83      0.67      1.00      0.74      0.82      0.65       191
Many Seats Available       0.99      1.00      0.70      1.00      0.83      0.72      9348
  Standing Room Only       0.88      0.44      1.00      0.58      0.66      0.41        16

         avg / total       0.99      0.99      0.70      0.99      0.83      0.71      9556



In [16]:
# Importing Random Forest Classifier 

from imblearn.ensemble import BalancedRandomForestClassifier

## Resample the training data with the BalancedRandomForestClassifier

brf = BalancedRandomForestClassifier(n_estimators=1000, random_state=1)
brf.fit(X_train, y_train)

BalancedRandomForestClassifier(n_estimators=1000, random_state=1)

In [17]:
# Calculated the balanced accuracy score

y_pred_brf = brf.predict(X_test)
balanced_accuracy_score(y_test, y_pred_brf)

0.760265235781319

In [18]:
# Display the confusion matrix

confusion_matrix(y_test, y_pred_brf)

array([[   1,    0,    0,    0],
       [   8,  107,    3,   73],
       [3855,  571, 4495,  427],
       [   0,    0,    0,   16]], dtype=int64)

In [19]:
# Print the imbalanced classification report

print(classification_report_imbalanced(y_test, y_pred_brf))

                            pre       rec       spe        f1       geo       iba       sup

                           0.00      1.00      0.60      0.00      0.77      0.62         1
 Few Seats Available       0.16      0.56      0.94      0.25      0.73      0.51       191
Many Seats Available       1.00      0.48      0.99      0.65      0.69      0.45      9348
  Standing Room Only       0.03      1.00      0.95      0.06      0.97      0.95        16

         avg / total       0.98      0.48      0.98      0.64      0.69      0.45      9556



In [20]:
# List the features sorted in descending order by feature importance
## Then displaed the first 10 feature importances

importances = brf.feature_importances_
importances_sort = sorted(zip(brf.feature_importances_, X.columns), reverse=True)
importances_sort[:10]

[(0.03995188033459053, 'Occupancy_Range_Low: 0-399'),
 (0.03347435097835181, 'Occupancy_Range_High: Standing Room Only'),
 (0.03172873264772669, 'Leading.Set.Type_'),
 (0.028403315461256084, 'Service.Line_Inner West'),
 (0.025322197243781812, 'Segment.Direction_Up'),
 (0.024261236228345244, 'Leading.Set.Type_A'),
 (0.02401339052371289, 'Segment.Direction_Down'),
 (0.02279254089765435, 'Orig..Station_Central'),
 (0.022488367510228216, 'Occupancy_Range_Medium: 400-799'),
 (0.01955519539687901, 'Dest..Station_Penrith')]