# 1. Model Preparation

In [None]:
import boto3
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import statsmodels.api as sm

from sagemaker import get_execution_role
from patsy import dmatrices
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# S3 connection settings
# Execution role returned by the SageMaker SDK for this session.
role = get_execution_role()

# Bucket and object key of the route data, combined into a single s3:// URL.
bucket = 'sagemaker-studio-520298385440-7in8n1t299'
data_key = 'route_46a.feather'
data_location = f's3://{bucket}/{data_key}'

In [None]:
# Loading file
# Reads the Feather file at data_location (the s3:// URL built in the
# configuration cell) into the working DataFrame `df`.
df = pd.read_feather(data_location)

In [None]:
# Preview the first five rows of the loaded data
df.head(5)

In [None]:
# Preview the last five rows of the loaded data
df.tail(5)

In [None]:
# Number of missing (null) values in each column
df.isnull().sum()

In [None]:
# Number of distinct values in each column
df.nunique()

In [None]:
# Column dtypes (inspection only; nothing is converted in this cell)
df.dtypes

In [None]:
# Shape of the frame as (number of rows, number of columns)
df.shape

**Review so far:**
<br>
There are no more missing values and the constant columns have been removed.
* Remove index.
* Investigate level_0.
* Convert the following to categorical: DAYOFWEEK, MONTHOFSERVICE, PROGRNUMBER, STOPPOINTID, VEHICLEID, IS_HOLIDAY, IS_WEEKDAY.
* We have data for most of the days of the year and for each month.


In [None]:
# Drop the leftover 'level_0' and 'index' columns.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise a KeyError.
df = df.drop(columns=['level_0', 'index'], errors='ignore')

In [None]:
# Order the rows by trip first, then by day of service within each trip
df = df.sort_values(['TRIPID', 'DAYOFSERVICE'], ascending=True)

In [None]:
# Column groups used throughout the notebook, split by how each column is treated.

# Columns that are cast to the 'category' dtype
categorical_features = [
    'DAYOFWEEK',
    'MONTHOFSERVICE',
    'PROGRNUMBER',
    'STOPPOINTID',
    'VEHICLEID',
    'TRIPID',
    'IS_HOLIDAY',
    'IS_WEEKDAY',
]

# Date/time columns
datetime_features = ['DAYOFSERVICE']

# Numeric columns that are correlated and plotted against the target
numerical_features = [
    'PLANNEDTIME_ARR',
    'ACTUALTIME_ARR',
    'PLANNEDTIME_DEP',
    'ACTUALTIME_DEP',
    'DWELLTIME',
    'PLANNEDTIME_TRAVEL',
]

# Name of the column the models predict
target_feat = 'ACTUALTIME_TRAVEL'

In [None]:
# Cast every column named in categorical_features to the 'category' dtype
# in a single astype call (one dtype entry per column).
df = df.astype({name: 'category' for name in categorical_features})

In [None]:
# Check the column dtypes after the categorical conversion
df.dtypes

<br><br>
Setting the target feature as _y_ and the remaining features in the dataframe as _X_.
<br><br>

In [None]:
# Shuffle all rows and rebuild a clean 0..n-1 index.
# The seed is fixed so that Restart & Run All yields the same row order;
# without it the later train_test_split(random_state=1) would still pick
# different rows on every run because its input order changes.
df = df.sample(frac=1, random_state=1).reset_index(drop=True)

In [None]:
# Inspect the first 20 rows after shuffling
df.head(20)

In [None]:
# Separate the target column (y) from the predictor columns (X)
target_feature = df['ACTUALTIME_TRAVEL']
y = target_feature.to_frame()
X = df.drop(columns=['ACTUALTIME_TRAVEL'])

# Hold out 30% of the rows for testing; random_state fixes the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Report how many rows landed on each side of the split.
# NOTE(review): the "rows 0 to ..." wording reports sizes, not positions;
# train_test_split shuffles by default, so the subsets are not contiguous.
n_train = X_train.shape[0]
n_test = X_test.shape[0]
print("The original range is: ", df.shape[0])
print("The training range (70%):\t rows 0 to", n_train)
print("The test range (30%): \t rows", n_train, "to", n_train + n_test)

In [None]:
# Preview the first five rows of the training predictors
X_train.head(5)

## 1.2. Plot each feature against the target feature to help decide which features to keep for the models.

#### Plotting datetime feature against target feature

In [None]:
# Pearson correlation between the service date and the target.
# DAYOFSERVICE is converted to a numeric column in X_train first so that it
# can take part in the correlation.
X_train['DAYOFSERVICE'] = pd.to_numeric(X_train['DAYOFSERVICE'])

# Two-column frame: the numeric date next to the target
df_temp = pd.concat([X_train['DAYOFSERVICE'], y_train], axis='columns')
correlation_dt = df_temp.corr(method='pearson')
correlation_dt

In [None]:
# Scatter plot of DAYOFSERVICE against the target, with the Pearson
# correlation shown as the legend label.
# The axes are created explicitly and passed to pandas so exactly one figure
# is drawn; previously `fig.add_subplot` was referenced without being called
# and pandas opened a second figure, leaving an empty one behind.
fig, ax = plt.subplots(figsize=(15, 8))
pearson_r = df_temp[['ACTUALTIME_TRAVEL', 'DAYOFSERVICE']].corr().to_numpy()[0, 1]
df_temp.plot(kind='scatter', x='DAYOFSERVICE', y='ACTUALTIME_TRAVEL',
             label="%.3f" % pearson_r, ax=ax);

#### Plotting numerical features against target feature

In [None]:
# # Plot numerical features against target feature

# sns.set(style='white')

# # Calculating the correlation of all pairs of the numerical features
# corr = X_train[numerical_features].corr()

# # Generating a mask for the upper triangle
# mask = np.zeros_like(corr, dtype=bool)  # np.bool was removed in NumPy 1.24; use the builtin bool
# mask[np.triu_indices_from(mask)] = True

# # Set up matplotlib figure
# f, ax = plt.subplots(figsize=(20,20))

# # Generating a custom color map
# cmap = sns.diverging_palette(220, 10, as_cmap=True)

# # Draw the heatmap with the mask and correct aspect ratio
# sns.heatmap(corr, annot=True, mask=mask, cmap=cmap, vmax=1, vmin=-1,
#             square=True, xticklabels=True, yticklabels=True,
#             linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
# plt.yticks(rotation = 0)
# plt.xticks(rotation = 90)

In [None]:
# Pearson correlation of each numerical feature with the target,
# printed as a 2x2 correlation matrix per feature.
for column in numerical_features:
    pair = [column, 'ACTUALTIME_TRAVEL']
    df_temp = pd.concat(objs=[X_train[column], y_train], axis='columns')
    correlation_dt = df_temp.loc[:, pair].corr(method='pearson')
    print('\n', correlation_dt)

In [None]:
# Spearman (rank) correlation of each numerical feature with the target,
# printed as a 2x2 correlation matrix per feature.
for column in numerical_features:
    pair = [column, 'ACTUALTIME_TRAVEL']
    df_temp = pd.concat(objs=[X_train[column], y_train], axis='columns')
    correlation_dt = df_temp.loc[:, pair].corr(method='spearman')
    print('\n', correlation_dt)

In [None]:
# One scatter plot per numerical feature against the target, with the
# Pearson correlation shown as the legend label.
for column in numerical_features:
    df_temp = pd.concat([X_train[column], y_train], axis=1)
    pearson_r = df_temp[['ACTUALTIME_TRAVEL', column]].corr(method='pearson').to_numpy()[0, 1]

    # Create the axes explicitly and hand them to pandas so exactly one figure
    # is produced per feature. Previously `fig.add_subplot` was never called
    # and every iteration left an extra empty figure open.
    fig, ax = plt.subplots(figsize=(12, 8))
    df_temp.plot(kind='scatter', x=column, y='ACTUALTIME_TRAVEL',
                 label="%.3f" % pearson_r, ax=ax)
    # Render the figure now so it is released before the next iteration
    plt.show()

In [None]:
# One scatter plot per numerical feature against the target, with the
# Spearman (rank) correlation shown as the legend label.
for column in numerical_features:
    df_temp = pd.concat([X_train[column], y_train], axis=1)
    spearman_r = df_temp[['ACTUALTIME_TRAVEL', column]].corr(method='spearman').to_numpy()[0, 1]

    # Create the axes explicitly and hand them to pandas so exactly one figure
    # is produced per feature. Previously `fig.add_subplot` was never called
    # and every iteration left an extra empty figure open.
    fig, ax = plt.subplots(figsize=(12, 8))
    df_temp.plot(kind='scatter', x=column, y='ACTUALTIME_TRAVEL',
                 label="%.3f" % spearman_r, ax=ax)
    # Render the figure now so it is released before the next iteration
    plt.show()

In [None]:
# Plot categorical features against target feature
#
# For every categorical feature this cell prints the number of rows per
# category and draws a 100%-stacked bar chart: one bar per category, split by
# the distinct ACTUALTIME_TRAVEL values found in that category.

df_temp = pd.concat([X_train, y_train], axis=1)

for feature in categorical_features:

    # Rows per category (non-null target values), computed once per feature
    category_counts = df_temp.groupby(feature, observed=True)['ACTUALTIME_TRAVEL'].count()

    print("\n", feature)
    print("Index \t\t Count")
    # List the categories in order of first appearance
    for category in df_temp[feature].dropna().unique():
        print(category, "\t", category_counts.loc[category])

    # Each row contributes 100/count of its category, so the contributions of
    # one category sum to 100%. Computed with a groupby transform instead of
    # assigning the value row by row.
    rows_in_category = df_temp.groupby(feature, observed=True)['ACTUALTIME_TRAVEL'].transform('count')
    df_temp['percent'] = 100 / rows_in_category

    # observed=True avoids materialising category/target combinations that
    # never occur in the data.
    grouped = (
        df_temp[['percent', feature, 'ACTUALTIME_TRAVEL']]
        .groupby([feature, 'ACTUALTIME_TRAVEL'], observed=True)
        .sum()
    )

    # NOTE(review): the previous red/blue "Yes"/"No" legend and "% Death"
    # label did not describe this data. The automatic legend would hold one
    # entry per distinct target value, so it is switched off.
    plot = grouped.unstack().plot(kind='bar', stacked=True, title=f' ACTUALTIME_TRAVEL v {feature}',
                                  figsize=(10, 5), grid=True, legend=False)

    # Gridlines: solid major grid plus a faint minor grid. The flag is passed
    # positionally because its keyword name differs between Matplotlib versions.
    plot.grid(True, which='major', color='#666666', linestyle='-')
    plot.minorticks_on()
    plot.grid(True, which='minor', color='#999999', linestyle='-', alpha=0.2)

    # Axis labels
    plot.set_xlabel(feature)
    plot.set_ylabel("% of rows")
    plot.set_ylim([0, 100])

    # Render the figure now so it is released before the next feature
    plt.show()


# drop 'percent' that was used only for stacked bar plot
df_temp = df_temp.drop(columns="percent")