In [99]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn import preprocessing

In [95]:
next_station_dict = { 
    'Xavier Hall': 'Fine Arts Annex',
    'Fine Arts Annex': 'Loyola House of Studies',
    'Loyola House of Studies': 'Grade School',
    'Grade School': 'Gate 2.5',
    'Gate 2.5': 'Leong Hall',
    'Leong Hall': 'Xavier Hall'
}

In [62]:
df = pd.read_csv('Out.csv')
df = df.drop(columns=['Unnamed: 0', 'Datetime'])

df.dtypes

Altitude                                    float64
Humidity                                    float64
Temperature                                 float64
Time                                         object
Day                                         float64
Hour_of_Day                                 float64
IsEJeep1                                      int64
IsEJeep2                                      int64
IsEJeep3                                      int64
Latitude                                    float64
Longitude                                   float64
Lat Diff                                    float64
Long Diff                                   float64
Distance                                    float64
IsStation                                     int64
IsCharging                                    int64
Cum Distance                                float64
Next Station Lat                            float64
Next Station Long                           float64
Abs Distance

In [63]:
# Convert all the times into a timestamp with 0 = 00:00:00 and 86399 = 23:59:59

def time_to_daytimestamp(stamp: str) -> int:
    if stamp is None:
        return -1
    
    if isinstance(stamp, float):
        return -1
    
    h, m, s = stamp.split(':')
    return int(h) * 24 * 60 + int(m) * 60 + int(s)

df['Time'] = df['Time'].apply(lambda time: time_to_daytimestamp(time))

df['Previous Time to Xavier Hall'] = df['Previous Time to Xavier Hall'].apply(lambda time: time_to_daytimestamp(time))
df['Previous Time to Fine Arts Annex'] = df['Previous Time to Fine Arts Annex'].apply(lambda time: time_to_daytimestamp(time))
df['Previous Time to Loyola House of Studies'] = df['Previous Time to Loyola House of Studies'].apply(lambda time: time_to_daytimestamp(time))
df['Previous Time to Grade School'] = df['Previous Time to Grade School'].apply(lambda time: time_to_daytimestamp(time))
df['Previous Time to Gate 2.5'] = df['Previous Time to Gate 2.5'].apply(lambda time: time_to_daytimestamp(time))
df['Previous Time to Leong Hall'] = df['Previous Time to Leong Hall'].apply(lambda time: time_to_daytimestamp(time))

df['Next Time to Xavier Hall'] = df['Next Time to Xavier Hall'].apply(lambda time: time_to_daytimestamp(time))
df['Next Time to Fine Arts Annex'] = df['Next Time to Fine Arts Annex'].apply(lambda time: time_to_daytimestamp(time))
df['Next Time to Loyola House of Studies'] = df['Next Time to Loyola House of Studies'].apply(lambda time: time_to_daytimestamp(time))
df['Next Time to Grade School'] = df['Next Time to Grade School'].apply(lambda time: time_to_daytimestamp(time))
df['Next Time to Gate 2.5'] = df['Next Time to Gate 2.5'].apply(lambda time: time_to_daytimestamp(time))
df['Next Time to Leong Hall'] = df['Next Time to Leong Hall'].apply(lambda time: time_to_daytimestamp(time))

df

Unnamed: 0,Altitude,Humidity,Temperature,Time,Day,Hour_of_Day,IsEJeep1,IsEJeep2,IsEJeep3,Latitude,...,Next Time to Xavier Hall,Next Time to Fine Arts Annex,Next Time to Loyola House of Studies,Next Time to Grade School,Next Time to Gate 2.5,Next Time to Leong Hall,Encoded Day,Encoded Station,Encoded Prev Station,Encoded Next Station
0,79.0,82.500000,29.299999,25239,23.0,17.0,1,0,0,14.636047,...,9393,11123,25942,26309,25470,25778,1,8,8,1
1,78.7,82.500000,29.299999,25240,23.0,17.0,1,0,0,14.636049,...,9393,11123,25942,26309,26556,25778,1,0,8,1
2,78.4,82.500000,29.299999,25241,23.0,17.0,1,0,0,14.636050,...,9393,11123,25942,26309,26556,25778,1,0,8,1
3,78.0,82.500000,29.299999,25242,23.0,17.0,1,0,0,14.636052,...,9393,11123,25942,26309,26556,25778,1,0,8,1
4,77.7,82.500000,29.299999,25243,23.0,17.0,1,0,0,14.636055,...,9393,11123,25942,26309,26556,25778,1,0,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
482694,75.1,80.400002,30.799999,16163,12.0,9.0,0,1,0,14.638046,...,-1,-1,-1,-1,-1,-1,7,5,4,5
482695,75.5,80.400002,30.799999,16164,12.0,9.0,0,1,0,14.638012,...,-1,-1,-1,-1,-1,-1,7,5,4,5
482696,75.3,80.400002,30.799999,16165,12.0,9.0,0,1,0,14.637973,...,-1,-1,-1,-1,-1,-1,7,5,4,5
482697,74.9,80.400002,30.799999,16166,12.0,9.0,0,1,0,14.637933,...,-1,-1,-1,-1,-1,-1,7,5,4,5


In [94]:
df = df.fillna(-1)
df

df.to_csv('Updated Out.csv')

In [85]:
X = df.copy()
X_train = X[:300_000]
X_test = X[300_001:]

# Xs is X but without the next time columns within it
Xs_train = X_train.copy()
Xs_test = X_test.copy()

Xs_train = Xs_train.drop(columns=['Next Time to Fine Arts Annex', 'Next Time to Loyola House of Studies', 'Next Time to Grade School', 'Next Time to Gate 2.5', 'Next Time to Leong Hall', 'Next Time to Xavier Hall'])
Xs_test = Xs_test.drop(columns=['Next Time to Fine Arts Annex', 'Next Time to Loyola House of Studies', 'Next Time to Grade School', 'Next Time to Gate 2.5', 'Next Time to Leong Hall', 'Next Time to Xavier Hall'])

Y_train = X_train.copy()
Y_test = X_test.copy()

Y_train = Y_train[['Next Time to Fine Arts Annex', 'Next Time to Loyola House of Studies', 'Next Time to Grade School', 'Next Time to Gate 2.5', 'Next Time to Leong Hall', 'Next Time to Xavier Hall', 'IsEJeep1', 'IsEJeep2', 'IsEJeep3']]
Y_train = Y_train.dropna()

Y_test = Y_test[['Next Time to Fine Arts Annex', 'Next Time to Loyola House of Studies', 'Next Time to Grade School', 'Next Time to Gate 2.5', 'Next Time to Leong Hall', 'Next Time to Xavier Hall', 'IsEJeep1', 'IsEJeep2', 'IsEJeep3']]
Y_test = Y_test.dropna()

assert(len(X_train) == len(Y_train))
assert(len(X_test) == len(Y_test))

In [86]:
model_lr = LinearRegression()
model_lr.fit(X_train, Y_train)

model_s_lr = LinearRegression()
model_s_lr.fit(Xs_train, Y_train)

In [87]:
df_output = pd.DataFrame().reindex_like(Y_train)
df_s_output = pd.DataFrame().reindex_like(Y_train)

In [88]:
def daytimestamp_to_time(val: int) -> str:
    h = val // (24 * 60)
    m_s = val % (24 * 60)
    m = m_s // 60
    s = m_s % 60

    return f'{h:02}:{m:02}:{s:02}'

In [91]:
i = 0

for index, row in X_test.iterrows():
    y_hat = model_lr.predict(row.to_frame().transpose())
    df_output.loc[len(df_output)] = y_hat[0]

df_output

Unnamed: 0,Next Time to Fine Arts Annex,Next Time to Loyola House of Studies,Next Time to Grade School,Next Time to Gate 2.5,Next Time to Leong Hall,Next Time to Xavier Hall,IsEJeep1,IsEJeep2,IsEJeep3
0,,,,,,,,,
1,,,,,,,,,
2,,,,,,,,,
3,,,,,,,,,
4,,,,,,,,,
...,...,...,...,...,...,...,...,...,...
482693,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-5.192513e-13,1.0,0.0
482694,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-5.048184e-13,1.0,0.0
482695,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-5.072609e-13,1.0,0.0
482696,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-5.090373e-13,1.0,0.0


In [92]:
for index, row in Xs_test.iterrows():
    y_hat = model_s_lr.predict(row.to_frame().transpose())
    df_s_output.loc[len(df_s_output)] = y_hat[0]

df_s_output

Unnamed: 0,Next Time to Fine Arts Annex,Next Time to Loyola House of Studies,Next Time to Grade School,Next Time to Gate 2.5,Next Time to Leong Hall,Next Time to Xavier Hall,IsEJeep1,IsEJeep2,IsEJeep3
0,,,,,,,,,
1,,,,,,,,,
2,,,,,,,,,
3,,,,,,,,,
4,,,,,,,,,
...,...,...,...,...,...,...,...,...,...
482693,15622.301539,14924.831221,15822.365085,15293.460109,14858.828837,15235.005924,2.400857e-13,1.0,0.0
482694,15559.146448,14861.820927,15764.883826,15249.423950,14802.529500,15158.555073,2.318701e-13,1.0,0.0
482695,15454.131078,14758.204091,15665.106712,15164.396104,14698.459609,15043.265966,2.240985e-13,1.0,0.0
482696,15394.650005,14701.868831,15610.337574,15112.169982,14638.379933,14977.862324,2.211564e-13,1.0,0.0


In [93]:
df_output.to_csv('Linear Reg Output.csv')
df_s_output.to_csv('Linear Reg Spec Output.csv')

In [97]:
Y_test = Y_test.dropna()
df_output = df_output.dropna()
df_s_output = df_s_output.dropna()

Y_test = Y_test.drop(columns=['IsEJeep1', 'IsEJeep2', 'IsEJeep3'])
df_output = df_output.drop(columns=['IsEJeep1', 'IsEJeep2', 'IsEJeep3'])
df_s_output = df_s_output.drop(columns=['IsEJeep1', 'IsEJeep2', 'IsEJeep3'])

In [98]:
# Calculating MSE for all columns
rmse = mean_squared_error(Y_test, df_output, squared=False)
rmse_s = mean_squared_error(Y_test, df_s_output, squared=False)

mae = mean_absolute_error(Y_test, df_output)
mae_s = mean_absolute_error(Y_test, df_s_output)

r2 = r2_score(Y_test, df_output)
r2_s = r2_score(Y_test, df_s_output)

print('With Next Time of Arrival in Train Dataset:')
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')
print(f'MAE: {mae}')

6.39358377905157e-11, 4067.34363732951
