In [43]:
import pandas as pd
import csv
import numpy as np
import os

# Dictionaries

In [44]:
# Change the values that need to be changed manually here
station_coord_rects = {
    'Xavier Hall': { 
        'Latitude': { 
            'lower': 14.640004,
            'higher': 14.640371
        },
        'Longitude': { 
            'lower': 121.078251,
            'higher': 121.078789
        }
    },
    'Fine Arts Annex': { 
        'Latitude': { 
            'lower': 14.636606,
            'higher': 14.636749
        },
        'Longitude': { 
            'lower': 121.078116,
            'higher': 121.078324
        }
    },
    'Loyola House of Studies': { 
        'Latitude': { 
            'lower': 14.636115,
            'higher': 14.636302
        },
        'Longitude': { 
            'lower': 121.080644,
            'higher': 121.080738
        }
    },
    'Grade School': { 
        'Latitude': { 
            'lower': 14.634715,
            'higher': 14.634715
        },
        'Longitude': { 
            'lower': 121.076171,
            'higher': 121.076326
        }
    },
    'JSEC': { 
        'Latitude': { 
            'lower': 14.637686,
            'higher': 14.637804
        },
        'Longitude': { 
            'lower': 121.076260,
            'higher': 121.076370
        }
    },
    'Gate 2.5': { 
        'Latitude': { 
            'lower': 14.637874,
            'higher': 14.637927
        },
        'Longitude': { 
            'lower': 121.074848,
            'higher': 121.075020
        }
    },
    'Leong Hall': { 
        'Latitude': { 
            'lower': 14.640724,
            'higher': 14.640864
        },
        'Longitude': { 
            'lower': 121.076177,
            'higher': 121.076395
        }
    },
    'Power Station': { 
        'Latitude': { 
            'lower': 14.635504,
            'higher': 14.635692
        },
        'Longitude': { 
            'lower': 121.074559,
            'higher': 121.074674
        }
    },
}

In [45]:
next_station_dict = { 
    'Xavier Hall': 'Fine Arts Annex',
    'Fine Arts Annex': 'Loyola House of Studies',
    'Loyola House of Studies': 'Grade School',
    'Grade School': 'Gate 2.5',
    'Gate 2.5': 'Leong Hall',
    'Leong Hall': 'Xavier Hall'
}

# Functions

In [46]:
# Reformat Lat and Long to be decimal coordinates
def ddmm_mmmm_to_decimal(coord):
    # Assuming coord is in the format 'ddmm.mmmm'
    
    # Extract degrees and decimal minutes
    degrees = coord // 100
    decimal_minutes = coord % 100
    
    # Convert decimal minutes to decimal degrees
    decimal_degrees = degrees + decimal_minutes / 60
    
    return decimal_degrees

In [47]:
# Label stations along the line that the ejeep is in based on coordinates
def coord_to_station(lat, long) -> str:
    stations = station_coord_rects.keys()

    for station in stations:
        lat_bounds, long_bounds = station_coord_rects[station]['Latitude'], station_coord_rects[station]['Longitude']

        if (lat >= lat_bounds['lower'] - 0.0003 and lat <= lat_bounds['higher'] + 0.0003) and (long >= long_bounds['lower'] - 0.0003 and long <= long_bounds['higher'] + 0.0003):
            return station
    else:
        return "En Route"

In [48]:
import math

# Function to calculate distance using Haversine formula
def haversine_distance(lat_diff, lon_diff):
    # Radius of the Earth in kilometers
    R = 6371.0

    # Convert latitude and longitude differences from degrees to radians
    lat_diff_rad = math.radians(lat_diff)
    lon_diff_rad = math.radians(lon_diff)

    # Haversine formula
    a = math.sin(lat_diff_rad / 2)**2 + math.cos(0) * math.cos(0) * math.sin(lon_diff_rad / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    # Calculate distance
    distance = R * c

    return distance * 1000

In [49]:
# Make dictionary of all stations & average lat & long
def station_centroids(coord) -> dict:
    new_dict = dict()

    new_dict['En Route'] = None

    for station in station_coord_rects:
        ave = (station_coord_rects[station][coord]['lower'] + station_coord_rects[station][coord]['higher']) / 2

        new_dict[station] = ave
    
    return new_dict

lat_centers = station_centroids('Latitude')
long_centers = station_centroids('Longitude')

lat_centers
long_centers

{'En Route': None,
 'Xavier Hall': 121.07852,
 'Fine Arts Annex': 121.07821999999999,
 'Loyola House of Studies': 121.080691,
 'Grade School': 121.07624849999999,
 'JSEC': 121.076315,
 'Gate 2.5': 121.074934,
 'Leong Hall': 121.07628600000001,
 'Power Station': 121.07461649999999}

In [50]:
def process_station(file_path):
     df = pd.read_csv(file_path)
     df.tail(10)

     df.columns = ['Type', 'Datetime', 'Lat', 'Long', 'Altitude', 'Humidity', 'Temperature']
     df.drop(columns='Type', inplace=True)

     # Drop rows with null at start and end
     df = df.loc[df.notnull().all(axis=1).argmax():]
     id_not_null = df.notnull().all(axis=1).idxmax()

     ids = df[df.notnull().all(axis=1)].index
     last_id = ids[-1]

     df = df.loc[:last_id]
            
     # Fix time to PHT
     log_date = os.path.basename(file_path).split('/')[-1]
     df['Datetime'] = pd.to_datetime(df['Datetime'], format='mixed')
     df['Datetime'] = df['Datetime'].dt.tz_convert('+08:00')
     df['Datetime'] = pd.to_datetime(df['Datetime'].dt.strftime(f"{log_date[5:15]} %H:%M:%S"))
     
     # Extract Time
     df['Time'] = df['Datetime'].dt.time
     
     # Extract Day
     df['Day'] = df['Datetime'].dt.day

     # Add day of week
     df['Day_of_Week'] = df['Datetime'].dt.day_name()
            
     # Add hour of day
     df['Hour_of_Day'] = df['Datetime'].dt.hour
     
     # Get One Hot Encoded E-Jeep ID
     rpi_id = file_path.split('/')[1]
     df['IsEJeep1'] = 0
     df['IsEJeep2'] = 0
     df['IsEJeep3'] = 0
     df[f"IsEJeep{rpi_id[3]}"] = 1

     # Iterpolate values for NaN GPS values
     df['Lat'] = df['Lat'].astype(float).interpolate(method='linear', limit_direction='both') 
     df['Long'] = df['Long'].astype(float).interpolate(method='linear', limit_direction='both') 
     df['Altitude'] = df['Altitude'].astype(float).interpolate(method='linear', limit_direction='both') 
            
     df['Latitude'] = df.apply(lambda row: ddmm_mmmm_to_decimal(row['Lat']), axis=1)
     df['Longitude'] = df.apply(lambda row: ddmm_mmmm_to_decimal(row['Long']), axis=1)

     # Drop old Lat / Long values
     df.drop(columns=['Lat', 'Long'], inplace=True)
            
     df['Station'] = df.apply(lambda row: coord_to_station(row['Latitude'], row['Longitude']), axis=1)
            
     # Calculate speed (noted as distance since it's by second anyway) based on change in coordinates from the previous
     # Helps establish if ejeep is truly within station

     df['Lat Diff'] = df['Latitude'].diff()
     df['Long Diff'] = df['Longitude'].diff()

     df['Distance'] = df.apply(lambda row: haversine_distance(row['Lat Diff'], row['Long Diff']), axis=1)
     
     df['Lat Diff'] = df['Lat Diff'].fillna(0)
     df['Long Diff'] = df['Long Diff'].fillna(0)
     df['Distance'] = df['Distance'].fillna(0)
     
     df['Station'] = df.apply(lambda row: 'En Route' if row['Station'] != 'En Route' and row['Distance'] >= 0.5 else row['Station'], axis=1)
            
     # Add station column that's true if in station, false if en route
     df["IsStation"] = 0
     df.loc[(df["Station"] != "En Route") & (df["Station"] != "Power Station"), "IsStation"] = 1
     
     # Drop rows with before first and last station of each day
     start = df.loc[df.IsStation==1,'IsStation'].index[0] - id_not_null
     end = df.query('IsStation == 1').index.max()+1
     df = df.iloc[start:end]
     
     return df

# Raw Data
* Combine csv files from each rpi into one data frame
* Pre-process raw data

In [51]:
df = process_station('E-Jeep Data/rpi1/data_2023-10-23.csv')
df


  df = pd.read_csv(file_path)


Unnamed: 0,Datetime,Altitude,Humidity,Temperature,Time,Day,Day_of_Week,Hour_of_Day,IsEJeep1,IsEJeep2,IsEJeep3,Latitude,Longitude,Station,Lat Diff,Long Diff,Distance,IsStation
256,2023-10-23 17:12:39,79.0,82.500000,29.299999,17:12:39,23,Monday,17,1,0,0,14.636047,121.080516,Loyola House of Studies,-4.000000e-06,0.000002,0.481845,1
257,2023-10-23 17:12:40,78.7,82.500000,29.299999,17:12:40,23,Monday,17,1,0,0,14.636049,121.080524,En Route,1.666667e-06,0.000008,0.963155,0
258,2023-10-23 17:12:41,78.4,82.500000,29.299999,17:12:41,23,Monday,17,1,0,0,14.636050,121.080532,En Route,1.666667e-06,0.000008,0.890524,0
259,2023-10-23 17:12:42,78.0,82.500000,29.299999,17:12:42,23,Monday,17,1,0,0,14.636052,121.080540,En Route,1.833333e-06,0.000008,0.930693,0
260,2023-10-23 17:12:43,77.7,82.500000,29.299999,17:12:43,23,Monday,17,1,0,0,14.636055,121.080548,En Route,2.500000e-06,0.000009,0.985190,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1555,2023-10-23 17:37:57,231.0,61.700001,35.200001,17:37:57,23,Monday,17,1,0,0,14.637870,121.075285,En Route,-2.500000e-06,0.000013,1.435640,0
1556,2023-10-23 17:37:58,235.5,61.700001,35.200001,17:37:58,23,Monday,17,1,0,0,14.637872,121.075298,En Route,2.166667e-06,0.000013,1.483757,0
1557,2023-10-23 17:37:59,242.0,61.700001,35.200001,17:37:59,23,Monday,17,1,0,0,14.637876,121.075321,En Route,3.333333e-06,0.000023,2.529192,0
1558,2023-10-23 17:38:00,246.9,61.700001,35.200001,17:38:00,23,Monday,17,1,0,0,14.637876,121.075335,En Route,6.666667e-07,0.000014,1.539982,0


In [52]:
# Create a list to hold the dataframes
df_list = []

for i in range(3):
    i = i + 1
    folder_path = os.path.join('E-Jeep Data', f"rpi{i}")
    all_files = os.listdir(folder_path)
    
    # Filter out non-CSV files
    csv_files = [f for f in all_files if f.endswith('.csv')]
    
    for csv in csv_files:
        file_path = os.path.join(folder_path, csv)
        try:
            # Try reading the file using default UTF-8 encoding
            df = process_station(file_path)
            df_list.append(df)
        except UnicodeDecodeError:
            try:
                # If UTF-8 fails, try reading the file using UTF-16 encoding with tab separator
                df = pd.read_csv(file_path, sep='\t', encoding='utf-16')
                df_list.append(df)
            except Exception as e:
                print(f"Could not read file {csv} because of error: {e}")
        except Exception as e:
            print(f"Could not read file {csv} because of error: {e}")

# Concatenate all data into one DataFrame
df = pd.concat(df_list, ignore_index=True)
df

  df = pd.read_csv(file_path)


Could not read file data_2023-11-13.csv because of error: index 0 is out of bounds for axis 0 with size 0
Could not read file data_2023-11-13_2.csv because of error: index 0 is out of bounds for axis 0 with size 0
Could not read file data_2023-11-14.csv because of error: index 0 is out of bounds for axis 0 with size 0
Could not read file data_2023-11-13.csv because of error: index 0 is out of bounds for axis 0 with size 0
Could not read file data_2023-11-14_3.csv because of error: index 0 is out of bounds for axis 0 with size 0


Unnamed: 0,Datetime,Altitude,Humidity,Temperature,Time,Day,Day_of_Week,Hour_of_Day,IsEJeep1,IsEJeep2,IsEJeep3,Latitude,Longitude,Station,Lat Diff,Long Diff,Distance,IsStation
0,2023-10-23 17:12:39,79.0,82.500000,29.299999,17:12:39,23.0,Monday,17.0,1,0,0,14.636047,121.080516,Loyola House of Studies,-0.000004,0.000002,0.481845,1
1,2023-10-23 17:12:40,78.7,82.500000,29.299999,17:12:40,23.0,Monday,17.0,1,0,0,14.636049,121.080524,En Route,0.000002,0.000008,0.963155,0
2,2023-10-23 17:12:41,78.4,82.500000,29.299999,17:12:41,23.0,Monday,17.0,1,0,0,14.636050,121.080532,En Route,0.000002,0.000008,0.890524,0
3,2023-10-23 17:12:42,78.0,82.500000,29.299999,17:12:42,23.0,Monday,17.0,1,0,0,14.636052,121.080540,En Route,0.000002,0.000008,0.930693,0
4,2023-10-23 17:12:43,77.7,82.500000,29.299999,17:12:43,23.0,Monday,17.0,1,0,0,14.636055,121.080548,En Route,0.000003,0.000009,0.985190,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
482694,2023-11-12 09:53:23,75.1,80.400002,30.799999,09:53:23,12.0,Sunday,9.0,0,1,0,14.638046,121.078363,En Route,-0.000027,-0.000016,3.423096,0
482695,2023-11-12 09:53:24,75.5,80.400002,30.799999,09:53:24,12.0,Sunday,9.0,0,1,0,14.638012,121.078345,En Route,-0.000034,-0.000018,4.310547,0
482696,2023-11-12 09:53:25,75.3,80.400002,30.799999,09:53:25,12.0,Sunday,9.0,0,1,0,14.637973,121.078317,En Route,-0.000039,-0.000028,5.338515,0
482697,2023-11-12 09:53:26,74.9,80.400002,30.799999,09:53:26,12.0,Sunday,9.0,0,1,0,14.637933,121.078286,En Route,-0.000040,-0.000031,5.664548,0


# Data Processing

In [53]:
# Add IsCharging column: true if charging, false otherwise
df['IsCharging'] = 0

df.loc[df['Station'] == 'Power Station', "IsCharging"] = 1

In [54]:
# Define Prev Station
def prev_station():
    df['Prev Station'] = df['Station']
    df.loc[df['Station'] == 'En Route', "Prev Station"] = np.nan

    df['Prev Station'] = df['Prev Station'].fillna(method='ffill')
    df.loc[df['Station'] != 'En Route', "Prev Station"] = np.nan
    df['Prev Station'] = df['Prev Station'].fillna(method='ffill')
    df['Prev Station'] = df['Prev Station'].fillna(method='bfill')

prev_station()

In [55]:
# Define Next Station
def next_station():
    df['Next Station'] = df['Station']
    df.loc[df['Station'] == 'En Route', "Next Station"] = np.nan

    df['Next Station'] = df['Next Station'].fillna(method='bfill')
    df.loc[df['Station'] != 'En Route', "Next Station"] = np.nan
    df['Next Station'] = df['Next Station'].fillna(method='bfill')
    df['Next Station'] = df['Next Station'].fillna(method='ffill')
    
next_station()

In [56]:
# Filter out repeated stations

df['Station'] = df.apply(lambda x: x['Next Station'] if x['Station'] == 'En Route' and x['Prev Station'] == x['Next Station'] else x['Station'], axis=1)

prev_station()
next_station()

In [57]:
# Add distances (by road) to next station
df['Cum Distance'] = df[::-1].groupby((df['Station'] != 'En Route').cumsum())['Distance'].cumsum()
df.loc[df['Station'] != 'En Route', 'Cum Distance'] = 0

df.dtypes


Datetime        datetime64[ns]
Altitude               float64
Humidity               float64
Temperature            float64
Time                    object
Day                    float64
Day_of_Week             object
Hour_of_Day            float64
IsEJeep1                 int64
IsEJeep2                 int64
IsEJeep3                 int64
Latitude               float64
Longitude              float64
Station                 object
Lat Diff               float64
Long Diff              float64
Distance               float64
IsStation                int64
IsCharging               int64
Prev Station            object
Next Station            object
Cum Distance           float64
dtype: object

In [58]:
# Add distances (as the bird flies) to next station
df['Next Station Lat'] = df['Next Station'].map(lat_centers)
df['Next Station Long'] = df['Next Station'].map(long_centers)

df['Abs Distance'] = df.apply(lambda row: haversine_distance(row['Next Station Lat'] - row['Latitude'], row['Next Station Long'] - row['Longitude']), axis=1)

In [59]:
# Do percentage of distance remaining to next station

# Calculate the maximum value of C within each group of 'X'
max_distance_per_group = df[df['Station'] == 'En Route'].groupby((df['Station'] != 'En Route').cumsum())['Cum Distance'].transform('max')

# Add column D representing the percentage of each value in column C relative to the maximum value within its group
df['Percent Distance'] = df['Cum Distance'] / max_distance_per_group * 100

df.dtypes


Datetime             datetime64[ns]
Altitude                    float64
Humidity                    float64
Temperature                 float64
Time                         object
Day                         float64
Day_of_Week                  object
Hour_of_Day                 float64
IsEJeep1                      int64
IsEJeep2                      int64
IsEJeep3                      int64
Latitude                    float64
Longitude                   float64
Station                      object
Lat Diff                    float64
Long Diff                   float64
Distance                    float64
IsStation                     int64
IsCharging                    int64
Prev Station                 object
Next Station                 object
Cum Distance                float64
Next Station Lat            float64
Next Station Long           float64
Abs Distance                float64
Percent Distance            float64
dtype: object

In [60]:
# Add distance (as the crow flies) to the previous station
df['Prev Station Lat'] = df['Prev Station'].map(lat_centers)
df['Prev Station Long'] = df['Prev Station'].map(long_centers)

df['Prev Abs Distance'] = df.apply(lambda row: haversine_distance(row['Prev Station Lat'] - row['Latitude'], row['Prev Station Long'] - row['Longitude']), axis=1)

In [61]:
# Add distance (by road) to the previous station
df['Prev Cum Distance'] = df.groupby((df['Station'] != 'En Route').cumsum())['Distance'].cumsum()
df.loc[df['Station'] != 'En Route', 'Prev Cum Distance'] = 0

In [62]:
# Global variables


# Average speed every 10 seconds

# Average, min, max temp & humidity in between stations



# Previous & Next Times

In [63]:
# Create a 2nd df that's a more compiled version of the table for easier look-ups

df2 = pd.DataFrame(columns=['Original Index', 'Station', 'Time'])

next_station = ''

for index, row in df.iterrows():
    if row['Station'] == 'En Route':
        continue

    # Log the first instance of the ejeep getting to the stop
    if row['Station'] == next_station or next_station == '':
        new_row = {
            'Original Index': index,
            'Station': row['Station'],
            'Time': row['Time']
        }
        df2.loc[len(df2)] = new_row

        next_station = next_station_dict[row['Station']]

In [64]:
df2

Unnamed: 0,Original Index,Station,Time
0,0,Loyola House of Studies,17:12:39
1,231,Grade School,17:16:30
2,539,Gate 2.5,17:21:38
3,703,Leong Hall,17:24:22
4,1999,Xavier Hall,06:18:02
...,...,...,...
667,479771,Grade School,09:04:40
668,479926,Gate 2.5,09:07:15
669,480055,Leong Hall,09:09:24
670,480172,Xavier Hall,09:11:21


In [65]:
# This essentially notes when the e-jeep arrives and leaves at the station

df_stations_sum = pd.DataFrame(columns=['Original Index', 'Station', 'Time', 'Status'])

previous_index = None
previous_time = None
current_station = ''

for index, row in df.iterrows():
    if row['Station'] == 'En Route':
        continue

    if row['Station'] != current_station:
        new_row = {
            'Original Index': previous_index,
            'Station': current_station,
            'Time': previous_time,
            'Status': 'Departing'
        }
        df_stations_sum.loc[len(df_stations_sum)] = new_row
        
        new_row = {
            'Original Index': index,
            'Station': row['Station'],
            'Time': previous_time,
            'Status': 'Arriving'
        }
        df_stations_sum.loc[len(df_stations_sum)] = new_row

        previous_index = index
        previous_time = row['Time']
        current_station = row['Station']

df_stations_sum = df_stations_sum.dropna()

In [66]:
df['Previous Time to Xavier Hall'] = None
df['Previous Time to Fine Arts Annex'] = None
df['Previous Time to Loyola House of Studies'] = None
df['Previous Time to Grade School'] = None
df['Previous Time to Gate 2.5'] = None
df['Previous Time to Leong Hall'] = None

df['Next Time to Xavier Hall'] = None
df['Next Time to Fine Arts Annex'] = None
df['Next Time to Loyola House of Studies'] = None
df['Next Time to Grade School'] = None
df['Next Time to Gate 2.5'] = None
df['Next Time to Leong Hall'] = None

In [67]:
df2 = df_stations_sum.copy()

for index, row in df2.iterrows():
    future_df = df2.iloc[index + 1: index + 40]
    prev_df = df2.iloc[index - 40: index - 1]

    if row['Status'] == 'Departing':
        for station in next_station_dict.keys():
            future_df_index = future_df.index.where(future_df['Station'] == station).min()
            if not pd.isnull(future_df_index):
                future_df_index = int(future_df_index)
                df.at[row['Original Index'], f'Next Time to {station}'] = df2.at[future_df_index, 'Time']
    elif row['Status'] == 'Arriving':
        for station in next_station_dict.keys():
            prev_df_index = prev_df.index.where(prev_df['Station'] == station).max()
            if not pd.isnull(prev_df_index):
                prev_df_index = int(prev_df_index)
                df.at[row['Original Index'], f'Previous Time to {station}'] = df2.at[prev_df_index, 'Time']

In [68]:
df = df.dropna(how='all')

In [69]:
for station in next_station_dict.keys():
    df[f'Previous Time to {station}'] = df[f'Previous Time to {station}'].interpolate(method='ffill', limit_direction='forward')
    df[f'Next Time to {station}'] = df[f'Next Time to {station}'].interpolate(method='bfill', limit_direction='backward')

In [70]:
df

Unnamed: 0,Datetime,Altitude,Humidity,Temperature,Time,Day,Day_of_Week,Hour_of_Day,IsEJeep1,IsEJeep2,...,Previous Time to Loyola House of Studies,Previous Time to Grade School,Previous Time to Gate 2.5,Previous Time to Leong Hall,Next Time to Xavier Hall,Next Time to Fine Arts Annex,Next Time to Loyola House of Studies,Next Time to Grade School,Next Time to Gate 2.5,Next Time to Leong Hall
0,2023-10-23 17:12:39,79.0,82.500000,29.299999,17:12:39,23.0,Monday,17.0,1,0,...,,,,,06:12:33,06:41:23,17:24:22,17:30:29,17:16:30,17:21:38
1,2023-10-23 17:12:40,78.7,82.500000,29.299999,17:12:40,23.0,Monday,17.0,1,0,...,,,,,06:12:33,06:41:23,17:24:22,17:30:29,17:34:36,17:21:38
2,2023-10-23 17:12:41,78.4,82.500000,29.299999,17:12:41,23.0,Monday,17.0,1,0,...,,,,,06:12:33,06:41:23,17:24:22,17:30:29,17:34:36,17:21:38
3,2023-10-23 17:12:42,78.0,82.500000,29.299999,17:12:42,23.0,Monday,17.0,1,0,...,,,,,06:12:33,06:41:23,17:24:22,17:30:29,17:34:36,17:21:38
4,2023-10-23 17:12:43,77.7,82.500000,29.299999,17:12:43,23.0,Monday,17.0,1,0,...,,,,,06:12:33,06:41:23,17:24:22,17:30:29,17:34:36,17:21:38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
482694,2023-11-12 09:53:23,75.1,80.400002,30.799999,09:53:23,12.0,Sunday,9.0,0,1,...,09:36:11,09:40:26,09:44:24,09:46:42,,,,,,
482695,2023-11-12 09:53:24,75.5,80.400002,30.799999,09:53:24,12.0,Sunday,9.0,0,1,...,09:36:11,09:40:26,09:44:24,09:46:42,,,,,,
482696,2023-11-12 09:53:25,75.3,80.400002,30.799999,09:53:25,12.0,Sunday,9.0,0,1,...,09:36:11,09:40:26,09:44:24,09:46:42,,,,,,
482697,2023-11-12 09:53:26,74.9,80.400002,30.799999,09:53:26,12.0,Sunday,9.0,0,1,...,09:36:11,09:40:26,09:44:24,09:46:42,,,,,,


# Ordinal Encoding

In [71]:
# Day of Week
df.sort_values('Day_of_Week')
df['Encoded Day'] = pd.factorize(df['Day_of_Week'])[0] + 1

In [72]:
route = ['En Route', 'Grade School', 'JSEC', 'Gate 2.5', 'Leong Hall', 'Xavier Hall', 'University Dormitory', 'Fine Arts Annex', 'Loyola House of Studies', 'Gate 1', 'Power Station']

# Station
df['Encoded Station'] = df['Station'].apply(lambda x: route.index(x))

# Prev Station
df['Encoded Prev Station'] = df['Prev Station'].apply(lambda x: route.index(x))

# Next Station
df['Encoded Next Station'] = df['Next Station'].apply(lambda x: route.index(x))

In [73]:
df.drop(columns='Day_of_Week', inplace=True)
df.drop(columns='Station', inplace=True)
df.drop(columns='Prev Station', inplace=True)
df.drop(columns='Next Station', inplace=True)

In [74]:
# make sure there are no nan values
df.fillna(0)

Unnamed: 0,Datetime,Altitude,Humidity,Temperature,Time,Day,Hour_of_Day,IsEJeep1,IsEJeep2,IsEJeep3,...,Next Time to Xavier Hall,Next Time to Fine Arts Annex,Next Time to Loyola House of Studies,Next Time to Grade School,Next Time to Gate 2.5,Next Time to Leong Hall,Encoded Day,Encoded Station,Encoded Prev Station,Encoded Next Station
0,2023-10-23 17:12:39,79.0,82.500000,29.299999,17:12:39,23.0,17.0,1,0,0,...,06:12:33,06:41:23,17:24:22,17:30:29,17:16:30,17:21:38,1,8,8,1
1,2023-10-23 17:12:40,78.7,82.500000,29.299999,17:12:40,23.0,17.0,1,0,0,...,06:12:33,06:41:23,17:24:22,17:30:29,17:34:36,17:21:38,1,0,8,1
2,2023-10-23 17:12:41,78.4,82.500000,29.299999,17:12:41,23.0,17.0,1,0,0,...,06:12:33,06:41:23,17:24:22,17:30:29,17:34:36,17:21:38,1,0,8,1
3,2023-10-23 17:12:42,78.0,82.500000,29.299999,17:12:42,23.0,17.0,1,0,0,...,06:12:33,06:41:23,17:24:22,17:30:29,17:34:36,17:21:38,1,0,8,1
4,2023-10-23 17:12:43,77.7,82.500000,29.299999,17:12:43,23.0,17.0,1,0,0,...,06:12:33,06:41:23,17:24:22,17:30:29,17:34:36,17:21:38,1,0,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
482694,2023-11-12 09:53:23,75.1,80.400002,30.799999,09:53:23,12.0,9.0,0,1,0,...,0,0,0,0,0,0,7,5,4,5
482695,2023-11-12 09:53:24,75.5,80.400002,30.799999,09:53:24,12.0,9.0,0,1,0,...,0,0,0,0,0,0,7,5,4,5
482696,2023-11-12 09:53:25,75.3,80.400002,30.799999,09:53:25,12.0,9.0,0,1,0,...,0,0,0,0,0,0,7,5,4,5
482697,2023-11-12 09:53:26,74.9,80.400002,30.799999,09:53:26,12.0,9.0,0,1,0,...,0,0,0,0,0,0,7,5,4,5


In [75]:
df.to_csv('Out.csv')