In [1]:
import pandas as pd
import csv
import numpy as np
import os

In [2]:
# Change the values that need to be changed manually here
station_coord_rects = {
    'Xavier': { 
        'Latitude': { 
            'lower': 14.640004,
            'higher': 14.640371
        },
        'Longitude': { 
            'lower': 121.078251,
            'higher': 121.078789
        }
    },
    'Old Comm': { 
        'Latitude': { 
            'lower': 14.636606,
            'higher': 14.636749
        },
        'Longitude': { 
            'lower': 121.078116,
            'higher': 121.078324
        }
    },
    'LST': { 
        'Latitude': { 
            'lower': 14.636115,
            'higher': 14.636302
        },
        'Longitude': { 
            'lower': 121.080644,
            'higher': 121.080738
        }
    },
    'Grade School': { 
        'Latitude': { 
            'lower': 14.634715,
            'higher': 14.634715
        },
        'Longitude': { 
            'lower': 121.076171,
            'higher': 121.076326
        }
    },
    'JSEC': { 
        'Latitude': { 
            'lower': 14.637686,
            'higher': 14.637804
        },
        'Longitude': { 
            'lower': 121.076260,
            'higher': 121.076370
        }
    },
    'Gate 2.5': { 
        'Latitude': { 
            'lower': 14.637874,
            'higher': 14.637927
        },
        'Longitude': { 
            'lower': 121.074848,
            'higher': 121.075020
        }
    },
    'Leong': { 
        'Latitude': { 
            'lower': 14.640724,
            'higher': 14.640864
        },
        'Longitude': { 
            'lower': 121.076177,
            'higher': 121.076395
        }
    },
    'Power Station': { 
        'Latitude': { 
            'lower': 14.635504,
            'higher': 14.635692
        },
        'Longitude': { 
            'lower': 121.074559,
            'higher': 121.074674
        }
    },
}

# Raw Data
* Combine csv files from each rpi into one data frame
* Pre-process raw data

In [3]:
# Reformat Lat and Long to be decimal coordinates
def ddmm_mmmm_to_decimal(coord):
    # Assuming coord is in the format 'ddmm.mmmm'
    
    # Extract degrees and decimal minutes
    degrees = coord // 100
    decimal_minutes = coord % 100
    
    # Convert decimal minutes to decimal degrees
    decimal_degrees = degrees + decimal_minutes / 60
    
    return decimal_degrees

In [4]:
# Label stations along the line that the ejeep is in based on coordinates
def coord_to_station(lat, long) -> str:
    stations = station_coord_rects.keys()

    for station in stations:
        lat_bounds, long_bounds = station_coord_rects[station]['Latitude'], station_coord_rects[station]['Longitude']

        if (lat >= lat_bounds['lower'] - 0.0003 and lat <= lat_bounds['higher'] + 0.0003) and (long >= long_bounds['lower'] - 0.0003 and long <= long_bounds['higher'] + 0.0003):
            return station
    else:
        return "En Route"

In [5]:
import math

# Function to calculate distance using Haversine formula
def haversine_distance(lat_diff, lon_diff):
    # Radius of the Earth in kilometers
    R = 6371.0

    # Convert latitude and longitude differences from degrees to radians
    lat_diff_rad = math.radians(lat_diff)
    lon_diff_rad = math.radians(lon_diff)

    # Haversine formula
    a = math.sin(lat_diff_rad / 2)**2 + math.cos(0) * math.cos(0) * math.sin(lon_diff_rad / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    # Calculate distance
    distance = R * c

    return distance * 1000

In [56]:
# Make dictionary of all stations & average lat & long
def station_centroids(coord) -> dict:
    new_dict = dict()

    new_dict['En Route'] = None

    for station in station_coord_rects:
        ave = (station_coord_rects[station][coord]['lower'] + station_coord_rects[station][coord]['higher']) / 2

        new_dict[station] = ave
    
    return new_dict

lat_centers = station_centroids('Latitude')
long_centers = station_centroids('Longitude')

lat_centers
long_centers

{'En Route': None,
 'Xavier': 121.07852,
 'Old Comm': 121.07821999999999,
 'LST': 121.080691,
 'Grade School': 121.07624849999999,
 'JSEC': 121.076315,
 'Gate 2.5': 121.074934,
 'Leong': 121.07628600000001,
 'Power Station': 121.07461649999999}

In [7]:
def process_station(file_path):
     df = pd.read_csv(file_path)
     df.tail(10)

     df.columns = ['Type', 'Time', 'Lat', 'Long', 'Altitude', 'Humidity', 'Temperature']
     df.drop(columns='Type', inplace=True)

     # Drop rows with null at start and end
     df = df.loc[df.notnull().all(axis=1).argmax():]
     id_not_null = df.notnull().all(axis=1).idxmax()

     ids = df[df.notnull().all(axis=1)].index
     last_id = ids[-1]

     df = df.loc[:last_id]
            
     # Fix time to PHT
     log_date = os.path.basename(file_path).split('/')[-1]
     df['Time'] = pd.to_datetime(df['Time'])
     df['Time'] = df['Time'].dt.tz_convert('+08:00')
     df['Time'] = pd.to_datetime(df['Time'].dt.strftime(f"{log_date[5:15]} %H:%M:%S"))

     # Add day of week
     df['Day_of_Week'] = df['Time'].dt.day_name()
            
     # Add hour of day
     df['Hour_of_Day'] = df['Time'].dt.hour

     # Iterpolate values for NaN GPS values
     df['Lat'] = df['Lat'].astype(float).interpolate(method='linear', limit_direction='both') 
     df['Long'] = df['Long'].astype(float).interpolate(method='linear', limit_direction='both') 
     df['Altitude'] = df['Altitude'].astype(float).interpolate(method='linear', limit_direction='both') 
            
     df['Latitude'] = df.apply(lambda row: ddmm_mmmm_to_decimal(row['Lat']), axis=1)
     df['Longitude'] = df.apply(lambda row: ddmm_mmmm_to_decimal(row['Long']), axis=1)

     # Drop old Lat / Long values
     df.drop(columns=['Lat', 'Long'], inplace=True)
            
     df['Station'] = df.apply(lambda row: coord_to_station(row['Latitude'], row['Longitude']), axis=1)
            
     # Calculate speed (noted as distance since it's by second anyway) based on change in coordinates from the previous
     # Helps establish if ejeep is truly within station

     df['Lat Diff'] = df['Latitude'].diff()
     df['Long Diff'] = df['Longitude'].diff()

     df['Distance'] = df.apply(lambda row: haversine_distance(row['Lat Diff'], row['Long Diff']), axis=1)
     
     df['Lat Diff'] = df['Lat Diff'].fillna(0)
     df['Long Diff'] = df['Long Diff'].fillna(0)
     df['Distance'] = df['Distance'].fillna(0)
     
     df['Station'] = df.apply(lambda row: 'En Route' if row['Station'] != 'En Route' and row['Distance'] >= 0.5 else row['Station'], axis=1)
            
     # Add station column that's true if in station, false if en route
     df["IsStation"] = 0
     df.loc[(df["Station"] != "En Route") & (df["Station"] != "Power Station"), "IsStation"] = 1
     
     # Drop rows with before first and last station of each day
     start = df.loc[df.IsStation==1,'IsStation'].index[0] - id_not_null
     end = df.query('IsStation == 1').index.max()+1
     df = df.iloc[start:end]
     
     return df

In [99]:
df = process_station('E-Jeep Data/rpi1/data_2023-10-23.csv')
df


  df = pd.read_csv(file_path)
  df['Time'] = pd.to_datetime(df['Time'])


Unnamed: 0,Time,Altitude,Humidity,Temperature,Day_of_Week,Hour_of_Day,Latitude,Longitude,Station,Lat Diff,Long Diff,Distance,IsStation
169,2023-10-23 17:10:42,112.5,64.199997,32.799999,Monday,17,14.638039,121.078317,En Route,0.000000e+00,0.000000,0.000000,0
170,2023-10-23 17:10:43,112.4,64.199997,32.799999,Monday,17,14.638016,121.078295,En Route,-2.216667e-05,-0.000022,3.498912,0
171,2023-10-23 17:10:44,112.2,64.199997,32.799999,Monday,17,14.637972,121.078266,En Route,-4.450000e-05,-0.000029,5.896070,0
172,2023-10-23 17:10:45,112.1,81.300003,29.299999,Monday,17,14.637934,121.078235,En Route,-3.800000e-05,-0.000031,5.476593,0
173,2023-10-23 17:10:46,111.9,81.300003,29.299999,Monday,17,14.637892,121.078200,En Route,-4.150000e-05,-0.000035,6.060577,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1555,2023-10-23 17:37:57,231.0,61.700001,35.200001,Monday,17,14.637870,121.075285,En Route,-2.500000e-06,0.000013,1.435640,0
1556,2023-10-23 17:37:58,235.5,61.700001,35.200001,Monday,17,14.637872,121.075298,En Route,2.166667e-06,0.000013,1.483757,0
1557,2023-10-23 17:37:59,242.0,61.700001,35.200001,Monday,17,14.637876,121.075321,En Route,3.333333e-06,0.000023,2.529192,0
1558,2023-10-23 17:38:00,246.9,61.700001,35.200001,Monday,17,14.637876,121.075335,En Route,6.666667e-07,0.000014,1.539982,0


In [37]:
# Create a list to hold the dataframes
df_list = []

for i in range(3):
    i = i + 1
    folder_path = os.path.join('E-Jeep Data', f"rpi{i}")
    all_files = os.listdir(folder_path)
    
    # Filter out non-CSV files
    csv_files = [f for f in all_files if f.endswith('.csv')]
    
    for csv in csv_files:
        file_path = os.path.join(folder_path, csv)
        try:
            # Try reading the file using default UTF-8 encoding
            df = process_station(file_path)
            df_list.append(df)
        except UnicodeDecodeError:
            try:
                # If UTF-8 fails, try reading the file using UTF-16 encoding with tab separator
                df = pd.read_csv(file_path, sep='\t', encoding='utf-16')
                df_list.append(df)
            except Exception as e:
                print(f"Could not read file {csv} because of error: {e}")
        except Exception as e:
            print(f"Could not read file {csv} because of error: {e}")

# Concatenate all data into one DataFrame
df = pd.concat(df_list, ignore_index=True)
df

  df = pd.read_csv(file_path)
  df['Time'] = pd.to_datetime(df['Time'])
  df['Time'] = pd.to_datetime(df['Time'])
  df['Time'] = pd.to_datetime(df['Time'])
  df['Time'] = pd.to_datetime(df['Time'])
  df['Time'] = pd.to_datetime(df['Time'])
  df['Time'] = pd.to_datetime(df['Time'])
  df['Time'] = pd.to_datetime(df['Time'])
  df['Time'] = pd.to_datetime(df['Time'])


Could not read file data_2023-11-13.csv because of error: index 0 is out of bounds for axis 0 with size 0


  df['Time'] = pd.to_datetime(df['Time'])
  df['Time'] = pd.to_datetime(df['Time'])
  df['Time'] = pd.to_datetime(df['Time'])
  df['Time'] = pd.to_datetime(df['Time'])
  df['Time'] = pd.to_datetime(df['Time'])


Could not read file data_2023-11-13_2.csv because of error: index 0 is out of bounds for axis 0 with size 0


  df['Time'] = pd.to_datetime(df['Time'])


Could not read file data_2023-11-14.csv because of error: index 0 is out of bounds for axis 0 with size 0


  df['Time'] = pd.to_datetime(df['Time'])
  df['Time'] = pd.to_datetime(df['Time'])


Could not read file data_2023-11-13.csv because of error: index 0 is out of bounds for axis 0 with size 0
Could not read file data_2023-11-14_3.csv because of error: index 0 is out of bounds for axis 0 with size 0


  df['Time'] = pd.to_datetime(df['Time'])
  df['Time'] = pd.to_datetime(df['Time'])


Unnamed: 0,Time,Altitude,Humidity,Temperature,Day_of_Week,Hour_of_Day,Latitude,Longitude,Station,Lat Diff,Long Diff,Distance,IsStation
0,2023-10-23 17:12:39,79.0,82.500000,29.299999,Monday,17.0,14.636047,121.080516,LST,-0.000004,0.000002,0.481845,1
1,2023-10-23 17:12:40,78.7,82.500000,29.299999,Monday,17.0,14.636049,121.080524,En Route,0.000002,0.000008,0.963155,0
2,2023-10-23 17:12:41,78.4,82.500000,29.299999,Monday,17.0,14.636050,121.080532,En Route,0.000002,0.000008,0.890524,0
3,2023-10-23 17:12:42,78.0,82.500000,29.299999,Monday,17.0,14.636052,121.080540,En Route,0.000002,0.000008,0.930693,0
4,2023-10-23 17:12:43,77.7,82.500000,29.299999,Monday,17.0,14.636055,121.080548,En Route,0.000003,0.000009,0.985190,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
482694,2023-11-12 09:53:23,75.1,80.400002,30.799999,Sunday,9.0,14.638046,121.078363,En Route,-0.000027,-0.000016,3.423096,0
482695,2023-11-12 09:53:24,75.5,80.400002,30.799999,Sunday,9.0,14.638012,121.078345,En Route,-0.000034,-0.000018,4.310547,0
482696,2023-11-12 09:53:25,75.3,80.400002,30.799999,Sunday,9.0,14.637973,121.078317,En Route,-0.000039,-0.000028,5.338515,0
482697,2023-11-12 09:53:26,74.9,80.400002,30.799999,Sunday,9.0,14.637933,121.078286,En Route,-0.000040,-0.000031,5.664548,0


In [38]:
# Add IsCharging column: true if charging, false otherwise
df['IsCharging'] = 0

df.loc[df['Station'] == 'Power Station', "IsCharging"] = 1

In [39]:
# Define Prev Station
def prev_station():
    df['Prev Station'] = df['Station']
    df.loc[df['Station'] == 'En Route', "Prev Station"] = np.nan

    df['Prev Station'] = df['Prev Station'].fillna(method='ffill')
    df.loc[df['Station'] != 'En Route', "Prev Station"] = np.nan
    df['Prev Station'] = df['Prev Station'].fillna(method='ffill')
    df['Prev Station'].iloc[0] = df['Prev Station'].iloc[1]

prev_station()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Prev Station'].iloc[0] = df['Prev Station'].iloc[1]


In [40]:
# Define Next Station
def next_station():
    df['Next Station'] = df['Station']
    df.loc[df['Station'] == 'En Route', "Next Station"] = np.nan

    df['Next Station'] = df['Next Station'].fillna(method='bfill')
    df.loc[df['Station'] != 'En Route', "Next Station"] = np.nan
    df['Next Station'] = df['Next Station'].fillna(method='bfill')
    df['Next Station'].iloc[0] = df['Next Station'].iloc[1]
    
next_station()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Next Station'].iloc[0] = df['Next Station'].iloc[1]


In [41]:
# Filter out repeated stations

df['Station'] = df.apply(lambda x: x['Next Station'] if x['Station'] == 'En Route' and x['Prev Station'] == x['Next Station'] else x['Station'], axis=1)

prev_station()
next_station()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Prev Station'].iloc[0] = df['Prev Station'].iloc[1]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Next Station'].iloc[0] = df['Next Station'].iloc[1]


In [42]:
# Add distances (by road) to next station
df['Cum Distance'] = df[::-1].groupby((df['Station'] != 'En Route').cumsum())['Distance'].cumsum()
df.loc[df['Station'] != 'En Route', 'Cum Distance'] = 0

df.dtypes


Time            datetime64[ns]
Altitude               float64
Humidity               float64
Temperature            float64
Day_of_Week             object
Hour_of_Day            float64
Latitude               float64
Longitude              float64
Station                 object
Lat Diff               float64
Long Diff              float64
Distance               float64
IsStation                int64
IsCharging               int64
Prev Station            object
Next Station            object
Cum Distance           float64
dtype: object

In [57]:
# Add distances (as the bird flies) to next station
df['Next Station Lat'] = df['Next Station'].map(lat_centers)
df['Next Station Long'] = df['Next Station'].map(long_centers)

df['Abs Distance'] = df.apply(lambda row: haversine_distance(row['Next Station Lat'] - row['Latitude'], row['Next Station Long'] - row['Longitude']), axis=1)

In [25]:
# Do percentage of distance remaining to next station

# Calculate the maximum value of C within each group of 'X'
max_distance_per_group = df[df['Station'] == 'En Route'].groupby((df['Station'] != 'En Route').cumsum())['Cum Distance'].transform('max')

# Add column D representing the percentage of each value in column C relative to the maximum value within its group
df['Percent Distance'] = df['Cum Distance'] / max_distance_per_group * 100

df.dtypes


Time                 datetime64[ns]
Altitude                    float64
Humidity                    float64
Temperature                 float64
Day_of_Week                  object
Hour_of_Day                 float64
Latitude                    float64
Longitude                   float64
Station                      object
Lat Diff                    float64
Long Diff                   float64
Distance                    float64
IsStation                     int64
Cum Distance                float64
Next Station Lat            float64
Next Station Long           float64
Abs Distance                float64
Percent Distance            float64
dtype: object

In [58]:
# Add distance (as the crow flies) to the previous station
df['Prev Station Lat'] = df['Prev Station'].map(lat_centers)
df['Prev Station Long'] = df['Prev Station'].map(long_centers)

df['Prev Abs Distance'] = df.apply(lambda row: haversine_distance(row['Prev Station Lat'] - row['Latitude'], row['Prev Station Long'] - row['Longitude']), axis=1)

In [26]:
# Add distance (by road) to the previous station
df['Prev Cum Distance'] = df.groupby((df['Station'] != 'En Route').cumsum())['Distance'].cumsum()
df.loc[df['Station'] != 'En Route', 'Prev Cum Distance'] = 0