# Machine Learning baseline

In [4]:
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from fastcore.basics import Path, AttrDict
import numpy as np
import pickle

# This is used to import the evaluation script, not needed for training
import sys
sys.path.append('../') 
# import evaluation


In [5]:
# Notebook-wide settings: one directory per dataset stage under a shared
# data root, followed by scalar parameters.
data_root = Path('../../data')

config = AttrDict(
    challenge_data_raw_dir=data_root / '1_dataset_raw',
    challenge_data_clean_dir=data_root / '2_dataset_clean',
    challenge_data_processed_dir=data_root / '3_dataset_processed',
    challenge_data_features_dir=data_root / '4_dataset_features',
    valid_ratio=0.1,
    lag_steps=5,
    tolerance=6,  # Default evaluation tolerance
)

In [6]:
# Names of the trajectory CSV columns used as model features, grouped by kind.
feature_cols = (
    # Orbital elements
    [
        "Eccentricity",
        "Semimajor Axis (m)",
        "Inclination (deg)",
        "RAAN (deg)",
        "Argument of Periapsis (deg)",
        "True Anomaly (deg)",
    ]
    # Geodetic position
    + ["Latitude (deg)", "Longitude (deg)", "Altitude (m)"]
    # Cartesian position, then velocity components
    + [f"{axis} (m)" for axis in "XYZ"]
    + [f"V{axis} (m/s)" for axis in "xyz"]
)

In [8]:
# Both the per-object trajectory CSVs and the label file live under the raw-data directory.
raw_dir = config.challenge_data_raw_dir
train_data_dir = raw_dir / "train"

# Ground-truth labels for the training objects
ground_truth = pd.read_csv(raw_dir / 'train_labels.csv')

In [19]:
# Ground-truth rows of a single object (ID 545), shown as a worked example
is_object_545 = ground_truth["ObjectID"] == 545
ground_truth_545 = ground_truth[is_object_545]
ground_truth_545

Unnamed: 0,ObjectID,TimeIndex,Direction,Node,Type
887,545,0,EW,SS,NK
888,545,0,NS,SS,NK
889,545,1282,EW,AD,NK
890,545,1289,EW,IK,EK
891,545,1343,NS,IK,EK
892,545,2171,ES,ES,ES


In [22]:
# Trajectory file of object 545, resolved from the configured raw-data
# directory instead of a machine-specific absolute path so the notebook
# runs on any checkout of the project.
input_545_path = config.challenge_data_raw_dir / "train" / "545.csv"
df_input_545 = pd.read_csv(input_545_path)

In [23]:
# Show the raw trajectory of object 545 (the rendered output elides the middle rows)
df_input_545

Unnamed: 0,Timestamp,Eccentricity,Semimajor Axis (m),Inclination (deg),RAAN (deg),Argument of Periapsis (deg),True Anomaly (deg),Latitude (deg),Longitude (deg),Altitude (m),X (m),Y (m),Z (m),Vx (m/s),Vy (m/s),Vz (m/s)
0,2022-09-01 00:00:00.000000Z,0.000127,4.216594e+07,0.118065,89.710324,202.940616,216.180960,-0.004704,168.980008,3.579214e+07,-3.608303e+07,2.182526e+07,74580.151140,-1590.894015,-2630.634553,3.250794
1,2022-09-01 02:00:00.000000Z,0.005056,4.238466e+07,0.118038,89.650304,60.229587,29.090881,-0.006828,169.036746,3.581902e+07,-4.219026e+07,7.579635e+05,86926.450274,-62.865297,-3079.607106,0.090788
2,2022-09-01 04:00:00.000000Z,0.005062,4.238501e+07,0.118114,89.518050,60.424355,59.091844,-0.007215,169.018089,3.589587e+07,-3.696135e+07,-2.051694e+07,75836.682875,1480.580493,-2694.723791,-3.098809
3,2022-09-01 06:00:00.000000Z,0.005054,4.238422e+07,0.118364,89.395612,60.350090,89.219001,-0.005842,168.866409,3.600208e+07,-2.184975e+07,-3.631348e+07,44344.181267,2619.892981,-1594.478385,-5.446713
4,2022-09-01 08:00:00.000000Z,0.005044,4.238313e+07,0.118658,89.353924,60.060612,119.326647,-0.003103,168.560755,3.610888e+07,-9.333845e+05,-4.247677e+07,940.970165,3058.128946,-80.689078,-6.334788
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2167,2023-02-28 14:00:00.000000Z,0.000038,4.216558e+07,0.416532,88.646597,156.947148,351.227505,0.148227,-131.015447,3.578587e+07,-2.307379e+07,-3.528991e+07,161639.665534,2573.383850,-1682.634702,-18.992175
2168,2023-02-28 16:00:00.000000Z,0.000045,4.216583e+07,0.416949,88.629789,176.429868,1.844575,0.003766,-131.014910,3.578581e+07,-2.277033e+06,-4.210242e+07,9239.296098,3070.168255,-166.053808,-22.364934
2169,2023-02-28 18:00:00.000000Z,0.000044,4.216589e+07,0.415641,88.591980,191.957018,16.438220,-0.141445,-131.014378,3.578596e+07,1.913319e+07,-3.757275e+07,-145456.019764,2739.860130,1395.253462,-19.621400
2170,2023-02-28 20:00:00.000000Z,0.000033,4.216562e+07,0.415933,88.629053,204.731687,33.709047,-0.248343,-131.013951,3.578633e+07,3.538909e+07,-2.292130e+07,-260815.457099,1671.460080,2580.661583,-11.682303


In [24]:
import pandas as pd
from fastcore.basics import Path

In [25]:
def _direction_labels(object_truth, direction):
    """Build the label rows of one direction ('EW' or 'NS') for a single object.

    Returns a new frame sorted by 'TimeIndex' in which the 'Node', 'Type' and
    'Direction' columns are replaced by one column named after `direction`
    holding the string '<Node>-<Type>'. `object_truth` is not modified.
    """
    rows = object_truth[object_truth['Direction'] == direction]
    return (
        rows.assign(**{direction: rows['Node'] + '-' + rows['Type']})
            .drop(columns=['Node', 'Type', 'Direction'])
            .sort_values('TimeIndex')
    )


def merge_ground_truth_with_trajectories(train_data_dir, ground_truth, number_of_files_to_process=10):
    """Attach ground-truth 'EW' / 'NS' labels to per-object trajectory files.

    Every '<ObjectID>.csv' file in `train_data_dir` is read, given an
    'ObjectID' column (taken from the file name) and a 'TimeIndex' column
    (0..n-1 in row order), and left-merged with the ground-truth rows of that
    object. Rows without a ground-truth entry keep NaN in 'EW' / 'NS'.

    Parameters
    ----------
    train_data_dir : str or Path
        Directory containing one CSV per object, named by integer ObjectID.
    ground_truth : pandas.DataFrame
        Must contain the columns 'ObjectID', 'TimeIndex', 'Direction',
        'Node' and 'Type'. It is not modified.
    number_of_files_to_process : int or None, default 10
        Maximum number of files to read. ``None`` processes every file;
        values <= 0 process none.

    Returns
    -------
    pandas.DataFrame
        The per-object frames stacked in ascending ObjectID order. Each
        object's rows keep their own 0..n-1 index. An empty DataFrame is
        returned when no file is processed.

    Raises
    ------
    ValueError
        If a CSV file name in the directory is not an integer ObjectID.
    """
    # Path.glob yields files in arbitrary, filesystem-dependent order; sorting
    # by numeric ObjectID makes the "first N files" subset reproducible.
    csv_files = sorted(Path(train_data_dir).glob('*.csv'), key=lambda p: int(p.stem))
    if number_of_files_to_process is not None:
        csv_files = csv_files[:max(0, number_of_files_to_process)]

    per_object_frames = []
    for data_file in csv_files:
        object_id = int(data_file.stem)

        data_df = pd.read_csv(data_file)
        data_df['ObjectID'] = object_id
        data_df['TimeIndex'] = range(len(data_df))

        object_truth = ground_truth[ground_truth['ObjectID'] == object_id]

        # Left-merge the labels of each direction so every trajectory row is
        # kept; no filling is done, unlabeled time steps stay NaN.
        merged_df = data_df
        for direction in ('EW', 'NS'):
            merged_df = pd.merge(
                merged_df,
                _direction_labels(object_truth, direction),
                on=['TimeIndex', 'ObjectID'],
                how='left',
            )

        per_object_frames.append(merged_df)

    if not per_object_frames:
        return pd.DataFrame()

    # Concatenate once at the end; growing a DataFrame inside the loop copies
    # all previously accumulated rows on every iteration.
    return pd.concat(per_object_frames)


In [15]:
# Merged trajectories + labels built from at most 500 trajectory files
df_merged_small = merge_ground_truth_with_trajectories(
    train_data_dir,
    ground_truth,
    number_of_files_to_process=500,
)

In [27]:
# Merged rows belonging to object 545 only
df_merged_small_sample = df_merged_small[df_merged_small["ObjectID"] == 545]

In [31]:
# Leave out the 'EW' column, then keep only the rows with no missing values
df_merged_small_sample.drop(columns='EW').dropna()

Unnamed: 0,Timestamp,Eccentricity,Semimajor Axis (m),Inclination (deg),RAAN (deg),Argument of Periapsis (deg),True Anomaly (deg),Latitude (deg),Longitude (deg),Altitude (m),X (m),Y (m),Z (m),Vx (m/s),Vy (m/s),Vz (m/s),ObjectID,TimeIndex,NS
0,2022-09-01 00:00:00.000000Z,0.000127,42165940.0,0.118065,89.710324,202.940616,216.18096,-0.004704,168.980008,35792140.0,-36083030.0,21825260.0,74580.15114,-1590.894015,-2630.634553,3.250794,545,0,SS-NK
1343,2022-12-21 22:00:00.000000Z,3.9e-05,42166010.0,0.392923,86.285362,73.271121,129.5889,-0.112588,-131.013161,35788920.0,13828990.0,-39834740.0,-112337.519833,2904.427348,1008.255377,-19.428488,545,1343,IK-EK


In [None]:

# Build the three larger merged frames, capped at 1000, 1500 and 2000
# trajectory files respectively (evaluated in that order).
df_merged_medium, df_merged_large, df_merged_full = (
    merge_ground_truth_with_trajectories(
        train_data_dir,
        ground_truth,
        number_of_files_to_process=file_limit,
    )
    for file_limit in (1000, 1500, 2000)
)



In [8]:
# Create the clean-data directory (and any missing parents); a directory
# that already exists is left untouched.
clean_dir = config.challenge_data_clean_dir
clean_dir.mkdir(exist_ok=True, parents=True)

def export_df_to_csv(df, name):
    """Write `df` to the CSV file `name`, leaving out the index column."""
    df.to_csv(path_or_buf=name, index=False)
    
# Write every merged frame to the clean-data directory under its own file name.
for merged_frame, csv_name in (
    (df_merged_small, 'df_merged_small.csv'),
    (df_merged_medium, 'df_merged_medium.csv'),
    (df_merged_large, 'df_merged_large.csv'),
    (df_merged_full, 'df_merged_full.csv'),
):
    export_df_to_csv(merged_frame, config.challenge_data_clean_dir / csv_name)
    
