# Machine Learning baseline

In [1]:
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from fastcore.basics import Path, AttrDict
import numpy as np
import pickle

# This is used to import the evaluation script, not needed for training
import sys
sys.path.append('../') 
# import evaluation


In [2]:
# Notebook-wide settings; entries are read as attributes (e.g. config.lag_steps).
config = AttrDict({
    # Dataset directories, one per numbered stage under ../../data.
    'challenge_data_raw_dir': Path('../../data/1_dataset_raw/'),
    'challenge_data_clean_dir': Path('../../data/2_dataset_clean/'),
    'challenge_data_processed_dir': Path('../../data/3_dataset_processed/'),
    'challenge_data_features_dir': Path('../../data/4_dataset_features/'),
    # Numeric knobs (valid_ratio / lag_steps are not consumed by the cells shown here).
    'valid_ratio': 0.1,
    'lag_steps': 5,
    'tolerance': 6,  # Default evaluation tolerance
})

In [3]:
# Names of the trajectory columns used as features, grouped by kind.
feature_cols = (
    # Orbital elements
    [
        "Eccentricity",
        "Semimajor Axis (m)",
        "Inclination (deg)",
        "RAAN (deg)",
        "Argument of Periapsis (deg)",
        "True Anomaly (deg)",
    ]
    # Latitude / longitude / altitude
    + ["Latitude (deg)", "Longitude (deg)", "Altitude (m)"]
    # Cartesian position components
    + ["X (m)", "Y (m)", "Z (m)"]
    # Cartesian velocity components
    + ["Vx (m/s)", "Vy (m/s)", "Vz (m/s)"]
)

In [4]:
# Folder that holds the training trajectory CSV files.
train_data_dir = config.challenge_data_raw_dir.joinpath("train")
# Ground-truth label table for the training set.
ground_truth = pd.read_csv(config.challenge_data_raw_dir.joinpath('train_labels.csv'))


In [5]:
import pandas as pd
from fastcore.basics import Path

In [6]:
def _direction_labels(object_labels, direction):
    """Return one direction's labels with a combined '<Node>-<Type>' column.

    Args:
        object_labels: Ground-truth rows of a single object; must contain the
            'Direction', 'Node', 'Type' and 'TimeIndex' columns.
        direction: Value of 'Direction' to keep ('EW' or 'NS'). It is also
            used as the name of the combined label column.

    Returns:
        A new DataFrame sorted by 'TimeIndex' in which 'Node', 'Type' and
        'Direction' are replaced by a single column named after `direction`.
    """
    subset = object_labels[object_labels['Direction'] == direction]
    return (
        subset
        .assign(**{direction: subset['Node'] + '-' + subset['Type']})
        .drop(columns=['Node', 'Type', 'Direction'])
        .sort_values('TimeIndex')
    )


def merge_ground_truth_with_trajectories(train_data_dir, ground_truth, number_of_files_to_process=10):
    """Attach the EW / NS ground-truth labels to per-object trajectory CSVs.

    Every '*.csv' file in `train_data_dir` is read as one object's trajectory.
    The file stem is parsed as the integer 'ObjectID' and rows are numbered
    0..n-1 in a 'TimeIndex' column. The object's labels are then left-joined
    on ('TimeIndex', 'ObjectID'), so rows without a label hold NaN in the
    'EW' / 'NS' columns.

    Args:
        train_data_dir: Directory containing the trajectory CSV files.
        ground_truth: DataFrame with the columns 'ObjectID', 'TimeIndex',
            'Direction', 'Node' and 'Type'.
        number_of_files_to_process: Maximum number of files to read. Values
            <= 0 select no file; None selects every file.

    Returns:
        The per-file merged frames concatenated in file order (each file keeps
        its own 0-based index), or an empty DataFrame when nothing was read.

    Raises:
        ValueError: If a selected file's stem is not an integer.
    """
    # glob() yields files in a filesystem-dependent order; sorting makes the
    # "first N files" subset the same on every run and every machine.
    data_files = sorted(Path(train_data_dir).glob('*.csv'))
    if number_of_files_to_process is not None:
        data_files = data_files[:max(int(number_of_files_to_process), 0)]

    join_keys = ['TimeIndex', 'ObjectID']
    merged_frames = []
    for data_file in data_files:
        object_id = int(data_file.stem)
        data_df = pd.read_csv(data_file)
        if data_df.empty:
            # A header-only file contributes no rows; skip it instead of
            # failing on a lookup of its (non-existent) first row.
            continue
        data_df['ObjectID'] = object_id
        data_df['TimeIndex'] = range(len(data_df))

        object_labels = ground_truth[ground_truth['ObjectID'] == object_id]

        # Left joins keep every trajectory row; unlabeled rows get NaN.
        merged_df = pd.merge(data_df,
                             _direction_labels(object_labels, 'EW'),
                             on=join_keys,
                             how='left')
        merged_df = pd.merge_ordered(merged_df,
                                     _direction_labels(object_labels, 'NS'),
                                     on=join_keys,
                                     how='left')
        merged_frames.append(merged_df)

    if not merged_frames:
        return pd.DataFrame()
    # Concatenate once at the end instead of growing a frame inside the loop.
    return pd.concat(merged_frames)
    
# Build four merged datasets of increasing size (500 / 1000 / 1500 / 2000 files at most).
df_merged_small, df_merged_medium, df_merged_large, df_merged_full = (
    merge_ground_truth_with_trajectories(
        train_data_dir,
        ground_truth,
        number_of_files_to_process=file_limit,
    )
    for file_limit in (500, 1000, 1500, 2000)
)



In [8]:
# Create the clean-data directory (and any missing parents) before writing to it;
# exist_ok=True makes this a no-op when the directory is already there.
config.challenge_data_clean_dir.mkdir(parents=True, exist_ok=True)

def export_df_to_csv(df, name):
    """Write `df` as CSV to `name` (a path or writable buffer) without the index."""
    df.to_csv(path_or_buf=name, index=False)
    
# Write each merged dataset to its own CSV in the clean-data directory.
merged_exports = {
    'df_merged_small.csv': df_merged_small,
    'df_merged_medium.csv': df_merged_medium,
    'df_merged_large.csv': df_merged_large,
    'df_merged_full.csv': df_merged_full,
}
for csv_name, merged_frame in merged_exports.items():
    export_df_to_csv(merged_frame, config.challenge_data_clean_dir / csv_name)
    
