# Machine Learning Isaac

In [1]:
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from fastcore.basics import Path, AttrDict
import utils_isaac as utils
import numpy as np
import pickle
from datetime import datetime
from catboost import CatBoostRegressor, CatBoostClassifier
from tqdm import tqdm

# This is used to import the evaluation script, not needed for training
import sys
sys.path.append('../') 
import evaluation

In [2]:
# Notebook-wide settings; AttrDict exposes each key as an attribute (config.lag_steps, ...).
config = AttrDict({
    'challenge_data_dir': Path('phase_1_v3'),  # folder holding train/ and train_labels.csv
    'valid_ratio': 0.00001,                    # share of ObjectIDs held out for validation
    'lag_steps': 6,                            # forwarded to utils.tabularize_data
    'tolerance': 6,                            # Default evaluation tolerance
})

In [3]:
# Raw per-timestep columns handed to the tabularization helper as features.
# The order is kept exactly as in the original flat list.
feature_cols = [
    # orbital elements
    "Eccentricity", "Semimajor Axis (m)", "Inclination (deg)",
    "RAAN (deg)", "Argument of Periapsis (deg)", "True Anomaly (deg)",
    # latitude / longitude / altitude
    "Latitude (deg)", "Longitude (deg)", "Altitude (m)",
    # position components
    "X (m)", "Y (m)", "Z (m)",
    # velocity components
    "Vx (m/s)", "Vy (m/s)", "Vz (m/s)",
]

In [6]:
# Load the label table and locate the folder holding the training files.
ground_truth = pd.read_csv(config.challenge_data_dir / 'train_labels.csv')
train_data_dir = config.challenge_data_dir / "train"

# Build one table from the training folder, the raw feature list and the labels.
# `updated_feature_cols` returned here is recomputed in a later cell.
data, updated_feature_cols = utils.tabularize_data(
    train_data_dir,
    feature_cols,
    ground_truth,
    lag_steps=config.lag_steps,
    add_heurestic=False,  # keyword spelled as in the original call
)

# Rows with no EW/NS label receive the explicit placeholder class 'Nothing'.
for direction in ('EW', 'NS'):
    data[direction] = data[direction].fillna('Nothing')

In [None]:
#data[['ObjectID','Timestamp','EW_baseline_heuristic','NS_baseline_heuristic','EW_baseline_heuristic_ffill','NS_baseline_heuristic_ffill']].to_pickle('Data_With_Baseline.pkl')
#data.to_pickle('Data_With_Nothing.pkl')

In [8]:
# Heuristic columns read from the cached pickle. That file is only written by
# the commented-out `to_pickle` cell above, so it must already exist on disk.
baseline_cols = ['EW_baseline_heuristic', 'NS_baseline_heuristic',
                 'EW_baseline_heuristic_ffill', 'NS_baseline_heuristic_ffill']
data_baseline = pd.read_pickle('Data_With_Nothing.pkl')[['ObjectID', 'Timestamp', *baseline_cols]]
# data = pd.read_pickle('Data_With_Nothing.pkl')

# Attach the heuristic columns to every (ObjectID, Timestamp) row of `data`.
data = data.merge(data_baseline, on=['ObjectID', 'Timestamp'], how='left')

# Append the dummy encoding of each heuristic column, one column at a time and
# in the same order as before.
data = pd.concat(
    [data, *(pd.get_dummies(data[[col]]) for col in baseline_cols)],
    axis=1,
)

In [9]:
# The heuristic columns were merged into `data`; drop the standalone frame's
# name so the object can be garbage-collected.
del data_baseline

In [10]:
# Sanity check: (rows, columns) of the table after the merge and dummy encoding.
data.shape

(4129211, 291)

In [11]:
# Every column of `data` is a model input except the index/identifier columns,
# the raw labels and the raw heuristic columns (their dummy columns stay in).
non_feature_cols = [
    'TimeIndex', 'Timestamp', 'ObjectID',
    'EW', 'NS',
    'EW_baseline_heuristic', 'NS_baseline_heuristic',
    'EW_baseline_heuristic_ffill', 'NS_baseline_heuristic_ffill',
]
updated_feature_cols = list(data.columns)
for excluded in non_feature_cols:
    # list.remove raises ValueError if an expected column is missing
    updated_feature_cols.remove(excluded)

In [12]:
# Number of columns that will be fed to the classifiers.
len(updated_feature_cols)

282

In [13]:
# Split at the object level so that no ObjectID contributes rows to both
# partitions.
object_ids = data['ObjectID'].unique()
train_ids, valid_ids = train_test_split(
    object_ids,
    test_size=config.valid_ratio,
    random_state=43,
)

# Feature rows of each partition (copies, so later column assignments do not
# touch `data`).
train_data = data.loc[data['ObjectID'].isin(train_ids)].copy()
valid_data = data.loc[data['ObjectID'].isin(valid_ids)].copy()

# Matching slices of the label table.
ground_truth_train = ground_truth.loc[ground_truth['ObjectID'].isin(train_ids)].copy()
ground_truth_valid = ground_truth.loc[ground_truth['ObjectID'].isin(valid_ids)].copy()

# Report how many distinct objects ended up in each partition.
for partition_name, partition in (('training', train_data), ('validation', valid_data)):
    print(f'Number of objects in the {partition_name} set:', len(partition['ObjectID'].unique()))

Number of objects in the training set: 1899
Number of objects in the validation set: 1


Next we will make sure that every label, in both the EW and NS directions,
is present in both the training and validation partitions

In [14]:
# Distinct EW / NS labels found in each partition.
train_EW, train_NS = (set(train_data[col].unique()) for col in ('EW', 'NS'))
valid_EW, valid_NS = (set(valid_data[col].unique()) for col in ('EW', 'NS'))

# Labels that occur in the validation partition but not in the training one.
missing_EW = valid_EW - train_EW
missing_NS = valid_NS - train_NS

# EW labels of the training partition that never occur as an NS label;
# None when every EW label is also an NS label.
missing_EW_NS = (train_EW - train_NS) or None

# Report the findings (message wording kept as in the original output).
print("Missing values of EW in test data:", missing_EW)
print("Missing values of NS in test data:", missing_NS)
print("Values of EW not present in NS:", missing_EW_NS)

Missing values of EW in test data: set()
Missing values of NS in test data: set()
Values of EW not present in NS: {'AD-NK'}


In [15]:
# One encoder per direction, mapping the string labels to integer class ids.
le_EW, le_NS = LabelEncoder(), LabelEncoder()

# Fit each encoder on the training labels and store the encoded targets.
for label_col, encoder in (('EW', le_EW), ('NS', le_NS)):
    train_data[f'{label_col}_encoded'] = encoder.fit_transform(train_data[label_col])

In [16]:
# train_data['EW_baseline_heuristic_encoded'] = le_EW.transform(train_data['EW_baseline_heuristic'])
# train_data['NS_baseline_heuristic_encoded'] = le_NS.transform(train_data['NS_baseline_heuristic'])

In [17]:
# valid_data['EW_baseline_heuristic_encoded'] = le_EW.transform(valid_data['EW_baseline_heuristic'])
# valid_data['NS_baseline_heuristic_encoded'] = le_NS.transform(valid_data['NS_baseline_heuristic'])

In [18]:
# Both directions use identically configured CatBoost classifiers.
catboost_params = {'n_estimators': 1000, 'random_state': 42}

# EW classifier, trained on the integer-encoded EW labels.
model_EW = CatBoostClassifier(**catboost_params)
model_EW.fit(train_data[updated_feature_cols], train_data['EW_encoded'])

# NS classifier, trained on the integer-encoded NS labels.
model_NS = CatBoostClassifier(**catboost_params)
model_NS.fit(train_data[updated_feature_cols], train_data['NS_encoded'])

Learning rate set to 0.120982
0:	learn: 1.3070274	total: 12s	remaining: 3h 19m 9s
1:	learn: 1.0036203	total: 23.4s	remaining: 3h 15m
2:	learn: 0.8086418	total: 35s	remaining: 3h 13m 45s
3:	learn: 0.6683892	total: 46.7s	remaining: 3h 13m 42s
4:	learn: 0.5613556	total: 58.3s	remaining: 3h 13m 27s
5:	learn: 0.4764166	total: 1m 9s	remaining: 3h 12m 54s
6:	learn: 0.4077725	total: 1m 21s	remaining: 3h 12m 52s
7:	learn: 0.3513578	total: 1m 33s	remaining: 3h 12m 29s
8:	learn: 0.3041160	total: 1m 44s	remaining: 3h 12m 8s
9:	learn: 0.2643815	total: 1m 55s	remaining: 3h 11m 20s
10:	learn: 0.2305753	total: 2m 7s	remaining: 3h 10m 44s
11:	learn: 0.2015312	total: 2m 18s	remaining: 3h 10m 20s
12:	learn: 0.1765694	total: 2m 29s	remaining: 3h 9m 33s
13:	learn: 0.1550179	total: 2m 40s	remaining: 3h 8m 58s
14:	learn: 0.1364846	total: 2m 52s	remaining: 3h 8m 36s
15:	learn: 0.1202175	total: 3m 3s	remaining: 3h 8m 3s
16:	learn: 0.1061558	total: 3m 14s	remaining: 3h 7m 46s
17:	learn: 0.0936355	total: 3m 25s	

<catboost.core.CatBoostClassifier at 0x1d5ce523e80>

In [None]:
def do_prediction(model,data,thresh):
    """Predict class indices, overriding low-confidence rows.

    Each row is assigned the column index with the highest probability. Rows
    whose highest probability is below ``thresh`` are overwritten with the
    most frequent predicted index (counted before any overriding).

    Parameters
    ----------
    model
        Object exposing ``predict(data, prediction_type='Probability')`` whose
        result can be wrapped in a DataFrame with one row per sample and one
        column per class.
    data
        Feature rows, passed to ``model.predict`` unchanged.
    thresh : float
        Minimum top probability a row needs to keep its own prediction.

    Returns
    -------
    numpy.ndarray
        Column vector of shape ``(n_samples, 1)`` holding the predicted column
        indices of the probability matrix.
    """
    pred_proba = pd.DataFrame(model.predict(data,prediction_type='Probability'))
    # Compute the row-wise maximum once and reuse it for counting and masking.
    top_proba = pred_proba.max(axis=1)
    pred = pred_proba.idxmax(axis=1)
    low_confidence = top_proba < thresh
    # Vectorised count; the builtin sum() would iterate the Series in Python.
    n_cut = int(low_confidence.sum())
    print('Num of ex to cut',n_cut)
    if n_cut:
        # The fallback class is only looked up when at least one row needs it.
        pred.loc[low_confidence] = pred.value_counts().index[0]
    return pred.to_numpy().reshape(-1,1)

In [None]:
# Minimum top-class probability handed to do_prediction for each direction.
threshold_ew = threshold_ns = 0.1

In [None]:
# Restore the two classifiers from trained_model/ (the files are written by the
# save cell at the end of this notebook, so they must exist from an earlier run).
# NOTE: pickle.load can execute arbitrary code; only load files you created
# yourself or otherwise trust.
# Context managers make sure both file handles are closed after loading.
with open('trained_model/model_EW.pkl', 'rb') as model_file:
    model_EW = pickle.load(model_file)
with open('trained_model/model_NS.pkl', 'rb') as model_file:
    model_NS = pickle.load(model_file)

In [None]:
# The 50 features with the highest importance score in the EW model.
ew_importances = pd.DataFrame(model_EW.feature_importances_, index=model_EW.feature_names_)
ew_importances.sort_values(by=0, ascending=False).head(50)

In [None]:
# The 50 features with the highest importance score in the NS model.
ns_importances = pd.DataFrame(model_NS.feature_importances_, index=model_NS.feature_names_)
ns_importances.sort_values(by=0, ascending=False).head(50)

In [None]:
# Predict EW and NS labels for the training rows: thresholded class indices
# from do_prediction are mapped back to their string labels.
for direction, clf, encoder, thresh in (
    ('EW', model_EW, le_EW, threshold_ew),
    ('NS', model_NS, le_NS, threshold_ns),
):
    train_data[f'Predicted_{direction}'] = encoder.inverse_transform(
        #clf.predict(train_data[updated_feature_cols])
        do_prediction(clf, train_data[updated_feature_cols], thresh)
    )

# Replace 'Nothing' predictions with the most recent non-'Nothing' prediction
# in row order.
# NOTE(review): the forward fill is not grouped by ObjectID, so a label can be
# carried from the last rows of one object into the first rows of the next —
# confirm this is intended.
for predicted_col in ('Predicted_EW', 'Predicted_NS'):
    is_nothing = train_data[predicted_col] == 'Nothing'
    train_data[predicted_col] = train_data[predicted_col].mask(is_nothing).ffill()

# # Print the first few rows of the training data with predictions for both EW and NS
# train_data[['TimeIndex', 'ObjectID', 'EW', 
#             'Predicted_EW', 'NS', 'Predicted_NS']].groupby('ObjectID').head(3)

In [None]:
# Convert the per-row predictions to the evaluator's input format and force the
# node at TimeIndex 0 of every object to 'SS'.
train_results = utils.convert_classifier_output(train_data)
train_results.loc[train_results['TimeIndex'] == 0, 'Node'] = 'SS'

# Score the training predictions against the training ground truth.
evaluator = evaluation.NodeDetectionEvaluator(
    ground_truth_train,
    train_results,
    tolerance=config.tolerance,
)
precision, recall, f2, rmse = evaluator.score()
for metric_name, metric_value in (
    ('Precision', precision),
    ('Recall', recall),
    ('F2', f2),
    ('RMSE', rmse),
):
    print(f'{metric_name} for the train set: {metric_value:.2f}')

In [None]:
# Probability thresholds used for the validation predictions below.
threshold_ew = threshold_ns = 0.1

In [None]:
if config.valid_ratio > 0:
    # Predict EW and NS labels for the validation rows: thresholded class
    # indices from do_prediction are mapped back to their string labels.
    for direction, clf, encoder, thresh in (
        ('EW', model_EW, le_EW, threshold_ew),
        ('NS', model_NS, le_NS, threshold_ns),
    ):
        valid_data[f'Predicted_{direction}'] = encoder.inverse_transform(
            #clf.predict(valid_data[updated_feature_cols])
            do_prediction(clf, valid_data[updated_feature_cols], thresh)
        )

    # Replace 'Nothing' predictions with the most recent non-'Nothing'
    # prediction in row order (the fill is not grouped by ObjectID).
    for predicted_col in ('Predicted_EW', 'Predicted_NS'):
        is_nothing = valid_data[predicted_col] == 'Nothing'
        valid_data[predicted_col] = valid_data[predicted_col].mask(is_nothing).ffill()

The `NodeDetectionEvaluator` class in the evaluation module can not only
compute the overall score for a given dataset, but also produce evaluations per
object and even plots that show what the predictions look like on a timeline

In [None]:
if config.valid_ratio > 0:
    # Convert the per-row validation predictions to the evaluator's input
    # format and force the node at TimeIndex 0 of every object to 'SS'.
    valid_results = utils.convert_classifier_output(valid_data)
    valid_results.loc[valid_results.TimeIndex==0,'Node'] = 'SS'

    evaluator = evaluation.NodeDetectionEvaluator(ground_truth_valid, 
                                                  valid_results,
                                                  tolerance=config.tolerance)
    # Scoring and reporting stay inside the guard: without a validation split
    # `evaluator` would still be the training evaluator (or undefined on a
    # fresh kernel), and its numbers would be printed as validation results.
    precision, recall, f2, rmse = evaluator.score()
    print(f'Precision for the validation set: {precision:.2f}')
    print(f'Recall for the validation set: {recall:.2f}')
    print(f'F2 for the validation set: {f2:.2f}')
    print(f'RMSE for the validation set: {rmse:.2f}')

In [None]:
# Plot the evaluation timeline for a random ObjectID from the training set
#evaluator.plot(np.random.choice(train_data['ObjectID'].unique()))

In [None]:
# Loop over the Object IDs in the training set, evaluate each object and
# aggregate the true-positive / false-positive / false-negative counts.
#
# `evaluator` is rebound by the validation cell whenever a validation split
# exists, so it may no longer refer to the training data at this point. Build
# an evaluator from the training ground truth and predictions explicitly so the
# object ids iterated here match the data being evaluated.
train_evaluator = evaluation.NodeDetectionEvaluator(ground_truth_train,
                                                    train_results,
                                                    tolerance=config.tolerance)
total_tp = 0
total_fp = 0
total_fn = 0
for oid in train_data['ObjectID'].unique():
    tp, fp, fn, gt_object, p_object = train_evaluator.evaluate(oid)
    total_tp += tp
    total_fp += fp
    total_fn += fn

print(f'Total true positives: {total_tp}')
print(f'Total false positives: {total_fp}')
print(f'Total false negatives: {total_fn}')

In [None]:
# Plot the evaluation timeline for ObjectID 13 (hard-coded, not randomly chosen).
# NOTE(review): `evaluator` is whichever evaluator was assigned most recently —
# confirm that object 13 belongs to the partition it was built from.
evaluator.plot(13)

In [19]:
# Save the trained CatBoost classifiers and their label encoders to disk.
# Create the folder trained_model if it doesn't exist
model_dir = Path('trained_model')
model_dir.mkdir(exist_ok=True)

# Each artifact is written inside a `with` block so the file is flushed and
# closed as soon as the dump finishes.
artifacts = {
    'model_EW': model_EW,
    'model_NS': model_NS,
    'le_EW': le_EW,
    'le_NS': le_NS,
}
for artifact_name, artifact in artifacts.items():
    with open(model_dir / f'{artifact_name}.pkl', 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)