In [1]:
import pandas as pd
import seaborn as sns
import gc
import matplotlib.pyplot as plt
import numpy as np
import polars as pl
import statsmodels.api as sm
import scipy
import plotly.express as px
from scipy.fft import fft
from scipy.signal import lombscargle
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from pprint import pprint
from scipy import stats
import random
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from scipy.stats import ttest_ind
from matplotlib.pyplot import tick_params
from sklearn.impute import SimpleImputer
import xgboost as xgb
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


pl.disable_string_cache()
pl.Config.set_streaming_chunk_size(10000)

polars.config.Config

In [2]:
# Loading the dataset (mmt.parquet)
columns = [ "Epoch", "range_km", "Mag", "sat_j2000", "obs_j2000", "az_rad", "el_rad", 'phase_angle_rad', 'Channel','Filter', "Track", "epsecs", "Satellite"]

N = 300000000
lf = pl.scan_parquet("mmt.parquet").limit(n=N).select(columns)
print(lf.collect_schema())

Schema({'Epoch': Datetime(time_unit='ms', time_zone='UTC'), 'range_km': Float32, 'Mag': Float32, 'sat_j2000': Array(Float32, shape=(3,)), 'obs_j2000': Array(Float32, shape=(3,)), 'az_rad': Float32, 'el_rad': Float32, 'phase_angle_rad': Float32, 'Channel': UInt8, 'Filter': String, 'Track': UInt32, 'epsecs': Float32, 'Satellite': UInt32})


In [3]:
# Sampling the dataset
sample_rate = 0.01
lf = lf.with_row_index("row_num")

sampled_lf = lf.filter(pl.col("row_num") % int(1/sample_rate) == 0)
sampled_df = sampled_lf.collect()

print(sampled_df.shape)
sampled_df.describe()
sampled_df = sampled_df.to_pandas()

(2786335, 14)


In [4]:
sampled_df_V1 = sampled_df.drop(['Epoch', 'sat_j2000', 'obs_j2000', 'Filter'], axis=1)
X = sampled_df_V1.drop('Mag', axis=1)
y = sampled_df_V1['Mag']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [6]:
params = {
    'objective': 'reg:squarederror',
    'max_depth': 6,
    'eta': 0.3,
    'eval_metric': 'rmse'
}


num_round = 100
bst = xgb.train(params, dtrain, num_round)

In [7]:
preds = bst.predict(dtest)

# Calculate RMSE
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)
print(f'RMSE: {rmse}')

print(preds[:10])

RMSE: 0.8344862631575131
[7.2897944 6.0398784 7.7613125 6.8814545 8.07573   8.201612  8.299473
 7.702862  8.295625  7.220976 ]


In [8]:
# Calculate RMSE
mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(mse)
print(f'RMSE: {rmse}')

mag_range = y.max() - y.min()
print(f'Range of Mag: {mag_range}')

rmse_percentage = (rmse / mag_range) * 100
print(f'RMSE as a percentage of the range of Mag: {rmse_percentage:.3f}%')

RMSE: 0.8344862631575131
Range of Mag: 13.680879592895508
RMSE as a percentage of the range of Mag: 6.100%


# Conclusion
The RMSE is 0.8344 and the range is 13.6 which means the RMSE is 6.1% of the range of the target variable 'Mag'. This indicates that the model is making predictions with a reasonable level of accuracy.

#### Though Channel and Track did not have significant correlation with Mag, we can still try to predict the Track using the other features.

In [9]:
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.preprocessing import LabelEncoder

X_channel = sampled_df_V1.drop('Channel', axis=1)
y_channel = sampled_df_V1['Channel']

label_encoder = LabelEncoder()
y_channel_encoded = label_encoder.fit_transform(y_channel)

X_train_channel, X_test_channel, y_train_channel, y_test_channel = train_test_split(
    X_channel, y_channel_encoded, test_size=0.2, random_state=42
)

dtrain_channel = xgb.DMatrix(X_train_channel, label=y_train_channel)
dtest_channel = xgb.DMatrix(X_test_channel, label=y_test_channel)

params_channel = {
    'objective': 'multi:softmax',  # Multi-class classification
    'num_class': len(np.unique(y_channel_encoded)),  # Number of classes
    'max_depth': 6,
    'eta': 0.3,
    'eval_metric': 'merror'
}

num_round = 100
bst_channel = xgb.train(params_channel, dtrain_channel, num_round)

preds_channel = bst_channel.predict(dtest_channel).astype(int)

accuracy_channel = accuracy_score(y_test_channel, preds_channel)
print(f'Accuracy for Channel prediction: {accuracy_channel:.4f}')


Accuracy for Channel prediction: 0.3955


In [10]:
rmse_channel = np.sqrt(mean_squared_error(y_test_channel, preds_channel))
print(f'RMSE for Channel prediction: {rmse_channel:.4f}')

RMSE for Channel prediction: 2.9146
