IMPORTING REQUIRED LIBRARIES & MODULES

In [1]:

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import joblib

LOADING THE DATASET

In [2]:
data= pd.read_csv(r'C:\Users\anura\OneDrive\Desktop\-\Space Debris Prediction Model\Dataset\CSV FILE\Total_objects_dataset_Uncleaned_.csv')

In [3]:
data.head(25)

Unnamed: 0,OBJECT_ID,OBJECT_NAME,NORAD_CAT_ID,COUNTRY,PERIOD,INCLINATION,APOGEE,PERIGEE,RCS_SIZE,LAUNCH,COMMENT
0,1998-067WQ,OBJECT WQ,59597,TBD,92.71,51.64,410.0,404.0,,11/20/1998,
1,1998-067WP,OBJECT WP,59596,TBD,92.71,51.64,410.0,404.0,,11/20/1998,
2,2024-078E,CZ-2F DEB,59595,PRC,90.33,41.3,398.0,183.0,,4/25/2024,
3,2024-078D,CZ-2F DEB,59594,PRC,91.27,41.57,487.0,187.0,,4/25/2024,
4,2024-078C,CZ-2F DEB,59593,PRC,91.75,41.61,530.0,191.0,,4/25/2024,
5,2024-078B,CZ-2F R/B,59592,PRC,89.72,41.47,326.0,195.0,LARGE,4/25/2024,
6,2024-078A,SZ-18,59591,PRC,92.15,41.47,383.0,377.0,,4/25/2024,
7,2024-077D,ELECTRON KICK STAGE R/B,59590,US,99.54,97.39,1012.0,461.0,MEDIUM,4/23/2024,
8,2024-077C,ELECTRON R/B,59589,US,89.72,97.39,364.0,158.0,MEDIUM,4/23/2024,
9,2024-077B,ASC 3,59588,US,105.29,97.41,1023.0,993.0,MEDIUM,4/23/2024,


DROPPING NON-REQUIRED COLUMNS/FEATURES

In [4]:
data = data.drop('COMMENT',axis=1)
data = data.drop('OBJECT_ID',axis=1)

IDENTIFY & SEPARATE CATEGORICAL & NUMERICAL FEATURES

In [5]:
data.head()

Unnamed: 0,OBJECT_NAME,NORAD_CAT_ID,COUNTRY,PERIOD,INCLINATION,APOGEE,PERIGEE,RCS_SIZE,LAUNCH
0,OBJECT WQ,59597,TBD,92.71,51.64,410.0,404.0,,11/20/1998
1,OBJECT WP,59596,TBD,92.71,51.64,410.0,404.0,,11/20/1998
2,CZ-2F DEB,59595,PRC,90.33,41.3,398.0,183.0,,4/25/2024
3,CZ-2F DEB,59594,PRC,91.27,41.57,487.0,187.0,,4/25/2024
4,CZ-2F DEB,59593,PRC,91.75,41.61,530.0,191.0,,4/25/2024


In [6]:
# Identify categorical and numerical columns
categorical_features = ['OBJECT_NAME','NORAD_CAT_ID','COUNTRY']  
numerical_features = ['PERIOD', 'INCLINATION', 'APOGEE', 'PERIGEE'] 

# Target column
target = 'RCS_SIZE'

PREPROCESSING THE FEATURES 

In [7]:
# Define the preprocessing for numerical features (impute missing values with KNN)
numerical_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler())
])

In [8]:
# Define the preprocessing for categorical features (impute missing values with most frequent and then encode)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


In [9]:
# Combine preprocessing for numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

LABEL ENCODING

In [10]:
data.head()

Unnamed: 0,OBJECT_NAME,NORAD_CAT_ID,COUNTRY,PERIOD,INCLINATION,APOGEE,PERIGEE,RCS_SIZE,LAUNCH
0,OBJECT WQ,59597,TBD,92.71,51.64,410.0,404.0,,11/20/1998
1,OBJECT WP,59596,TBD,92.71,51.64,410.0,404.0,,11/20/1998
2,CZ-2F DEB,59595,PRC,90.33,41.3,398.0,183.0,,4/25/2024
3,CZ-2F DEB,59594,PRC,91.27,41.57,487.0,187.0,,4/25/2024
4,CZ-2F DEB,59593,PRC,91.75,41.61,530.0,191.0,,4/25/2024


In [11]:
# Initialize the label encoder
label_encoder = LabelEncoder()

# Fit and transform the RCS_SIZE column
data['RCS_SIZE'] = label_encoder.fit_transform(data['RCS_SIZE'].astype(str))

# Display the updated data
print("\nData after converting RCS_SIZE to numerical values:")
print(data.head())


Data after converting RCS_SIZE to numerical values:
  OBJECT_NAME  NORAD_CAT_ID COUNTRY  PERIOD  INCLINATION  APOGEE  PERIGEE  \
0   OBJECT WQ         59597     TBD   92.71        51.64   410.0    404.0   
1   OBJECT WP         59596     TBD   92.71        51.64   410.0    404.0   
2   CZ-2F DEB         59595     PRC   90.33        41.30   398.0    183.0   
3   CZ-2F DEB         59594     PRC   91.27        41.57   487.0    187.0   
4   CZ-2F DEB         59593     PRC   91.75        41.61   530.0    191.0   

   RCS_SIZE      LAUNCH  
0         3  11/20/1998  
1         3  11/20/1998  
2         3   4/25/2024  
3         3   4/25/2024  
4         3   4/25/2024  


In [12]:

# Define the columns used for grouping
grouping_features = categorical_features + numerical_features

# Replace NaNs in numerical features to allow grouping without errors
data[numerical_features] = data[numerical_features].fillna(-999)

# Function to fill 0 values in `RCS_SIZE` with corresponding median
def fill_zeros_with_median(group):
    median_value = group[group['RCS_SIZE'] != 0]['RCS_SIZE'].median()
    group['RCS_SIZE'] = group['RCS_SIZE'].replace(0, median_value)
    return group

# Apply the function to each group defined by the combination of other features
data = data.groupby(grouping_features).apply(fill_zeros_with_median)

# Replace the placeholder -999 with NaN back in numerical features
data[numerical_features] = data[numerical_features].replace(-999, np.nan)

# Display the updated data
print("\nData after filling 0 values in RCS_SIZE with corresponding medians:")
data.head()


Data after filling 0 values in RCS_SIZE with corresponding medians:


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,OBJECT_NAME,NORAD_CAT_ID,COUNTRY,PERIOD,INCLINATION,APOGEE,PERIGEE,RCS_SIZE,LAUNCH
OBJECT_NAME,NORAD_CAT_ID,COUNTRY,PERIOD,INCLINATION,APOGEE,PERIGEE,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
3B5GSAT,47961,UK,92.75,97.47,412.0,406.0,8438,3B5GSAT,47961,UK,92.75,97.47,412.0,406.0,2.0,3/22/2021
3CAT-5/A (TYVAK-0161),46292,SPN,94.7,97.35,507.0,502.0,9584,3CAT-5/A (TYVAK-0161),46292,SPN,94.7,97.35,507.0,502.0,1.0,09-03-20
3CAT-5/B (TYVAK-0162),46293,SPN,94.69,97.35,506.0,501.0,9583,3CAT-5/B (TYVAK-0162),46293,SPN,94.69,97.35,506.0,501.0,1.0,09-03-20
A-1 (ASTERIX),1778,FR,106.8,34.26,1634.0,523.0,28357,A-1 (ASTERIX),1778,FR,106.8,34.26,1634.0,523.0,1.0,11/26/1965
A-SEANSAT-PG1,57197,MALA,95.41,97.62,546.0,531.0,2247,A-SEANSAT-PG1,57197,MALA,95.41,97.62,546.0,531.0,1.0,6/27/2023


REMOVING ANY DUPLICATE COLUMNS

In [13]:
# Check for and drop any duplicated columns
data = data.loc[:, ~data.columns.duplicated()]

# Display the updated data
print("\nData after ensuring no duplicate columns:")
print(data.head())
print("\nColumns in the dataset:")
print(data.columns)


Data after ensuring no duplicate columns:
                                                                                              OBJECT_NAME  \
OBJECT_NAME           NORAD_CAT_ID COUNTRY PERIOD INCLINATION APOGEE PERIGEE                                
3B5GSAT               47961        UK      92.75  97.47       412.0  406.0   8438                 3B5GSAT   
3CAT-5/A (TYVAK-0161) 46292        SPN     94.70  97.35       507.0  502.0   9584   3CAT-5/A (TYVAK-0161)   
3CAT-5/B (TYVAK-0162) 46293        SPN     94.69  97.35       506.0  501.0   9583   3CAT-5/B (TYVAK-0162)   
A-1 (ASTERIX)         1778         FR      106.80 34.26       1634.0 523.0   28357          A-1 (ASTERIX)   
A-SEANSAT-PG1         57197        MALA    95.41  97.62       546.0  531.0   2247           A-SEANSAT-PG1   

                                                                                    NORAD_CAT_ID  \
OBJECT_NAME           NORAD_CAT_ID COUNTRY PERIOD INCLINATION APOGEE PERIGEE                 

In [14]:
print(data.head)

<bound method NDFrame.head of                                                                                               OBJECT_NAME  \
OBJECT_NAME           NORAD_CAT_ID COUNTRY PERIOD INCLINATION APOGEE PERIGEE                                
3B5GSAT               47961        UK      92.75  97.47       412.0  406.0   8438                 3B5GSAT   
3CAT-5/A (TYVAK-0161) 46292        SPN     94.70  97.35       507.0  502.0   9584   3CAT-5/A (TYVAK-0161)   
3CAT-5/B (TYVAK-0162) 46293        SPN     94.69  97.35       506.0  501.0   9583   3CAT-5/B (TYVAK-0162)   
A-1 (ASTERIX)         1778         FR      106.80 34.26       1634.0 523.0   28357          A-1 (ASTERIX)   
A-SEANSAT-PG1         57197        MALA    95.41  97.62       546.0  531.0   2247           A-SEANSAT-PG1   
...                                                                                                   ...   
ZY 1                  38038        PRC     100.23 98.50       775.0  764.0   15087                

TRAIN A RANDOM FOREST REGRESSOR 

In [15]:
# Ensure there are no NaN values in the target column
data = data.dropna(subset=[target])

In [16]:
# Separate data into features (X) and target (y)
X = data.drop(columns=[target])
y = data[target]

In [17]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
# Create the pipeline with preprocessing and the Random Forest Regressor
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("\nModel Mean Squared Error:", mse)



Model Mean Squared Error: 0.13341311572700296
