In [2]:
from katabatic.models.ctabganp import CTABGANPAdapter, CTABGANP, preprocess_data, postprocess_data
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier, XGBRegressor
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def preprocessing_raw_df(df):
    df.drop(list(df.filter(regex='Source|Unnamed')), axis=1, inplace=True)
    df.convert_dtypes()
    df['Class of Orbit'] = df['Class of Orbit'].str.upper().astype('category')
    df['Type of Orbit'] = df['Type of Orbit'].str.upper().astype('category')

    # remove extraneous spaces that result in multiple copies of a category
    df['Users'] = df['Users'].str.strip()
    df['Users'] = df['Users'].astype('category')
    df['Purpose'] = df['Purpose'].str.strip()
    df['Purpose'] = df['Purpose'].str.upper().astype('string')
    df['Detailed Purpose'] = df['Detailed Purpose'].str.strip()
    df['Detailed Purpose'] = df['Detailed Purpose'].str.upper().astype('string')

    #1. remove/drop rows with nan in the satellite name
    df = df[ df['Name of Satellite, Alternate Names'].notna() ]

    #2. drop columns with fewer than 5 valid items
    df.dropna( axis='columns', thresh=5, inplace=True)

    #3. correct for extra spaces on category columns
    df['Users']=df['Users'].str.strip()

    #4. ensure numeric columns are of the correct type
    df[['Expected Lifetime (yrs.)','Dry Mass (kg.)', 'Launch Mass (kg.)', 'Eccentricity', 'Inclination (degrees)','Period (minutes)', 'Power (watts)']]=\
        df[['Expected Lifetime (yrs.)','Dry Mass (kg.)', 'Launch Mass (kg.)', 'Eccentricity', 'Inclination (degrees)','Period (minutes)', 'Power (watts)']]\
                                                                                                .apply(pd.to_numeric,errors='coerce')

    ## identify ZERO in the mass or power columns
    isZeroPower_idx = df['Power (watts)'] == 0
    isZeroDryMass_idx = df['Dry Mass (kg.)'] == 0
    isZeroLaunchMass_idx = df['Launch Mass (kg.)'] == 0
    # Set the POWER (Watts) value for NSS-6 to 10000
    df.loc[isZeroPower_idx,'Power (watts)'] = 10000
    # Set the index of the dataframe using the Date of launch column.
    parsed_date_of_launch = pd.to_datetime(df['Date of Launch'], errors='coerce').sort_values()

    # drop not relevant columns
    df = df.drop('Name of Satellite, Alternate Names', axis=1)
    df = df.drop('Current Official Name of Satellite', axis=1)
    df = df.drop('Country/Org of UN Registry', axis=1)
    df = df.drop('Comments', axis=1)
    df = df.drop('Country of Operator/Owner', axis=1)
    df = df.drop('Operator/Owner', axis=1)
    df = df.drop('Contractor', axis=1)
    df = df.drop('Country of Contractor', axis=1)
    df = df.drop('Launch Site', axis=1)
    df = df.drop('Launch Vehicle', axis=1)
    df = df.drop('COSPAR Number', axis=1)
    df = df.drop('Detailed Purpose', axis=1)
    df = df.drop('Date of Launch', axis=1)

    df['Type of Orbit'] = df['Type of Orbit'].fillna("NON-POLAR INCLINED")
    df['Perigee (km)'] = df['Perigee (km)'].fillna("548")
    df['Apogee (km)'] = df['Apogee (km)'].fillna("541")
    df= df.apply(lambda col: col.fillna(0) if col.dtype =='float64' else col)

    labelencoder=preprocessing.LabelEncoder()
    df= df.apply(lambda col: labelencoder.fit_transform(col) if col.dtype =='object' else col)
    df= df.apply(lambda col: labelencoder.fit_transform(col) if col.dtype =='string' else col)
    df= df.apply(lambda col: labelencoder.fit_transform(col) if col.dtype =='category' else col)

    #labelencoder=preprocessing.LabelEncoder()
    #df = df.apply(lambda col: labelencoder.fit_transform(col) if col.dtype =='object' else col)
    return df

data_path = 'data/Satellite/Satellite-Database.csv'
ctabgan_adapter = CTABGANPAdapter(type='continuous', raw_csv_path = data_path)
ctabgan_adapter.load_model(
                 test_ratio = 0.20,
                 categorical_columns = ["Country/Org of UN Registry", "Purpose", "Class of Orbit", "Type of Orbit"], 
                 log_columns = [],
                 mixed_columns= {},
                 general_columns = [],
                 non_categorical_columns = [],
                 integer_columns = ["Longitude of GEO (degrees)", "Eccentricity", "Inclination (degrees)", "Expected Lifetime (yrs.)", "NORAD Number"],
                 problem_type= {"Classification": 'Class of Orbit'},
                 preprocessing_raw_df=preprocessing_raw_df) 

df = ctabgan_adapter.raw_df
x_train = ctabgan_adapter.x_train
y_train = ctabgan_adapter.y_train
x_test = ctabgan_adapter.x_test
y_test = ctabgan_adapter.y_test

---Initialise CTABGANP Model


In [3]:
ctabgan_adapter.fit(epochs=100)

100%|██████████| 100/100 [10:22<00:00,  6.23s/it]

Finished training in 630.5645351409912  seconds.





In [4]:
synthetic_df = ctabgan_adapter.generate(size=1000)
for column in synthetic_df.columns:
    print(column, ":", synthetic_df[column].value_counts())

self.batch_size: 1000
Users : Users
4.001351     1
8.074417     1
12.755258    1
4.002914     1
3.995364     1
            ..
8.046991     1
4.002236     1
7.992208     1
3.999274     1
3.998487     1
Name: count, Length: 7560, dtype: int64
Purpose : Purpose
0     3965
4     1280
17     804
22     531
27     514
26     102
11      54
10      40
21      35
25      31
18      30
29      29
28      20
1       19
13      16
14      12
7        9
6        8
2        8
16       8
12       7
15       6
24       6
19       5
5        4
8        4
23       4
20       3
9        3
3        3
Name: count, dtype: int64
Type of Orbit : Type of Orbit
5    3142
6    2380
8    1855
3      70
2      48
4      20
1      18
7      13
9      13
0       1
Name: count, dtype: int64
Longitude of GEO (degrees) : Longitude of GEO (degrees)
 0      7393
 1         9
 127       4
 106       4
 98        4
        ... 
-47        1
-116       1
 41        1
 93        1
 13        1
Name: count, Length: 109, dtyp

In [5]:
x_sync_train = synthetic_df.drop(synthetic_df.columns[-1],axis=1).values
y_sync_train = synthetic_df.iloc[ :, -1:].values

In [6]:
y_sync_train = LabelEncoder().fit_transform(y_sync_train)
models = {
    "Logistic Regression": LogisticRegression(max_iter=300, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "MLP Classifier": MLPClassifier(max_iter=300, random_state=42),
    "XGB Classifier": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

results = {}
for name, model in models.items():
    model.fit(x_sync_train, y_sync_train)
    y_pred = model.predict(x_test)
    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average='macro', zero_division=0),
        "Recall": recall_score(y_test, y_pred, average='macro', zero_division=0),
        "F1 Score": f1_score(y_test, y_pred, average='macro', zero_division=0)
    }
results_df = pd.DataFrame(results).T
print("Classifier Evaluation on Syn Data:")
print(results_df)

Classifier Evaluation on Syn Data:
                     Accuracy  Precision    Recall  F1 Score
Logistic Regression  0.957011   0.542608  0.522609  0.523871
Random Forest        0.988757   0.981701  0.752762  0.792139
MLP Classifier       0.900132   0.478547  0.266949  0.270539
XGB Classifier       0.988095   0.870297  0.773226  0.810059
