In [268]:
from katabatic.models.TableGAN import TableGANAdapter, TableGAN, preprocess_data, postprocess_data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier, XGBRegressor
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Initialize the adapter with a specific privacy setting
tablegan_adapter = TableGANAdapter(type='continuous', privacy_setting='high')
data_path = 'data/Satellite/Satellite-Database.csv'
UCSDB = pd.read_csv(data_path, encoding='utf-8')
UCSDB.drop(list(UCSDB.filter(regex='Source|Unnamed')), axis=1, inplace=True)
UCSDB.convert_dtypes()
UCSDB.info()

print(df[:10])
#labelencoder=preprocessing.LabelEncoder()
#df= df.apply(lambda col: labelencoder.fit_transform(col) if col.dtype =='object' else col)
#print(df[:10])
UCSDB['Class of Orbit'] = UCSDB['Class of Orbit'].str.upper().astype('category')
UCSDB['Type of Orbit'] = UCSDB['Type of Orbit'].str.upper().astype('category')
UCSDB[['Class of Orbit', 'Type of Orbit']].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7562 entries, 0 to 7561
Data columns (total 28 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Name of Satellite, Alternate Names  7560 non-null   object 
 1   Current Official Name of Satellite  7560 non-null   object 
 2   Country/Org of UN Registry          7559 non-null   object 
 3   Country of Operator/Owner           7560 non-null   object 
 4   Operator/Owner                      7560 non-null   object 
 5   Users                               7560 non-null   object 
 6   Purpose                             7560 non-null   object 
 7   Detailed Purpose                    1254 non-null   object 
 8   Class of Orbit                      7560 non-null   object 
 9   Type of Orbit                       6909 non-null   object 
 10  Longitude of GEO (degrees)          7557 non-null   float64
 11  Perigee (km)                        7553 no

Class of Orbit  Type of Orbit             
LEO             NON-POLAR INCLINED            3942
                SUN-SYNCHRONOUS               1688
                POLAR                         1096
MEO             NON-POLAR INCLINED              98
ELLIPTICAL      MOLNIYA                         23
MEO             EQUATORIAL                      20
LEO             EQUATORIAL                      18
ELLIPTICAL      DEEP HIGHLY ECCENTRIC            9
LEO             ELLIPTICAL                       5
ELLIPTICAL      SUN-SYNCHRONOUS                  4
                NON-POLAR INCLINED               2
LEO             SUN-SYNCHRONOUS NEAR POLAR       2
ELLIPTICAL      CISLUNAR                         1
LEO             RETROGRADE                       1
Name: count, dtype: int64

In [269]:
# remove extraneous spaces that result in multiple copies of a category
print(' Users. Labels before cleaning : {}'.format(len( UCSDB['Users'].unique() ) ) )
print(' Purpose. Labels before cleaning : {}'.format(len( UCSDB['Purpose'].unique() ) ) )  

UCSDB['Users']=UCSDB['Users'].str.strip()
UCSDB['Purpose']=UCSDB['Purpose'].str.strip()

print(' Users. Labels AFTER cleaning : {}'.format(len( UCSDB['Users'].unique() ) ) )
print(' Purpose. Labels AFTER cleaning : {}'.format(len( UCSDB['Purpose'].unique() ) ) ) 

 Users. Labels before cleaning : 21
 Purpose. Labels before cleaning : 32
 Users. Labels AFTER cleaning : 18
 Purpose. Labels AFTER cleaning : 31


In [270]:
# remove extraneous spaces that result in multiple copies of a category
print(' Users. Labels before cleaning : {}'.format(len( UCSDB['Users'].unique() ) ) )
print(' Purpose. Labels before cleaning : {}'.format(len( UCSDB['Purpose'].unique() ) ) )  

UCSDB['Users']=UCSDB['Users'].str.strip()
UCSDB['Purpose']=UCSDB['Purpose'].str.strip()

print(' Users. Labels AFTER cleaning : {}'.format(len( UCSDB['Users'].unique() ) ) )
print(' Purpose. Labels AFTER cleaning : {}'.format(len( UCSDB['Purpose'].unique() ) ) ) 

 Users. Labels before cleaning : 18
 Purpose. Labels before cleaning : 31
 Users. Labels AFTER cleaning : 18
 Purpose. Labels AFTER cleaning : 31


In [271]:
UCSDB['Users'] = UCSDB['Users'].astype('category')
UCSDB['Users'].value_counts()
UCSDB['Purpose']=UCSDB['Purpose'].str.strip()
UCSDB['Purpose'] = UCSDB['Purpose'].str.upper().astype('string')
UCSDB['Purpose'].value_counts()

Purpose
COMMUNICATIONS                                    5514
EARTH OBSERVATION                                 1238
TECHNOLOGY DEVELOPMENT                             372
NAVIGATION/GLOBAL POSITIONING                      142
SPACE SCIENCE                                       99
TECHNOLOGY DEMONSTRATION                            64
EARTH SCIENCE                                       28
SURVEILLANCE                                        20
NAVIGATION/REGIONAL POSITIONING                     13
UNKNOWN                                             10
EARTH OBSERVATION/NAVIGATION                         9
SPACE OBSERVATION                                    9
EARTH OBSERVATION/TECHNOLOGY DEVELOPMENT             7
METEOROLOGICAL                                       6
COMMUNICATIONS/MARITIME TRACKING                     5
COMMUNICATIONS/TECHNOLOGY DEVELOPMENT                4
EARTH/SPACE OBSERVATION                              4
EARTH OBSERVATION/COMMUNICATIONS                     2
MI

In [272]:
UCSDB['Detailed Purpose']=UCSDB['Detailed Purpose'].str.strip()
UCSDB['Detailed Purpose'] = UCSDB['Detailed Purpose'].str.upper().astype('string')
UCSDB['Detailed Purpose'].value_counts()

Detailed Purpose
OPTICAL IMAGING                                          517
ELECTRONIC INTELLIGENCE                                  136
METEOROLOGY, AUTOMATIC IDENTIFICATION SYSTEM (AIS)       136
RADAR IMAGING                                             98
EARTH SCIENCE                                             58
METEOROLOGY                                               51
AUTOMATIC IDENTIFICATION SYSTEM (AIS)                     40
MULTISPECTRAL IMAGING                                     25
HYPERSPECTRAL IMAGING                                     23
EARTH SCIENCE/METEROLOGY                                  18
AMATEUR RADIO                                             15
INTERNET OF THINGS (IOT)                                  15
INFRARED IMAGING                                          14
RADAR IMAGING (SAR)                                       12
MARITIME SURVEILLANCE                                      8
RADAR IMAGING/EARTH SCIENCE                                8
DATA RE

In [273]:
#1. remove/drop rows with nan in the satellite name
UCSDB = UCSDB[ UCSDB['Name of Satellite, Alternate Names'].notna() ]

#2. drop columns with fewer than 5 valid items
UCSDB.dropna( axis='columns', thresh=5, inplace=True)

#3. correct for extra spaces on category columns
UCSDB['Users']=UCSDB['Users'].str.strip()

#4. ensure numeric columns are of the correct type
UCSDB[['Expected Lifetime (yrs.)','Dry Mass (kg.)', 'Launch Mass (kg.)', 'Eccentricity', 'Inclination (degrees)','Period (minutes)', 'Power (watts)']]=\
    UCSDB[['Expected Lifetime (yrs.)','Dry Mass (kg.)', 'Launch Mass (kg.)', 'Eccentricity', 'Inclination (degrees)','Period (minutes)', 'Power (watts)']]\
                                                                                            .apply(pd.to_numeric,errors='coerce')

UCSDB.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7560 entries, 0 to 7559
Data columns (total 28 columns):
 #   Column                              Non-Null Count  Dtype   
---  ------                              --------------  -----   
 0   Name of Satellite, Alternate Names  7560 non-null   object  
 1   Current Official Name of Satellite  7560 non-null   object  
 2   Country/Org of UN Registry          7559 non-null   object  
 3   Country of Operator/Owner           7560 non-null   object  
 4   Operator/Owner                      7560 non-null   object  
 5   Users                               7560 non-null   object  
 6   Purpose                             7560 non-null   string  
 7   Detailed Purpose                    1254 non-null   string  
 8   Class of Orbit                      7560 non-null   category
 9   Type of Orbit                       6909 non-null   category
 10  Longitude of GEO (degrees)          7557 non-null   float64 
 11  Perigee (km)                       

In [274]:
## identify ZERO in the mass or power columns
isZeroPower_idx = UCSDB['Power (watts)'] == 0
isZeroDryMass_idx = UCSDB['Dry Mass (kg.)'] == 0
isZeroLaunchMass_idx = UCSDB['Launch Mass (kg.)'] == 0

print(' Number of entries in columns Power, Launch Mass and Dry Mass set to zero')
print('-------------------------------------------------------------------------')
print(' Power col: {}'.format(isZeroPower_idx.sum()))
print(UCSDB[isZeroPower_idx]['Current Official Name of Satellite'].to_string())
print('-------------------------------------------------------------------------')
print(' Dry Mass col: {}'.format(isZeroDryMass_idx.sum()))
print(' Launch Mass col: {}'.format(isZeroLaunchMass_idx.sum()))

 Number of entries in columns Power, Launch Mass and Dry Mass set to zero
-------------------------------------------------------------------------
 Power col: 1
1865    NSS-6
-------------------------------------------------------------------------
 Dry Mass col: 0
 Launch Mass col: 0


In [275]:
# Set the POWER (Watts) value for NSS-6 to 10000
UCSDB.loc[isZeroPower_idx,'Power (watts)'] = 10000
print(UCSDB[isZeroPower_idx]['Power (watts)'].to_string())

1865    10000.0


In [276]:
# Set the index of the dataframe using the Date of launch column.
parsed_date_of_launch = pd.to_datetime(UCSDB['Date of Launch'], errors='coerce').sort_values()
# determine if there are any data points not correctly formatted
print( parsed_date_of_launch.loc[ np.isnat(parsed_date_of_launch) ] )

2      NaT
3      NaT
5      NaT
7      NaT
8      NaT
        ..
7554   NaT
7555   NaT
7556   NaT
7558   NaT
7559   NaT
Name: Date of Launch, Length: 4940, dtype: datetime64[ns]


In [277]:
UCSDB.loc[ np.isnat(parsed_date_of_launch), ['Name of Satellite, Alternate Names','NORAD Number','Date of Launch'] ]

Unnamed: 0,"Name of Satellite, Alternate Names",NORAD Number,Date of Launch
2,Aalto-1,42775.0,23-06-2017
3,AAt-4,41460.0,25-04-2016
5,ABS-2A,41588.0,15-06-2016
7,"ABS-4 (ABS-2i, MBSat, Mobile Broadcasting Sate...",28184.0,13-03-2004
8,"ABS-6 (ABS-1, LMI-1, Lockheed Martin-Intersput...",25924.0,26-09-1999
...,...,...,...
7554,Zhuhai 1-07 (OHS-4),43443.0,26-04-2018
7555,Ziyuan 1-02C,38038.0,22-12-2011
7556,Ziyuan 1-2D,44528.0,14-09-2019
7558,Ziyuan 3-2,41556.0,29-05-2016


In [278]:
# drop not relevant columns
UCSDB = UCSDB.drop('Name of Satellite, Alternate Names', axis=1)
UCSDB = UCSDB.drop('Current Official Name of Satellite', axis=1)
UCSDB = UCSDB.drop('Country/Org of UN Registry', axis=1)
UCSDB = UCSDB.drop('Comments', axis=1)
UCSDB = UCSDB.drop('Country of Operator/Owner', axis=1)
UCSDB = UCSDB.drop('Operator/Owner', axis=1)
UCSDB = UCSDB.drop('Contractor', axis=1)
UCSDB = UCSDB.drop('Country of Contractor', axis=1)
UCSDB = UCSDB.drop('Launch Site', axis=1)
UCSDB = UCSDB.drop('Launch Vehicle', axis=1)
UCSDB = UCSDB.drop('COSPAR Number', axis=1)
UCSDB = UCSDB.drop('Detailed Purpose', axis=1)
UCSDB = UCSDB.drop('Date of Launch', axis=1)
UCSDB.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7560 entries, 0 to 7559
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   Users                       7560 non-null   object  
 1   Purpose                     7560 non-null   string  
 2   Class of Orbit              7560 non-null   category
 3   Type of Orbit               6909 non-null   category
 4   Longitude of GEO (degrees)  7557 non-null   float64 
 5   Perigee (km)                7553 non-null   object  
 6   Apogee (km)                 7553 non-null   object  
 7   Eccentricity                7549 non-null   float64 
 8   Inclination (degrees)       7556 non-null   float64 
 9   Period (minutes)            7503 non-null   float64 
 10  Launch Mass (kg.)           6483 non-null   float64 
 11  Dry Mass (kg.)              559 non-null    float64 
 12  Power (watts)               228 non-null    float64 
 13  Expected Lifetime (yrs.

In [279]:
UCSDB.iloc[10:]

Unnamed: 0,Users,Purpose,Class of Orbit,Type of Orbit,Longitude of GEO (degrees),Perigee (km),Apogee (km),Eccentricity,Inclination (degrees),Period (minutes),Launch Mass (kg.),Dry Mass (kg.),Power (watts),Expected Lifetime (yrs.),NORAD Number
10,Government,TECHNOLOGY DEVELOPMENT,LEO,SUN-SYNCHRONOUS,0.0,539,599,0.004320,97.50,95.50,9.0,,,,47939.0
11,Government,TECHNOLOGY DEVELOPMENT,LEO,SUN-SYNCHRONOUS,0.0,537,561,0.001730,97.50,95.50,9.0,,,,47943.0
12,Civil,EARTH SCIENCE,LEO,SUN-SYNCHRONOUS,0.0,497,506,0.000655,97.40,95.00,12.0,12.0,,,56187.0
13,Military,EARTH OBSERVATION,GEO,,0.0,35700,35800,0.001190,0.00,1437.60,,,,,47237.0
14,Military,EARTH OBSERVATION,GEO,,-26.0,35560,36013,0.005370,7.72,1436.14,,,,,25336.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7555,Government,EARTH OBSERVATION,LEO,SUN-SYNCHRONOUS,0.0,763,773,0.000700,98.56,100.20,,,,,38038.0
7556,Government,EARTH OBSERVATION,LEO,SUN-SYNCHRONOUS,0.0,748,758,0.000702,98.50,99.80,,,,5.0,44528.0
7557,Government,EARTH OBSERVATION,LEO,SUN-SYNCHRONOUS,0.0,500,504,0.000291,97.50,94.70,,,,4.0,38046.0
7558,Government,EARTH OBSERVATION,LEO,SUN-SYNCHRONOUS,0.0,487,500,0.000947,97.50,94.49,,,,4.0,41556.0


In [280]:
UCSDB['Type of Orbit'] = UCSDB['Type of Orbit'].fillna("NON-POLAR INCLINED")
UCSDB['Perigee (km)'] = UCSDB['Perigee (km)'].fillna("548")
UCSDB['Apogee (km)'] = UCSDB['Apogee (km)'].fillna("541")

In [281]:
UCSDB= UCSDB.apply(lambda col: col.fillna(0) if col.dtype =='float64' else col)
#print(UCSDB.isnull().sum())
print(UCSDB['Class of Orbit'].value_counts())


Class of Orbit
LEO           6768
GEO            590
MEO            143
ELLIPTICAL      59
Name: count, dtype: int64


In [282]:
labelencoder=preprocessing.LabelEncoder()
UCSDB= UCSDB.apply(lambda col: labelencoder.fit_transform(col) if col.dtype =='object' else col)
UCSDB= UCSDB.apply(lambda col: labelencoder.fit_transform(col) if col.dtype =='string' else col)
UCSDB= UCSDB.apply(lambda col: labelencoder.fit_transform(col) if col.dtype =='category' else col)
print(UCSDB[:10])

   Users  Purpose  Class of Orbit  Type of Orbit  Longitude of GEO (degrees)  \
0      4        4               2              5                         0.0   
1      4        4               2              8                         0.0   
2      0       27               2              8                         0.0   
3      0        4               2              8                         0.0   
4      4        0               1              5                        75.0   
5      4        0               1              5                       -75.0   
6      4        0               1              5                        -3.0   
7      4        0               1              5                        75.0   
8      4        0               1              5                       159.0   
9      8       27               2              8                         0.0   

   Perigee (km)  Apogee (km)  Eccentricity  Inclination (degrees)  \
0           550          528      0.001510        

In [283]:
x = UCSDB.copy().drop('Class of Orbit', axis=1)
y = UCSDB['Class of Orbit']

x_train, x_test, y_train, y_test=train_test_split(x, y, test_size=0.2, random_state=42)

In [284]:
tablegan_adapter.fit(x_train, y_train, epochs=100, batch_size=64)

---FIT TableGAN Model with high privacy setting
---Initialise TableGAN Model


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 10/100: [D loss: -0.5960] [G loss: 0.2366] [C loss: 0.0636]
Epoch 20/100: [D loss: -0.4067] [G loss: 0.4017] [C loss: 0.0418]
Epoch 30/100: [D loss: -0.3588] [G loss: 0.5738] [C loss: 0.0284]
Epoch 40/100: [D loss: -0.3393] [G loss: 0.6709] [C loss: 0.0204]
Epoch 50/100: [D loss: -0.3157] [G loss: 0.5736] [C loss: 0.0153]
Epoch 60/100: [D loss: -0.3018] [G loss: 0.5315] [C loss: 0.0108]
Epoch 70/100: [D loss: -0.2781] [G loss: 0.3844] [C loss: 0.0077]
Epoch 80/100: [D loss: -0.2692] [G loss: 0.3063] [C loss: 0.0060]
Epoch 90/100: [D loss: -0.2631] [G loss: 0.3278] [C loss: 0.0043]
Epoch 100/100: [D loss: -0.2604] [G loss: 0.2240] [C loss: 0.0036]


In [285]:
# Generate synthetic data
synthetic_data = tablegan_adapter.generate(size=1000)

---Generate from TableGAN Model


In [286]:
synthetic_df = pd.DataFrame(synthetic_data)
x_sync_train = synthetic_df.drop(synthetic_df.columns[-1],axis=1).values
y_sync_train = synthetic_df.iloc[ :, -1:].values
#print(x_sync_train)
print(synthetic_df.iloc[ :, -1:].value_counts())

14 
2.0    973
0.0     12
1.0     10
3.0      5
Name: count, dtype: int64


In [287]:
# TSTR (train synthetic test real)
tstr_score_lr  = LogisticRegression().fit(x_sync_train, y_sync_train).score(x_test, y_test)
tstr_score_rf  = RandomForestClassifier().fit(x_sync_train, y_sync_train).score(x_test, y_test)
tstr_score_mlp = MLPClassifier().fit(x_sync_train, y_sync_train).score(x_test, y_test)
xgbt_classifier = XGBClassifier(eval_metric='logloss')
tstr_score_xgbt = xgbt_classifier.fit(x_sync_train, y_sync_train).score(x_test, y_test)


# TRTR (train real test real)
trtr_score_lr  = LogisticRegression().fit(x_train, y_train).score(x_test, y_test)
trtr_score_rf  = RandomForestClassifier().fit(x_train, y_train).score(x_test, y_test)
trtr_score_mlp = MLPClassifier().fit(x_train, y_train).score(x_test, y_test)
xgbt_classifier = XGBClassifier(eval_metric='logloss')
trtr_score_xgbt = xgbt_classifier.fit(x_train, y_train).score(x_test, y_test)
df_evaluate = pd.DataFrame([
    ['TSTR', tstr_score_lr, tstr_score_rf, tstr_score_mlp, tstr_score_xgbt],
    ['TRTR', trtr_score_lr,trtr_score_rf,trtr_score_mlp, trtr_score_xgbt]
], columns=['Evaluated Item', 'LR', 'RF', 'MLP', 'XGBT'])
print(df_evaluate)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


  Evaluated Item        LR        RF       MLP      XGBT
0           TSTR  0.889550  0.884259  0.881614  0.943783
1           TRTR  0.982804  0.999339  0.978836  0.999339
