In [1]:
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and pandas warnings; consider
# narrowing the filter to the specific categories that are known noise.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [4]:
# Read the input CSV (the path is relative to the current working directory).
file_path = "test_data.csv"
df_rf = pd.read_csv(file_path)

In [5]:
# Preview the first rows of the loaded table.
df_rf.head()

Unnamed: 0,spkid,full_name,pdes,name,prefix,neo,pha,H,diameter(km),extent(km),...,sigma_ma,sigma_ad,sigma_n,sigma_tp,sigma_per,class,data_arc,n_obs_used,condition_code,rms(arcsec)
0,1000004,85P/Boethin,85P,Boethin,P,Y,,,,,...,0.43,0.035,0.00048,8.4,25.0,JFc,4042.0,51,8.0,1.6569
1,1000009,23P/Brorsen-Metcalf,23P,Brorsen-Metcalf,P,Y,,,,,...,1e-05,0.00083,5.2e-07,2.8e-05,0.95,HTC,51904.0,223,0.0,1.2347
2,1000012,67P/Churyumov-Gerasimenko,67P,Churyumov-Gerasimenko,P,Y,,,3.4,,...,3e-06,8e-08,3.2e-09,2.1e-05,5e-05,JFc,3547.0,5605,0.0,0.57366
3,1000016,27P/Crommelin,27P,Crommelin,P,Y,,,,,...,0.076,0.043,0.00013,0.0047,38.0,HTC,259.0,154,6.0,0.70281
4,1000020,72P/Denning-Fujikawa,72P,Denning-Fujikawa,P,Y,,,,,...,1.1e-05,1.5e-06,3.1e-08,8.2e-05,0.00095,JFc,13080.0,44,1.0,1.0399


In [6]:
# Keep only rows that have a `pha` value; rows where it is missing are dropped.
df_rf = df_rf.dropna(subset=['pha'])
df_rf.head()

Unnamed: 0,spkid,full_name,pdes,name,prefix,neo,pha,H,diameter(km),extent(km),...,sigma_ma,sigma_ad,sigma_n,sigma_tp,sigma_per,class,data_arc,n_obs_used,condition_code,rms(arcsec)
192,2000433,433 Eros (A898 PA),433,Eros,,Y,N,10.43,16.84,34.4x11.2x11.2,...,1e-06,1.9e-10,9.1e-11,3e-06,1e-07,AMO,46582.0,9130,0.0,0.29796
193,2000719,719 Albert (A911 TB),719,Albert,,Y,N,15.51,,,...,3e-06,3.9e-09,3.3e-10,1.2e-05,2.3e-06,AMO,40008.0,1894,0.0,0.39775
194,2000887,887 Alinda (A918 AA),887,Alinda,,Y,N,13.87,4.2,,...,4e-06,4.7e-09,4.6e-10,1.4e-05,2.6e-06,AMO,37799.0,2624,0.0,0.39776
195,2001036,1036 Ganymed (A924 UB),1036,Ganymed,,Y,N,9.25,37.675,,...,2e-06,2.9e-09,2.4e-10,6e-06,1.7e-06,AMO,35482.0,7159,0.0,0.37075
196,2001221,1221 Amor (1932 EA1),1221,Amor,,Y,N,17.39,1.0,,...,8e-06,1.3e-09,2.7e-10,2.1e-05,7e-07,AMO,32775.0,573,0.0,0.45902


In [7]:
# Count rows per `pha` value.
df_rf.value_counts(['pha'])

pha
N      25614
Y       2213
dtype: int64

In [8]:
# Count rows per `neo` value.
df_rf.value_counts(['neo'])

neo
Y      27827
dtype: int64

In [9]:
# transform pha to 1 or 0
def change(type):
    """Encode a `pha` flag as an integer.

    Returns 1 when the value is exactly the string "Y" and 0 for any other
    value (including "N", lowercase "y", None, or numbers).
    """
    # NOTE: the parameter name shadows the built-in `type`; it is kept
    # unchanged so existing callers are unaffected.
    return 1 if type == "Y" else 0

In [10]:
# Encode `pha` as integers (1 for "Y", 0 for anything else).
# Only encode while the column is still non-numeric: change() maps every value
# that is not the string "Y" to 0, so re-running this cell on an already
# encoded column would silently turn every 1 into 0.
if not pd.api.types.is_numeric_dtype(df_rf["pha"]):
    df_rf["pha"] = df_rf["pha"].apply(change)
df_rf.head()

Unnamed: 0,spkid,full_name,pdes,name,prefix,neo,pha,H,diameter(km),extent(km),...,sigma_ma,sigma_ad,sigma_n,sigma_tp,sigma_per,class,data_arc,n_obs_used,condition_code,rms(arcsec)
192,2000433,433 Eros (A898 PA),433,Eros,,Y,0,10.43,16.84,34.4x11.2x11.2,...,1e-06,1.9e-10,9.1e-11,3e-06,1e-07,AMO,46582.0,9130,0.0,0.29796
193,2000719,719 Albert (A911 TB),719,Albert,,Y,0,15.51,,,...,3e-06,3.9e-09,3.3e-10,1.2e-05,2.3e-06,AMO,40008.0,1894,0.0,0.39775
194,2000887,887 Alinda (A918 AA),887,Alinda,,Y,0,13.87,4.2,,...,4e-06,4.7e-09,4.6e-10,1.4e-05,2.6e-06,AMO,37799.0,2624,0.0,0.39776
195,2001036,1036 Ganymed (A924 UB),1036,Ganymed,,Y,0,9.25,37.675,,...,2e-06,2.9e-09,2.4e-10,6e-06,1.7e-06,AMO,35482.0,7159,0.0,0.37075
196,2001221,1221 Amor (1932 EA1),1221,Amor,,Y,0,17.39,1.0,,...,8e-06,1.3e-09,2.7e-10,2.1e-05,7e-07,AMO,32775.0,573,0.0,0.45902


In [31]:
# List the column names of the frame.
df_rf.columns

Index(['spkid', 'full_name', 'pdes', 'name', 'prefix', 'neo', 'pha', 'H',
       'diameter(km)', 'extent(km)', 'albedo', 'rot_per', 'eccentricity',
       'semi_major_axis(au)', 'q(au)', 'inclination(deg)',
       'long_ascending_node(deg)', 'argument_perihelion(deg)',
       'mean_anomaly(deg)', 'aphelion_distance(au)', 'mean_motion(dge/d)',
       'per(d)', 'per.y(year)', 'moid(au)', 'moid.ld(LD)', 'moid_jup(au)',
       't_jup', 'sigma_e', 'sigma_a', 'sigma_q', 'sigma_i', 'sigma_om',
       'sigma_w', 'sigma_ma', 'sigma_ad', 'sigma_n', 'sigma_tp', 'sigma_per',
       'class', 'data_arc', 'n_obs_used', 'condition_code', 'rms(arcsec)'],
      dtype='object')

In [32]:
# Drop the columns that are not needed before splitting the frame into the
# column groups that are written to separate CSV files: the text identifier
# columns (pdes, full_name, name, prefix) and `neo`, which is "Y" for every
# remaining row.
df = df_rf.drop(columns=['pdes', 'full_name', 'name', 'prefix', 'neo'])
df

Unnamed: 0,spkid,pha,H,diameter(km),extent(km),albedo,rot_per,eccentricity,semi_major_axis(au),q(au),...,sigma_ma,sigma_ad,sigma_n,sigma_tp,sigma_per,class,data_arc,n_obs_used,condition_code,rms(arcsec)
192,2000433,0,10.43,16.840,34.4x11.2x11.2,0.250,5.270,0.2227,1.4580,1.133,...,0.000001,1.900000e-10,9.100000e-11,0.000003,1.000000e-07,AMO,46582.0,9130,0.0,0.29796
193,2000719,0,15.51,,,,5.801,0.5470,2.6380,1.195,...,0.000003,3.900000e-09,3.300000e-10,0.000012,2.300000e-06,AMO,40008.0,1894,0.0,0.39775
194,2000887,0,13.87,4.200,,0.310,28.410,0.5705,2.4730,1.062,...,0.000004,4.700000e-09,4.600000e-10,0.000014,2.600000e-06,AMO,37799.0,2624,0.0,0.39776
195,2001036,0,9.25,37.675,,0.238,10.297,0.5331,2.6660,1.245,...,0.000002,2.900000e-09,2.400000e-10,0.000006,1.700000e-06,AMO,35482.0,7159,0.0,0.37075
196,2001221,0,17.39,1.000,,,,0.4358,1.9190,1.082,...,0.000008,1.300000e-09,2.700000e-10,0.000021,7.000000e-07,AMO,32775.0,573,0.0,0.45902
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28047,54235471,1,20.70,,,,,0.5464,1.7620,0.799,...,0.510000,2.900000e-02,6.600000e-03,0.140000,1.300000e+01,APO,2.0,52,9.0,0.35793
28048,54235472,0,25.50,,,,,0.3402,0.8315,0.549,...,0.400000,1.600000e-03,2.800000e-03,0.250000,6.000000e-01,ATE,1.0,27,8.0,0.49332
28049,54235473,0,23.30,,,,,0.5449,1.4670,0.668,...,0.470000,1.700000e-02,6.400000e-03,0.180000,7.500000e+00,APO,1.0,27,9.0,0.20312
28050,54235474,0,24.51,,,,,0.6016,2.2050,0.878,...,0.160000,3.100000e-02,4.000000e-03,0.056000,1.600000e+01,APO,1.0,24,9.0,0.47646


In [33]:
# Column groups; every group starts with the same two key columns
# (`spkid` and `pha`) followed by its own set of columns.
_key_columns = ['spkid', 'pha']

# g1: H, diameter, extent, albedo and rot_per columns.
g1 = _key_columns + [
    'H', 'diameter(km)', 'extent(km)', 'albedo', 'rot_per',
]

# g2: eccentricity through aphelion_distance columns.
g2 = _key_columns + [
    'eccentricity', 'semi_major_axis(au)', 'q(au)', 'inclination(deg)',
    'long_ascending_node(deg)', 'argument_perihelion(deg)',
    'mean_anomaly(deg)', 'aphelion_distance(au)',
]

# g3: the sigma_* columns.
g3 = _key_columns + [
    'sigma_e', 'sigma_a', 'sigma_q', 'sigma_i', 'sigma_om', 'sigma_w',
    'sigma_ma', 'sigma_ad', 'sigma_n', 'sigma_tp', 'sigma_per',
]

Diameter, extent, albedo, and rot_per have too many NaN values (see the null counts below), so ds1 will not be used in RF.

In [34]:
# Slice the working frame into one sub-frame per column group.
ds1, ds2, ds3 = (df[columns] for columns in (g1, g2, g3))




In [39]:
# Count missing values per column in ds1.
ds1.isnull().sum()

spkid               0
pha                 0
H                   4
diameter(km)    26579
extent(km)      27818
albedo          26624
rot_per         25945
dtype: int64

In [None]:
# ds1 is deliberately left untouched: several of its columns are almost
# entirely NaN (see the null counts above), so dropna() would discard nearly
# every row.
#ds1 = ds1.dropna()
# Drop rows with any missing value from the two groups that are kept.
ds2 = ds2.dropna()
ds3 = ds3.dropna()

In [35]:
# Print dtypes and non-null counts for each column group in turn.
for group_frame in (ds1, ds2, ds3):
    group_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27827 entries, 192 to 28051
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   spkid         27827 non-null  int64  
 1   pha           27827 non-null  int64  
 2   H             27823 non-null  float64
 3   diameter(km)  1248 non-null   float64
 4   extent(km)    9 non-null      object 
 5   albedo        1203 non-null   float64
 6   rot_per       1882 non-null   float64
dtypes: float64(4), int64(2), object(1)
memory usage: 1.7+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 27827 entries, 192 to 28051
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   spkid                     27827 non-null  int64  
 1   pha                       27827 non-null  int64  
 2   eccentricity              27827 non-null  float64
 3   semi_major_axis(au)       27827 non-null  float64


In [40]:
# Write each cleaned group to its own CSV, omitting the index column.
# ds1 is not written out (it is excluded from the RF step).
for file_path, cleaned in (("ds2_cleaned.csv", ds2), ("ds3_cleaned.csv", ds3)):
    cleaned.to_csv(file_path, index=False)