In [0]:
from pyspark.sql import Row
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType
from pyspark.sql.types import FloatType
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf
from pyspark.sql import functions as F
from pyspark.sql.functions import explode, col, udf, mean as _mean, stddev as _stddev, log, log10
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.functions import lit

#additional imports
from pyspark.sql.functions import when 
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.linalg import Vectors, Vector
from pyspark.ml.functions import vector_to_array

In [0]:
# Obtain the active SparkSession through the builder rather than calling the
# SparkSession constructor on an implicit `sc` global. getOrCreate() returns
# the session that already exists when there is one, so this cell no longer
# depends on `sc` having been injected by the hosting environment.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
# `sqlc` is not used in the visible cells; it is still created in case other
# cells rely on it.
sqlc = SQLContext(sc)
# Print the effective Spark configuration as a list of (key, value) pairs.
print(sc.getConf().getAll())

In [0]:
# Import the train/test tables into DataFrames; the first line of each CSV
# file is used as the header.
train_dir = '/FileStore/tables/train_data.csv'
test_dir = '/FileStore/tables/test_data.csv'

def _read_csv_with_header(path):
  """Read the CSV file at `path` with its first line taken as column names."""
  return spark.read.csv(path, header=True)

train_df = _read_csv_with_header(train_dir)
test_df = _read_csv_with_header(test_dir)
# Optional inspection / caching, left disabled:
# train_df.show(); train_df.cache()
# test_df.show(); test_df.cache()

In [0]:
def distinctncount(column, df):
  """Print the distinct values of `column` in `df`, then the row count per value.

  Both tables are printed with show(); nothing is returned.
  """
  unique_values = df.select(column).distinct()
  unique_values.show()
  df.groupBy(column).count().show()

In [0]:
# neo and pha are both categorical flags with the values N and Y;
# the output also shows that neo has null values.
for frame in (train_df, test_df):
  for flag in ('neo', 'pha'):
    distinctncount(flag, frame)

In [0]:
# Turn null entries of neo into the string 'N' in both data sets:
# coalesce() returns neo when it is not null and the literal 'N' otherwise.
train_df = train_df.withColumn('neo', F.coalesce(col('neo'), lit('N')))
test_df = test_df.withColumn('neo', F.coalesce(col('neo'), lit('N')))

# Re-check the value counts after the fill, train first.
for frame in (train_df, test_df):
  distinctncount('neo', frame)

In [0]:
# Replace N with 0 and Y with 1 in the neo and pha columns of both data sets.
# The replacement is a regex substitution, so every 'N' / 'Y' character in a
# value is rewritten; the values stay strings at this point.
flag_columns = ('neo', 'pha')
letter_to_digit = (('N', '0'), ('Y', '1'))

for flag in flag_columns:
  for letter, digit in letter_to_digit:
    train_df = train_df.withColumn(flag, F.regexp_replace(flag, letter, digit))
    test_df = test_df.withColumn(flag, F.regexp_replace(flag, letter, digit))

# Show the recoded value counts, train first.
for frame in (train_df, test_df):
  for flag in flag_columns:
    distinctncount(flag, frame)

In [0]:
# The class feature has 13 categories; it should be turned into an index
# and then one-hot encoded.
for frame in (train_df, test_df):
  distinctncount('class', frame)

In [0]:
# Fit the class -> class_index mapping on the training data, then apply the
# same fitted model to both data sets.
indexer = StringIndexer(inputCol='class', outputCol='class_index')
model = indexer.fit(train_df)
train_indexed, test_indexed = (model.transform(frame) for frame in (train_df, test_df))

# Show the distinct (class, class_index) pairs and their row counts, train first.
index_cols = ('class', 'class_index')
for frame in (train_indexed, test_indexed):
  distinctDF = frame.select(*index_cols).distinct()
  distinctDF.show()
  countDF = frame.groupBy(*index_cols).count()
  countDF.show()

In [0]:
# One-hot encode class_index into class_binary. The encoder is fitted on the
# indexed training data and the fitted model is applied to both data sets.
encoder = OneHotEncoder(inputCol='class_index', outputCol='class_binary')
model = encoder.fit(train_indexed)
train_encoded, test_encoded = (
  model.transform(frame) for frame in (train_indexed, test_indexed)
)

# Show the distinct (class, class_index, class_binary) triples and their
# row counts, train first.
encoded_cols = ('class', 'class_index', 'class_binary')
for frame in (train_encoded, test_encoded):
  distinctDF = frame.select(*encoded_cols).distinct()
  distinctDF.show()
  countDF = frame.groupBy(*encoded_cols).count()
  countDF.show()

In [0]:
# Schema change now that the categorical data are encoded: every column
# except the one-hot vector class_binary is cast to double.
#
# The cast is done in one select() rather than withColumn() in a loop. Each
# withColumn() call adds another projection to the query plan, and the Spark
# documentation warns that calling it repeatedly in a loop can generate very
# large plans; a single select() yields the same columns in the same order.
def _cast_to_double_except(frame, keep=('class_binary',)):
  """Return `frame` with every column not named in `keep` cast to DoubleType.

  Column names and column order are unchanged; columns named in `keep`
  are passed through untouched.
  """
  return frame.select([
    col(name) if name in keep else col(name).cast(DoubleType()).alias(name)
    for name in frame.columns
  ])

train_encoded = _cast_to_double_except(train_encoded)
train_encoded.printSchema()

test_encoded = _cast_to_double_except(test_encoded)
test_encoded.printSchema()

In [0]:
# Drop the remaining categorical columns from both data sets.
dropped_columns = (
  'id', 'spkid', 'full_name', 'pdes', 'name',
  'prefix', 'orbit_id', 'class', 'equinox',
)

train_double = train_encoded.drop(*dropped_columns)
train_double.printSchema()

test_double = test_encoded.drop(*dropped_columns)
test_double.printSchema()

In [0]:
#create a list of the column names that still contain nulls
# trainnull = []
# for column in train_double.columns:
#   if train_double.where(col(column).isNull()).count() > 0:
#     trainnull.append(column)
    
# testnull = []
# for column in test_double.columns:
#   if test_double.where(col(column).isNull()).count() > 0:
#     testnull.append(column)

In [0]:
# The null-scan query above takes about 7 minutes to run on its own, so its
# results were copied in manually here to save time.
trainnull = [
  'H', 'diameter', 'albedo', 'diameter_sigma',
  'ad', 'per', 'per_y',
  'sigma_e', 'sigma_a', 'sigma_q', 'sigma_i', 'sigma_om', 'sigma_w',
  'sigma_ma', 'sigma_ad', 'sigma_n', 'sigma_tp', 'sigma_per',
  'rms',
]

testnull = [
  'H', 'diameter', 'albedo', 'diameter_sigma',
  'ma', 'ad', 'per',
  'sigma_ad', 'sigma_per',
]

In [0]:
# Fill the remaining nulls with the average of the respective column.
#
# All column means are computed in one aggregation job (instead of one
# collect() per column) and applied in one select() (instead of one
# withColumn() per column, which grows the query plan on every call).
# This also avoids rebinding the name `mean` to a float.
if trainnull:
  # A global aggregate always yields exactly one row: column name -> mean.
  train_means = train_double.select(
    [_mean(col(name)).alias(name) for name in trainnull]
  ).first().asDict()

  # Listed columns get their nulls replaced by the column mean; every other
  # column is passed through unchanged. Column order is preserved.
  train_double = train_double.select([
    when(col(name).isNull(), lit(train_means[name])).otherwise(col(name)).alias(name)
    if name in train_means else col(name)
    for name in train_double.columns
  ])

train_double.display()                             

neo,pha,H,diameter,albedo,diameter_sigma,epoch,epoch_mjd,epoch_cal,e,a,q,i,om,w,ma,ad,n,tp,tp_cal,per,per_y,moid,moid_ld,sigma_e,sigma_a,sigma_q,sigma_i,sigma_om,sigma_w,sigma_ma,sigma_ad,sigma_n,sigma_tp,sigma_per,rms,class_index,class_binary
0.0,0.0,19.1,5.50656552569257,0.1305999602098697,0.4777416015804461,2459000.5,59000.0,20200531.0,0.2395140902948033,2.467962162977906,1.8768504506302568,4.284708638278016,187.4435648528415,162.4326374957067,252.60662934088057,3.059073875325553,0.2542121643500829,2459422.955671756,20210727.4556718,1416.139943265003,3.8771798583572994,0.871481,339.15426077,1.6638e-06,2.8262e-07,3.9211000000000005e-06,2.1708000000000003e-05,0.00012328,0.0013828,0.0007827999999999999,3.5031e-07,4.3666e-08,0.0030714,0.00024325,0.54462,0.0,"List(0, 12, List(0), List(1.0))"
0.0,0.0,19.699,5.50656552569257,0.1305999602098697,0.4777416015804461,2457245.5,57245.0,20150811.0,0.2369724826300891,2.365728138761921,1.805115668491649,3.2149011772728504,58.35429368550654,290.4174436760314,345.8561643481903,2.926340609032194,0.2708674508386901,2457297.716815302,20151002.2168153,1329.063344027966,3.63877712259539,0.801128,311.77498376,0.0033089,0.015305,0.0056419,0.0244169999999999,0.57431,4.3121,2.1855,0.018932,0.0026285,8.4517,12.897,0.49686,0.0,"List(0, 12, List(0), List(1.0))"
0.0,0.0,16.4,5.50656552569257,0.1305999602098697,0.4777416015804461,2459000.5,59000.0,20200531.0,0.2503428686832137,2.715705981368436,2.0358483554925,11.03984398082382,236.99168698795668,51.7185707883975,349.60064228534617,3.395563607244372,0.2202317605851152,2459047.7200634778,20200717.2200635,1634.641611380422,4.4754048223967695,1.04054,404.9469518,4.520900000000001e-08,4.5365e-08,1.28e-07,9.3045e-06,3.9713e-05,4.5834e-05,2.832e-05,5.6721e-08,5.518300000000001e-09,0.00012943,4.0959e-05,0.5393,0.0,"List(0, 12, List(0), List(1.0))"
0.0,0.0,19.048,5.50656552569257,0.1305999602098697,0.4777416015804461,2459000.5,59000.0,20200531.0,0.1392587161510286,2.5278053684836403,2.17578643818893,6.0638148322223,22.79156041768309,350.82965423869337,335.73666891094086,2.8798242987783484,0.2452384858901295,2459099.4376973235,20200906.937697303,1467.958826663468,4.01905222905809,1.1768,457.975256,0.0011692999999999,0.0040728,0.0064406,0.0507699999999999,0.050185,0.72613,0.29743,0.00464,0.0005927,1.4209,3.5478,0.51993,0.0,"List(0, 12, List(0), List(1.0))"
0.0,0.0,17.6,5.50656552569257,0.1305999602098697,0.4777416015804461,2459000.5,59000.0,20200531.0,0.1246940298255613,2.58183895862292,2.2598990545115965,4.448999521990394,58.92993336985667,351.3884285789563,53.224126541995325,2.9037788627342414,0.2375802577731892,2458776.4741190583,20191019.9741191,1515.277419825352,4.14860347659234,1.27057,494.4677269,1.2554e-07,4.8201e-08,3.3747e-07,1.3808e-05,7.825699999999999e-05,9.8129e-05,4.7211000000000006e-05,5.4212e-08,6.6532e-09,0.00020218,4.2434e-05,0.3956699999999999,0.0,"List(0, 12, List(0), List(1.0))"
0.0,0.0,16.7,5.50656552569257,0.1305999602098697,0.4777416015804461,2459000.5,59000.0,20200531.0,0.1806618239919973,2.6104014333183607,2.138801549023742,15.47269809537461,96.546695216918,327.7133701350686,166.9163030554445,3.08200131761298,0.2336916123727959,2458286.241146801,20180616.741146803,1540.491746129557,4.21763653971131,1.16784,454.4882928,5.7705e-08,3.2518e-08,1.5127e-07,6.938200000000001e-06,2.9595e-05,3.8224e-05,2.8154e-05,3.8392e-08,4.3666e-09,0.00011492,2.8785e-05,0.5758,0.0,"List(0, 12, List(0), List(1.0))"
0.0,0.0,16.2,5.50656552569257,0.1305999602098697,0.4777416015804461,2459000.5,59000.0,20200531.0,0.2803519266124396,2.968650154672411,2.1363833643716843,1.088762837313048,210.8479264807232,164.188599159001,157.80261965459292,3.8009169449731375,0.192692824714768,2458181.56644662,20180304.0664466,1868.258460235284,5.11501289592138,1.13758,442.7120086,5.259e-08,2.2447e-08,1.6369e-07,5.3079e-06,0.00038975,0.00039002,1.7393e-05,2.874e-08,2.1855e-09,8.332700000000001e-05,2.119e-05,0.5865100000000001,0.0,"List(0, 12, List(0), List(1.0))"
0.0,0.0,16.4,3.72,0.032,0.395,2459000.5,59000.0,20200531.0,0.3004161866195726,3.086492018878315,2.159259856535146,4.163927679443685,156.1090193885375,157.4909471784951,260.5177115306226,4.013724181221484,0.1817633575581611,2459547.8176211407,20211129.317621104,1980.597216272296,5.422579647562751,1.14612,446.0355204,6.3937e-08,2.6732e-08,1.9487000000000002e-07,7.7623e-06,8.5094e-05,8.725e-05,1.369e-05,3.4763000000000005e-08,2.3614e-09,7.8789e-05,2.5731e-05,0.5997600000000001,0.0,"List(0, 12, List(0), List(1.0))"
0.0,0.0,17.3,5.50656552569257,0.1305999602098697,0.4777416015804461,2459000.5,59000.0,20200531.0,0.083025890413696,2.702075993094219,2.4777337278020988,5.972178180651,196.5621634811578,308.4727983571404,131.3563659362969,2.9264182583863376,0.2219002213683049,2458408.5385989416,20181017.0385989,1622.3507925324689,4.44175439433941,1.4944,581.575648,6.2072e-08,2.4774e-08,1.6735e-07,8.8411e-06,6.2159e-05,8.614e-05,5.5782e-05,2.6831e-08,3.0518e-09,0.0002521,2.2312e-05,0.6060800000000001,0.0,"List(0, 12, List(0), List(1.0))"
0.0,0.0,16.8,5.50656552569257,0.1305999602098697,0.4777416015804461,2459000.5,59000.0,20200531.0,0.1199888576546521,3.092329712277861,2.721284602610101,8.578432268703732,345.3696139885428,10.45378067323169,149.54274031091308,3.46337482194562,0.1812489018314974,2458175.4316062066,20180225.9316062,1986.2189307755536,5.43797106304053,1.71697,668.1932149,2.5931e-07,7.9908e-08,7.67e-07,1.2977e-05,5.2549e-05,8.235200000000001e-05,5.7106000000000005e-05,8.9496e-08,7.025399999999999e-09,0.00029743,7.6988e-05,0.54791,0.0,"List(0, 12, List(0), List(1.0))"


In [0]:
# Fill the remaining nulls of the test set with the average of the
# respective column.
# NOTE(review): the means are computed from test_double itself, not from the
# training data — confirm that imputing the test set with its own statistics
# is intended.
#
# All column means are computed in one aggregation job (instead of one
# collect() per column) and applied in one select() (instead of one
# withColumn() per column, which grows the query plan on every call).
# This also avoids rebinding the name `mean` to a float.
if testnull:
  # A global aggregate always yields exactly one row: column name -> mean.
  test_means = test_double.select(
    [_mean(col(name)).alias(name) for name in testnull]
  ).first().asDict()

  # Listed columns get their nulls replaced by the column mean; every other
  # column is passed through unchanged. Column order is preserved.
  test_double = test_double.select([
    when(col(name).isNull(), lit(test_means[name])).otherwise(col(name)).alias(name)
    if name in test_means else col(name)
    for name in test_double.columns
  ])

test_double.display()

neo,pha,H,diameter,albedo,diameter_sigma,epoch,epoch_mjd,epoch_cal,e,a,q,i,om,w,ma,ad,n,tp,tp_cal,per,per_y,moid,moid_ld,sigma_e,sigma_a,sigma_q,sigma_i,sigma_om,sigma_w,sigma_ma,sigma_ad,sigma_n,sigma_tp,sigma_per,rms,class_index,class_binary
0.0,0.0,17.3,5.5058838156012735,0.1307359631602307,0.4849465707680456,2459000.5,59000.0,20200531.0,0.394707474345741,2.637647025504356,1.5965480298519752,5.004695090879287,22.958513566461875,337.5101088922193,59.22066112957665,3.6787460211567367,0.2300801004496381,2458743.108504543,20190916.6085045,1564.672474049097,4.28383976467925,0.595232,231.64643744,9.492700000000001e-08,1.2809e-08,2.5372e-07,8.1749e-06,3.8829e-05,4.7473e-05,1.2116e-05,1.7864e-08,1.6759e-09,5.3733e-05,1.1397e-05,0.38528,3.0,"List(0, 12, List(3), List(1.0))"
0.0,0.0,17.435,5.5058838156012735,0.1307359631602307,0.4849465707680456,2459000.5,59000.0,20200531.0,0.0244510067490147,2.91967629872945,2.8482872738442784,1.119575173823843,252.1273087617482,106.5839576931803,229.4527604343868,2.991065323614623,0.1975613646337749,2459661.293368216,20220322.7933682,1822.218634029697,4.988962721504991,1.84345,717.4154365,8.5786e-08,5.6696e-08,2.6902e-07,9.4507e-06,0.00042754,0.00046719,0.00018963,5.8082000000000004e-08,5.754600000000001e-09,0.00096511,5.3077e-05,0.6791699999999999,0.0,"List(0, 12, List(0), List(1.0))"
0.0,0.0,18.776,5.5058838156012735,0.1307359631602307,0.4849465707680456,2455475.5,55475.0,20101006.0,0.2191642554913477,2.646781905940084,2.066701920076755,3.344411492070774,80.64116355815419,311.7612798852942,347.88970824386803,3.226861891803412,0.2288900101150546,2455528.408782476,20101127.9087825,1572.807829485618,4.30611315396473,1.07392,417.9374464,0.0023279,0.0201259999999999,0.0203789999999999,0.039986,0.37189,1.8663,0.96014,0.024537,0.0026108,4.5858,17.94,0.6167699999999999,0.0,"List(0, 12, List(0), List(1.0))"
1.0,0.0,22.5,5.5058838156012735,0.1307359631602307,0.4849465707680456,2459000.5,59000.0,20200531.0,0.3096638933943817,1.5803128109916815,1.090946993158978,18.361139327382578,356.2645883781242,118.2265405845208,257.205268366673,2.069678628824386,0.496123138871587,2459207.69600353,20201224.1960035,725.6263048298981,1.98665654984229,0.20569,80.0483773,8.965700000000001e-07,2.677e-08,1.42e-06,4.8644e-05,1.2937e-05,0.00025814,0.00014685,3.5060000000000004e-08,1.2606e-08,0.00030083,1.8438e-05,0.58458,5.0,"List(0, 12, List(5), List(1.0))"
0.0,0.0,17.660999999999998,5.5058838156012735,0.1307359631602307,0.4849465707680456,2459000.5,59000.0,20200531.0,0.1804749212675224,2.6669689271247594,2.1856479199789898,14.71333501444207,157.8498912794679,9.869375690142006,148.6862136504624,3.1482899342705286,0.2262961369475819,2458343.4573493567,20180812.9573494,1590.835817420023,4.3554710949213495,1.19451,464.8674567,2.2527e-06,3.5519e-06,3.3495e-06,6.375699999999999e-05,4.595e-05,0.0023724,0.0026384,4.193e-06,4.5208e-07,0.010353,0.0031780999999999,0.5413100000000001,0.0,"List(0, 12, List(0), List(1.0))"
0.0,0.0,20.66,5.5058838156012735,0.1307359631602307,0.4849465707680456,2459000.5,59000.0,20200531.0,0.0924174389856314,1.5443905923885768,1.401661969046523,16.39801617517548,77.12056926463045,70.74379348392286,109.52484736461771,1.6871192157306318,0.5135329950220777,2458787.22286216,20191030.7228622,701.0260362813162,1.9193046852329,0.476956,185.61696652,2.465e-05,4.9805e-05,7.8434e-05,0.0020685,0.0023971,0.0077039,0.039567,5.4408000000000005e-05,2.4841e-05,0.066737,0.033911,0.49827,3.0,"List(0, 12, List(3), List(1.0))"
0.0,0.0,17.7,5.5058838156012735,0.1307359631602307,0.4849465707680456,2459000.5,59000.0,20200531.0,0.131410319092072,2.213022516004117,1.922208521018076,3.423528816595293,112.0897681312028,126.02034568244552,24.14609158093405,2.5038365109901584,0.2993817515997451,2458919.846815723,20200311.346815698,1202.4781005400012,3.29220561407256,0.912365,355.06508705,4.6759e-08,2.3425e-08,9.8544e-08,6.6887e-06,0.00010878,0.00011385,3.8012e-05,2.6503e-08,4.7534e-09,0.00012629999999999998,1.9092e-05,0.52547,0.0,"List(0, 12, List(0), List(1.0))"
0.0,0.0,16.4,5.5058838156012735,0.1307359631602307,0.4849465707680456,2459000.5,59000.0,20200531.0,0.074136548168741,2.7612253250676733,2.556517610751046,3.418225972700529,178.5508037953154,249.1070225166954,230.7398544229616,2.9659330393842995,0.2148084206272985,2459602.2461754973,20220122.7461755,1675.911954236724,4.58839686307111,1.57275,612.0671175,6.5856e-08,3.2257e-08,1.7197e-07,6.9182e-06,0.00010537,0.00011496,4.8482e-05,3.4649e-08,3.7642e-09,0.00022880000000000003,2.9368e-05,0.66067,0.0,"List(0, 12, List(0), List(1.0))"
0.0,0.0,20.721,5.5058838156012735,0.1307359631602307,0.4849465707680456,2457689.5,57689.0,20161028.0,0.2389541476819469,2.285148162355376,1.73910253089278,7.582981572774446,211.726834177505,158.82038412814768,16.04026757813257,2.831193793817972,0.2853202056232544,2457633.2815224365,20160901.7815224,1261.740293554096,3.4544566558633703,0.741343,288.50845531,0.0019465999999999,0.008532,0.0068069,0.0723559999999999,0.037691,0.73847,0.38824,0.0105709999999999,0.0015979,1.4509,7.0664,0.4429,0.0,"List(0, 12, List(0), List(1.0))"
0.0,0.0,17.8,5.5058838156012735,0.1307359631602307,0.4849465707680456,2459000.5,59000.0,20200531.0,0.1587987781945787,2.261088069558625,1.902030046722377,3.390419000123865,297.2163376317379,211.9277581136044,242.84934747114428,2.6201460923948727,0.289886410173017,2459404.626059096,20210709.1260591,1241.865735565652,3.40004308163081,0.914438,355.87183646,6.793700000000002e-08,1.4045000000000001e-08,1.5413e-07,6.0846e-06,8.4619e-05,8.7025e-05,1.7485999999999998e-05,1.6275e-08,2.701e-09,6.1344e-05,1.1571e-05,0.6408,0.0,"List(0, 12, List(0), List(1.0))"


In [0]:
# Additional features "diameter_est" and "pseudo_target":
# start by previewing diameter, albedo and H in both data sets.
preview_columns = ('diameter', 'albedo', 'H')
for frame in (train_double, test_double):
  frame.select(*preview_columns).show()

In [0]:
# Estimated diameter based on the JPL method.
# Link: https://cneos.jpl.nasa.gov/tools/ast_size_est.html
#   diameter_est = 10 ** (temp_albedo - temp_H)
#   temp_albedo  = 3.1236 - 0.5 * log10(albedo)
#   temp_H       = 0.2 * H
def _add_diameter_est(frame):
  """Return `frame` plus the temp_albedo, temp_H and diameter_est columns."""
  albedo_term = 3.1236 - (0.5 * log10(col('albedo')))
  magnitude_term = 0.2 * col('H')
  return (
    frame
    .withColumn('temp_albedo', albedo_term)
    .withColumn('temp_H', magnitude_term)
    .withColumn('diameter_est', F.pow(10, (col('temp_albedo') - col('temp_H'))))
  )

train_new = _add_diameter_est(train_double)
train_new.select('diameter_est').show()

test_new = _add_diameter_est(test_double)
test_new.select('diameter_est').show()

In [0]:
'''
Among NEOs, the definition of “potentially hazardous
asteroids” (PHAs) is reserved for those objects with absolute
magnitudes H ≤ 22 (i.e., diameter above ∼140 m, assuming
the average NEO albedo of 0.14 found by Mainzer et al. 2011a)
and whose minimum orbit intersection distance (MOID) is
within 0.05 au (7.5 million km) of the Earthʼs orbit.
'''

# pseudo_target based on the rough classification from the literature:
# 1 when diameter_est >= 0.13, H <= 22 and moid <= 0.05 all hold, otherwise 0.
def _add_pseudo_target(frame):
  """Return `frame` with the 0/1 pseudo_target column added."""
  meets_all = (col('diameter_est') >=0.13) & (col('H') <=22) & (col('moid') <= 0.05)
  return frame.withColumn('pseudo_target', when(meets_all, 1).otherwise(0))

train_new = _add_pseudo_target(train_new)
test_new = _add_pseudo_target(test_new)

# Compare pseudo_target against the pha column: distinct pairs and their
# row counts, train first.
label_cols = ('pseudo_target', 'pha')
for frame in (train_new, test_new):
  distinctDF = frame.select(*label_cols).distinct()
  distinctDF.show()
  countDF = frame.groupBy(*label_cols).count()
  countDF.show()

In [0]:
# Drop the helper columns temp_albedo and temp_H, which are no longer needed;
# the *_clean_df frames are the result.
train_new = train_new.drop('temp_albedo')
train_clean_df = train_new.drop('temp_H')

test_new = test_new.drop('temp_albedo')
test_clean_df = test_new.drop('temp_H')

In [0]:
# FINAL DF TO BE USED: print the schema of the cleaned train and test frames.
for final_frame in (train_clean_df, test_clean_df):
  final_frame.printSchema()

In [0]:
# Render the cleaned training DataFrame as a table.
# NOTE(review): display() is presumably supplied by the notebook platform
# (the /FileStore paths suggest Databricks) — confirm before running elsewhere.
train_clean_df.display()

neo,pha,H,diameter,albedo,diameter_sigma,epoch,epoch_mjd,epoch_cal,e,a,q,i,om,w,ma,ad,n,tp,tp_cal,per,per_y,moid,moid_ld,sigma_e,sigma_a,sigma_q,sigma_i,sigma_om,sigma_w,sigma_ma,sigma_ad,sigma_n,sigma_tp,sigma_per,rms,class_index,class_binary,diameter_est,pseudo_target
0.0,0.0,19.1,5.50656552569257,0.1305999602098697,0.4777416015804461,2459000.5,59000.0,20200531.0,0.2395140902948033,2.467962162977906,1.8768504506302568,4.284708638278016,187.4435648528415,162.4326374957067,252.60662934088057,3.059073875325553,0.2542121643500829,2459422.955671756,20210727.4556718,1416.139943265003,3.8771798583572994,0.871481,339.15426077,1.6638e-06,2.8262e-07,3.9211000000000005e-06,2.1708000000000003e-05,0.00012328,0.0013828,0.0007827999999999999,3.5031e-07,4.3666e-08,0.0030714,0.00024325,0.54462,0.0,"List(0, 12, List(0), List(1.0))",0.5567093016648205,0
0.0,0.0,19.699,5.50656552569257,0.1305999602098697,0.4777416015804461,2457245.5,57245.0,20150811.0,0.2369724826300891,2.365728138761921,1.805115668491649,3.2149011772728504,58.35429368550654,290.4174436760314,345.8561643481903,2.926340609032194,0.2708674508386901,2457297.716815302,20151002.2168153,1329.063344027966,3.63877712259539,0.801128,311.77498376,0.0033089,0.015305,0.0056419,0.0244169999999999,0.57431,4.3121,2.1855,0.018932,0.0026285,8.4517,12.897,0.49686,0.0,"List(0, 12, List(0), List(1.0))",0.4225017164895758,0
0.0,0.0,16.4,5.50656552569257,0.1305999602098697,0.4777416015804461,2459000.5,59000.0,20200531.0,0.2503428686832137,2.715705981368436,2.0358483554925,11.03984398082382,236.99168698795668,51.7185707883975,349.60064228534617,3.395563607244372,0.2202317605851152,2459047.7200634778,20200717.2200635,1634.641611380422,4.4754048223967695,1.04054,404.9469518,4.520900000000001e-08,4.5365e-08,1.28e-07,9.3045e-06,3.9713e-05,4.5834e-05,2.832e-05,5.6721e-08,5.518300000000001e-09,0.00012943,4.0959e-05,0.5393,0.0,"List(0, 12, List(0), List(1.0))",1.930316298768884,0
0.0,0.0,19.048,5.50656552569257,0.1305999602098697,0.4777416015804461,2459000.5,59000.0,20200531.0,0.1392587161510286,2.5278053684836403,2.17578643818893,6.0638148322223,22.79156041768309,350.82965423869337,335.73666891094086,2.8798242987783484,0.2452384858901295,2459099.4376973235,20200906.937697303,1467.958826663468,4.01905222905809,1.1768,457.975256,0.0011692999999999,0.0040728,0.0064406,0.0507699999999999,0.050185,0.72613,0.29743,0.00464,0.0005927,1.4209,3.5478,0.51993,0.0,"List(0, 12, List(0), List(1.0))",0.5702016604906481,0
0.0,0.0,17.6,5.50656552569257,0.1305999602098697,0.4777416015804461,2459000.5,59000.0,20200531.0,0.1246940298255613,2.58183895862292,2.2598990545115965,4.448999521990394,58.92993336985667,351.3884285789563,53.224126541995325,2.9037788627342414,0.2375802577731892,2458776.4741190583,20191019.9741191,1515.277419825352,4.14860347659234,1.27057,494.4677269,1.2554e-07,4.8201e-08,3.3747e-07,1.3808e-05,7.825699999999999e-05,9.8129e-05,4.7211000000000006e-05,5.4212e-08,6.6532e-09,0.00020218,4.2434e-05,0.3956699999999999,0.0,"List(0, 12, List(0), List(1.0))",1.1107810900044577,0
0.0,0.0,16.7,5.50656552569257,0.1305999602098697,0.4777416015804461,2459000.5,59000.0,20200531.0,0.1806618239919973,2.6104014333183607,2.138801549023742,15.47269809537461,96.546695216918,327.7133701350686,166.9163030554445,3.08200131761298,0.2336916123727959,2458286.241146801,20180616.741146803,1540.491746129557,4.21763653971131,1.16784,454.4882928,5.7705e-08,3.2518e-08,1.5127e-07,6.938200000000001e-06,2.9595e-05,3.8224e-05,2.8154e-05,3.8392e-08,4.3666e-09,0.00011492,2.8785e-05,0.5758,0.0,"List(0, 12, List(0), List(1.0))",1.6812352133264812,0
0.0,0.0,16.2,5.50656552569257,0.1305999602098697,0.4777416015804461,2459000.5,59000.0,20200531.0,0.2803519266124396,2.968650154672411,2.1363833643716843,1.088762837313048,210.8479264807232,164.188599159001,157.80261965459292,3.8009169449731375,0.192692824714768,2458181.56644662,20180304.0664466,1868.258460235284,5.11501289592138,1.13758,442.7120086,5.259e-08,2.2447e-08,1.6369e-07,5.3079e-06,0.00038975,0.00039002,1.7393e-05,2.874e-08,2.1855e-09,8.332700000000001e-05,2.119e-05,0.5865100000000001,0.0,"List(0, 12, List(0), List(1.0))",2.1165497332598933,0
0.0,0.0,16.4,3.72,0.032,0.395,2459000.5,59000.0,20200531.0,0.3004161866195726,3.086492018878315,2.159259856535146,4.163927679443685,156.1090193885375,157.4909471784951,260.5177115306226,4.013724181221484,0.1817633575581611,2459547.8176211407,20211129.317621104,1980.597216272296,5.422579647562751,1.14612,446.0355204,6.3937e-08,2.6732e-08,1.9487000000000002e-07,7.7623e-06,8.5094e-05,8.725e-05,1.369e-05,3.4763000000000005e-08,2.3614e-09,7.8789e-05,2.5731e-05,0.5997600000000001,0.0,"List(0, 12, List(0), List(1.0))",3.899644439213334,0
0.0,0.0,17.3,5.50656552569257,0.1305999602098697,0.4777416015804461,2459000.5,59000.0,20200531.0,0.083025890413696,2.702075993094219,2.4777337278020988,5.972178180651,196.5621634811578,308.4727983571404,131.3563659362969,2.9264182583863376,0.2219002213683049,2458408.5385989416,20181017.0385989,1622.3507925324689,4.44175439433941,1.4944,581.575648,6.2072e-08,2.4774e-08,1.6735e-07,8.8411e-06,6.2159e-05,8.614e-05,5.5782e-05,2.6831e-08,3.0518e-09,0.0002521,2.2312e-05,0.6060800000000001,0.0,"List(0, 12, List(0), List(1.0))",1.275347331178873,0
0.0,0.0,16.8,5.50656552569257,0.1305999602098697,0.4777416015804461,2459000.5,59000.0,20200531.0,0.1199888576546521,3.092329712277861,2.721284602610101,8.578432268703732,345.3696139885428,10.45378067323169,149.54274031091308,3.46337482194562,0.1812489018314974,2458175.4316062066,20180225.9316062,1986.2189307755536,5.43797106304053,1.71697,668.1932149,2.5931e-07,7.9908e-08,7.67e-07,1.2977e-05,5.2549e-05,8.235200000000001e-05,5.7106000000000005e-05,8.9496e-08,7.025399999999999e-09,0.00029743,7.6988e-05,0.54791,0.0,"List(0, 12, List(0), List(1.0))",1.6055671640849551,0


In [0]:
# Render the cleaned test DataFrame as a table.
# NOTE(review): display() is presumably supplied by the notebook platform
# (the /FileStore paths suggest Databricks) — confirm before running elsewhere.
test_clean_df.display()

neo,pha,H,diameter,albedo,diameter_sigma,epoch,epoch_mjd,epoch_cal,e,a,q,i,om,w,ma,ad,n,tp,tp_cal,per,per_y,moid,moid_ld,sigma_e,sigma_a,sigma_q,sigma_i,sigma_om,sigma_w,sigma_ma,sigma_ad,sigma_n,sigma_tp,sigma_per,rms,class_index,class_binary,diameter_est,pseudo_target
0.0,0.0,17.3,5.5058838156012735,0.1307359631602307,0.4849465707680456,2459000.5,59000.0,20200531.0,0.394707474345741,2.637647025504356,1.5965480298519752,5.004695090879287,22.958513566461875,337.5101088922193,59.22066112957665,3.6787460211567367,0.2300801004496381,2458743.108504543,20190916.6085045,1564.672474049097,4.28383976467925,0.595232,231.64643744,9.492700000000001e-08,1.2809e-08,2.5372e-07,8.1749e-06,3.8829e-05,4.7473e-05,1.2116e-05,1.7864e-08,1.6759e-09,5.3733e-05,1.1397e-05,0.38528,3.0,"List(0, 12, List(3), List(1.0))",1.274683794808411,0
0.0,0.0,17.435,5.5058838156012735,0.1307359631602307,0.4849465707680456,2459000.5,59000.0,20200531.0,0.0244510067490147,2.91967629872945,2.8482872738442784,1.119575173823843,252.1273087617482,106.5839576931803,229.4527604343868,2.991065323614623,0.1975613646337749,2459661.293368216,20220322.7933682,1822.218634029697,4.988962721504991,1.84345,717.4154365,8.5786e-08,5.6696e-08,2.6902e-07,9.4507e-06,0.00042754,0.00046719,0.00018963,5.8082000000000004e-08,5.754600000000001e-09,0.00096511,5.3077e-05,0.6791699999999999,0.0,"List(0, 12, List(0), List(1.0))",1.1978500755804566,0
0.0,0.0,18.776,5.5058838156012735,0.1307359631602307,0.4849465707680456,2455475.5,55475.0,20101006.0,0.2191642554913477,2.646781905940084,2.066701920076755,3.344411492070774,80.64116355815419,311.7612798852942,347.88970824386803,3.226861891803412,0.2288900101150546,2455528.408782476,20101127.9087825,1572.807829485618,4.30611315396473,1.07392,417.9374464,0.0023279,0.0201259999999999,0.0203789999999999,0.039986,0.37189,1.8663,0.96014,0.024537,0.0026108,4.5858,17.94,0.6167699999999999,0.0,"List(0, 12, List(0), List(1.0))",0.645955298041903,0
1.0,0.0,22.5,5.5058838156012735,0.1307359631602307,0.4849465707680456,2459000.5,59000.0,20200531.0,0.3096638933943817,1.5803128109916815,1.090946993158978,18.361139327382578,356.2645883781242,118.2265405845208,257.205268366673,2.069678628824386,0.496123138871587,2459207.69600353,20201224.1960035,725.6263048298981,1.98665654984229,0.20569,80.0483773,8.965700000000001e-07,2.677e-08,1.42e-06,4.8644e-05,1.2937e-05,0.00025814,0.00014685,3.5060000000000004e-08,1.2606e-08,0.00030083,1.8438e-05,0.58458,5.0,"List(0, 12, List(5), List(1.0))",0.1162525437616596,0
0.0,0.0,17.660999999999998,5.5058838156012735,0.1307359631602307,0.4849465707680456,2459000.5,59000.0,20200531.0,0.1804749212675224,2.6669689271247594,2.1856479199789898,14.71333501444207,157.8498912794679,9.869375690142006,148.6862136504624,3.1482899342705286,0.2262961369475819,2458343.4573493567,20180812.9573494,1590.835817420023,4.3554710949213495,1.19451,464.8674567,2.2527e-06,3.5519e-06,3.3495e-06,6.375699999999999e-05,4.595e-05,0.0023724,0.0026384,4.193e-06,4.5208e-07,0.010353,0.0031780999999999,0.5413100000000001,0.0,"List(0, 12, List(0), List(1.0))",1.0794498358221016,0
0.0,0.0,20.66,5.5058838156012735,0.1307359631602307,0.4849465707680456,2459000.5,59000.0,20200531.0,0.0924174389856314,1.5443905923885768,1.401661969046523,16.39801617517548,77.12056926463045,70.74379348392286,109.52484736461771,1.6871192157306318,0.5135329950220777,2458787.22286216,20191030.7228622,701.0260362813162,1.9193046852329,0.476956,185.61696652,2.465e-05,4.9805e-05,7.8434e-05,0.0020685,0.0023971,0.0077039,0.039567,5.4408000000000005e-05,2.4841e-05,0.066737,0.033911,0.49827,3.0,"List(0, 12, List(3), List(1.0))",0.2712704355013194,0
0.0,0.0,17.7,5.5058838156012735,0.1307359631602307,0.4849465707680456,2459000.5,59000.0,20200531.0,0.131410319092072,2.213022516004117,1.922208521018076,3.423528816595293,112.0897681312028,126.02034568244552,24.14609158093405,2.5038365109901584,0.2993817515997451,2458919.846815723,20200311.346815698,1202.4781005400012,3.29220561407256,0.912365,355.06508705,4.6759e-08,2.3425e-08,9.8544e-08,6.6887e-06,0.00010878,0.00011385,3.8012e-05,2.6503e-08,4.7534e-09,0.00012629999999999998,1.9092e-05,0.52547,0.0,"List(0, 12, List(0), List(1.0))",1.0602358001333083,0
0.0,0.0,16.4,5.5058838156012735,0.1307359631602307,0.4849465707680456,2459000.5,59000.0,20200531.0,0.074136548168741,2.7612253250676733,2.556517610751046,3.418225972700529,178.5508037953154,249.1070225166954,230.7398544229616,2.9659330393842995,0.2148084206272985,2459602.2461754973,20220122.7461755,1675.911954236724,4.58839686307111,1.57275,612.0671175,6.5856e-08,3.2257e-08,1.7197e-07,6.9182e-06,0.00010537,0.00011496,4.8482e-05,3.4649e-08,3.7642e-09,0.00022880000000000003,2.9368e-05,0.66067,0.0,"List(0, 12, List(0), List(1.0))",1.9293119958316247,0
0.0,0.0,20.721,5.5058838156012735,0.1307359631602307,0.4849465707680456,2457689.5,57689.0,20161028.0,0.2389541476819469,2.285148162355376,1.73910253089278,7.582981572774446,211.726834177505,158.82038412814768,16.04026757813257,2.831193793817972,0.2853202056232544,2457633.2815224365,20160901.7815224,1261.740293554096,3.4544566558633703,0.741343,288.50845531,0.0019465999999999,0.008532,0.0068069,0.0723559999999999,0.037691,0.73847,0.38824,0.0105709999999999,0.0015979,1.4509,7.0664,0.4429,0.0,"List(0, 12, List(0), List(1.0))",0.2637560708948209,0
0.0,0.0,17.8,5.5058838156012735,0.1307359631602307,0.4849465707680456,2459000.5,59000.0,20200531.0,0.1587987781945787,2.261088069558625,1.902030046722377,3.390419000123865,297.2163376317379,211.9277581136044,242.84934747114428,2.6201460923948727,0.289886410173017,2459404.626059096,20210709.1260591,1241.865735565652,3.40004308163081,0.914438,355.87183646,6.793700000000002e-08,1.4045000000000001e-08,1.5413e-07,6.0846e-06,8.4619e-05,8.7025e-05,1.7485999999999998e-05,1.6275e-08,2.701e-09,6.1344e-05,1.1571e-05,0.6408,0.0,"List(0, 12, List(0), List(1.0))",1.0125173285618134,0


In [0]:
# train_clean_df.write.format("orc").save("savetable/train_clean.orc")
# test_clean_df.write.format("orc").save("savetable/test_clean.orc")