In [1]:
import pandas as pd
import numpy as np
import sys

In [2]:
# Read the two ejection tables (random and resonant). The merge cell further
# down selects their 'runstring' and 'ejection' columns.
randomejections, resonantejections = (
    pd.read_csv('randomejections.csv'),
    pd.read_csv('resonantejections.csv'),
)

In [3]:
# Column names for the initial-conditions tables: bodies p0..p3, each with the
# same seven suffixes (presumably mass, position x/y/z and velocity vx/vy/vz —
# TODO confirm against the data description). Order is body-major: p0m..p0vz,
# then p1m..p1vz, and so on, 28 names in total.
col = [
    f'p{body}{suffix}'
    for body in range(4)
    for suffix in ('m', 'x', 'y', 'z', 'vx', 'vy', 'vz')
]

# Names of the label columns that are split off into the clean label files.
lab = [
    'runstring',
    'instability_time',
    'shadow_instability_time',
    'Stable',
]

In [4]:
# Random dataset: read the header-less initial conditions and the labels file
# from the same directory.
randomPath = 'csvs/random/'
randomInitial = pd.read_csv(f'{randomPath}initial_conditions.csv', header=None)
randomLables = pd.read_csv(f'{randomPath}labels.csv')

# Name the initial-condition columns; this assignment raises if the file does
# not have exactly len(col) columns.
randomInitial.columns = col

# Index-aligned left join (DataFrame.join default): row i of the labels is
# attached to row i of the initial conditions.
# NOTE(review): this assumes both files list the systems in the same order —
# confirm.
randset = randomInitial.join(randomLables)

In [5]:
# Resonant dataset: read the header-less initial conditions and the labels file
# from the same directory.
resPath = 'csvs/resonant/'
resInitial = pd.read_csv(f'{resPath}initial_conditions.csv', header=None)
resLables = pd.read_csv(f'{resPath}labels.csv')

# Name the initial-condition columns; this assignment raises if the file does
# not have exactly len(col) columns.
resInitial.columns = col

# Index-aligned left join (DataFrame.join default): row i of the labels is
# attached to row i of the initial conditions.
# NOTE(review): this assumes both files list the systems in the same order —
# confirm.
resset = resInitial.join(resLables)

In [6]:
# Attach the 'ejection' flag to every system by matching on 'runstring'.
# This is an inner merge (the pandas default, spelled out here), so a system
# whose runstring is absent from the ejection table is dropped, and a runstring
# that appears more than once in the ejection table yields one output row per
# match.
randset = randset.merge(
    randomejections[['runstring', 'ejection']],
    how='inner',
    on='runstring',
)
resset = resset.merge(
    resonantejections[['runstring', 'ejection']],
    how='inner',
    on='runstring',
)

In [8]:
# Discard the 'Unnamed: 0' column from both datasets (presumably an index
# column that was saved into labels.csv — TODO confirm). Raises KeyError if the
# column is not present.
randset = randset.drop(columns='Unnamed: 0')
resset = resset.drop(columns='Unnamed: 0')

In [9]:
# Report how many systems in each dataset are flagged as ejections.
print('random:', randset['ejection'].value_counts(), sep='\n')
print('resonant:', resset['ejection'].value_counts(), sep='\n')

random:
ejection
False    24941
True        59
Name: count, dtype: int64
resonant:
ejection
False    113478
True         65
Name: count, dtype: int64


In [10]:
# Flag rows whose initial conditions (the `col` columns only) repeat an earlier
# row. With the default keep='first', the first appearance stays False and
# every later copy is marked True.
randset['isDup'] = randset.duplicated(subset=col)
resset['isDup'] = resset.duplicated(subset=col)

In [11]:
# Report how many rows in each dataset were flagged as duplicates.
print('random:', randset['isDup'].value_counts(), sep='\n')
print('resonant:', resset['isDup'].value_counts(), sep='\n')

random:
isDup
False    25000
Name: count, dtype: int64
resonant:
isDup
False    102559
True      10984
Name: count, dtype: int64


In [12]:
# A row is marked for removal when it is an ejection OR a duplicate, i.e. when
# either of the two flag columns equals True.
randset['remove'] = randset[['ejection', 'isDup']].eq(True).any(axis=1)
resset['remove'] = resset[['ejection', 'isDup']].eq(True).any(axis=1)


In [13]:
# Report how many rows in each dataset are about to be dropped.
print('random:', randset['remove'].value_counts(), sep='\n')
print('resonant:', resset['remove'].value_counts(), sep='\n')

random:
remove
False    24941
True        59
Name: count, dtype: int64
resonant:
remove
False    102497
True      11046
Name: count, dtype: int64


In [14]:
# Keep only the rows that are not marked for removal. 'remove' is a boolean
# column, so its negation is the keep-mask; the surviving rows retain their
# original index labels.
randset = randset.loc[~randset['remove']]
resset = resset.loc[~resset['remove']]


In [15]:
# Split each cleaned dataset into two tables: the initial conditions (the `col`
# columns plus 'runstring') and the labels (the `lab` columns, which also
# include 'runstring').
cleanrandinitial = randset.loc[:, [*col, 'runstring']]
cleanrandlables = randset.loc[:, lab]
cleanresinitial = resset.loc[:, [*col, 'runstring']]
cleanreslables = resset.loc[:, lab]

In [16]:
# Write the cleaned tables next to the raw files they were derived from.
# NOTE(review): to_csv writes the DataFrame index as an unnamed first column by
# default, so these files will show an 'Unnamed: 0' column when read back with
# a plain pd.read_csv — confirm that downstream readers expect that.
cleanrandinitial.to_csv(f'{randomPath}clean_initial_conditions.csv')
cleanrandlables.to_csv(f'{randomPath}clean_labels.csv')
cleanresinitial.to_csv(f'{resPath}clean_initial_conditions.csv')
cleanreslables.to_csv(f'{resPath}clean_labels.csv')
