# Comparing old and new feature generation functions to ensure identical results

In [1]:
import spock
import random
import numpy as np
import rebound
import pandas as pd
from spock import simsetup
from spock import FeatureKlassifier
from spock import FeatureClassifier

We can load initial conditions from our system dataset as REBOUND simulations

We will look at a subset of the resonant systems to ensure compatibility.

In [2]:
# Location of the resonant-system initial conditions, relative to this notebook
datapath = '../cleanData/csvs/resonant/clean_initial_conditions.csv'
# Read the table and discard the leftover 'Unnamed: 0' column in one chain
initial = pd.read_csv(datapath).drop(columns=['Unnamed: 0'])
# Preview the first rows
initial.head()

Unnamed: 0,p0m,p0x,p0y,p0z,p0vx,p0vy,p0vz,p1m,p1x,p1y,...,p2vy,p2vz,p3m,p3x,p3y,p3z,p3vx,p3vy,p3vz,runstring
0,0.999996,7e-06,-1.8e-05,1.269113e-07,5.9e-05,3.1e-05,-7.653712e-08,3.98492e-06,-0.98296,-0.183683,...,-0.437075,-0.002895,2.531702e-07,-1.288724,0.962201,-0.018973,-3.077432,-3.851732,-0.164377,0000000.bin
1,1.0,-2e-05,3e-06,-3.832419e-07,-1.5e-05,-9.2e-05,2.664138e-06,2.065557e-07,0.845257,-0.535312,...,5.61077,-0.157725,6.830431e-07,-1.327142,0.550229,-0.009861,-1.680953,-4.761961,-0.052477,0000001.bin
2,1.0,5.7e-05,-0.000215,-1.19784e-06,0.000394,0.000225,1.018178e-06,1.797281e-07,0.899352,-0.407481,...,-5.756639,0.019097,9.648477e-05,-0.208863,2.258035,0.009349,-4.143923,-0.430091,-0.016918,0000002.bin
3,0.999953,7.1e-05,4.5e-05,2.256372e-06,-0.000199,0.000297,-1.334874e-06,4.704418e-05,-0.819392,-0.685703,...,-0.179467,0.045955,1.615758e-05,-2.003066,-0.132781,-0.004192,0.29388,-4.423427,0.115038,0000003.bin
4,0.999996,-6.3e-05,1.3e-05,1.256863e-06,-4.5e-05,-0.000265,-3.323176e-06,3.514145e-06,0.865534,0.497927,...,0.668516,0.002231,4.6556e-05,1.292557,-0.324317,-0.026652,1.326032,5.277415,0.076018,0000004.bin


We can define a function that, given a row index and the table of initial conditions, returns the corresponding REBOUND simulation

In [3]:
def get_sim(row, dataset):
    '''Build a rebound simulation from one row of an initial-conditions table.

        Arguments:
            row: label of the row to build, looked up with dataset.loc[row].
                For each body p0..p3 the row must provide the columns
                'p<i>m', 'p<i>x', 'p<i>y', 'p<i>z', 'p<i>vx', 'p<i>vy', 'p<i>vz'

            dataset: table containing the initial conditions; it must support
                dataset.loc[row] (e.g. a pandas DataFrame)

        return: a rebound.Simulation with G = 4*pi**2 and the four bodies added
            in the order p0, p1, p2, p3; or None, after printing a message, if
            the row could not be read or the simulation could not be built'''
    fields = ('m', 'x', 'y', 'z', 'vx', 'vy', 'vz')
    try:
        data = dataset.loc[row]
        sim = rebound.Simulation()
        sim.G = 4*np.pi**2
        for body in range(4):
            prefix = 'p{0}'.format(body)
            # Column 'p<i><field>' feeds the sim.add keyword '<field>'
            sim.add(**{field: data[prefix + field] for field in fields})
        #sets dt and com
        #simsetup.init_sim_parameters(sim)
        return sim
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still propagate and a long batch
        # can be interrupted instead of silently yielding None.
        print("Error reading initial condition {0}".format(row))
        return None

We can now build simulations for the first 300 systems (rows 0–299) to compare

In [4]:
# Row labels of the systems to compare: 0 through 299
systemNum = range(300)
# One simulation per selected row, built in order
simSet = [get_sim(index, initial) for index in systemNum]

We can note the feature column names and instantiate the two feature generators

In [5]:
# Column labels for the feature tables built below
col = [
    'EMcrossnear',
    'EMfracstdnear',
    'EPstdnear',
    'MMRstrengthnear',
    'EMcrossfar',
    'EMfracstdfar',
    'EPstdfar',
    'MMRstrengthfar',
    'MEGNO',
    'MEGNOstd',
]

In [6]:
# Instantiate the two feature generators under comparison
new, old = FeatureKlassifier(), FeatureClassifier()

Create a table of all generated features using both methods

In [45]:
# Serial pass with the new generator: one row of feature values per simulation
new_rows = [list(new.generate_features(sim)[0][0].values()) for sim in simSet]
newMeth = pd.DataFrame(np.array(new_rows), columns=col)
# Same pass with the old generator
old_rows = [list(old.generate_features(sim)[0][0].values()) for sim in simSet]
oldMeth = pd.DataFrame(np.array(old_rows), columns=col)


In [7]:
def getList(features):
    '''Pull the feature values out of a single generate_features return value.

    Takes the first element of `features`, then the first mapping inside it,
    and returns that mapping's values as a list.'''
    feature_dicts = features[0]
    first_dict = feature_dicts[0]
    return [value for value in first_dict.values()]

In [7]:
# Run the old generator on the whole list of simulations in one call and display
# the raw return value; per the saved output, this is a list of
# ([OrderedDict of features], flag) tuples.
old.generate_features(simSet)

[([OrderedDict([('EMcrossnear', 0.06023365324093462),
                ('EMfracstdnear', 0.015818436914592334),
                ('EPstdnear', 0.00030940893570459324),
                ('MMRstrengthnear', 0.5191721201603476),
                ('EMcrossfar', 0.5040626705572052),
                ('EMfracstdfar', 0.0007338252818456995),
                ('EPstdfar', 0.00011825979445139067),
                ('MMRstrengthfar', nan),
                ('MEGNO', 1.9913328889741977),
                ('MEGNOstd', 0.0038996509098413955)])],
  True),
 ([OrderedDict([('EMcrossnear', 0.08054734124108819),
                ('EMfracstdnear', 0.01575998713340777),
                ('EPstdnear', 0.00014882830175720716),
                ('MMRstrengthnear', 0.41161459802864325),
                ('EMcrossfar', 0.2405042738149778),
                ('EMfracstdfar', 0.008907315020013077),
                ('EPstdfar', 0.002190021227913514),
                ('MMRstrengthfar', 0.008154010872997957),
                ('ME

In [8]:
from multiprocessing import Pool

# Only create the worker pool when this code runs as the main module
if __name__ == "__main__":
    with Pool() as pool:
        # Features from the new generator, computed in parallel, one row per system
        new_rows = [getList(result) for result in pool.map(new.generate_features, simSet)]
        newMeth = pd.DataFrame(np.array(new_rows), columns=col)
        # Same again with the old generator
        old_rows = [getList(result) for result in pool.map(old.generate_features, simSet)]
        oldMeth = pd.DataFrame(np.array(old_rows), columns=col)


KeyboardInterrupt: 

Combining both data sheets, we can create a column which tells us whether an identical duplicate of each row exists

In [10]:
# Stack the two feature tables; each row label then appears twice, once per method
allData = pd.concat([newMeth, oldMeth])
# Flag every row that has an exact twin. reset_index() turns the row label into a
# regular column, so a match requires the same row label as well as the same
# feature values. NOTE(review): this replaces references to `tn`/`to`, which are
# not defined anywhere in the notebook; confirm this matches what they held.
# .values assigns positionally: the reset index no longer lines up with
# allData's repeated row labels.
allData['dup'] = allData.reset_index().duplicated(keep=False).values

We can then count how many of the datasets have identical duplicates present, which includes having the same index

In [11]:
# Tally how many rows are flagged True vs. False in the 'dup' column.
# NOTE(review): the saved output under this cell is a table of rows rather than
# a value-count result; it looks stale, so re-run to refresh it.
allData['dup'].value_counts()

Unnamed: 0,EMcrossnear,EMfracstdnear,EPstdnear,MMRstrengthnear,EMcrossfar,EMfracstdfar,EPstdfar,MMRstrengthfar,MEGNO,MEGNOstd,dup
0,0.060234,0.015818,0.000309,0.519172,0.504063,0.000734,0.000118,,1.991333,0.003900,True
1,0.080547,0.015760,0.000149,0.411615,0.240504,0.008907,0.002190,0.008154,1.992337,0.004400,True
2,0.129660,0.029638,0.001016,1.004665,1.001981,0.000605,0.001013,0.010432,1.976166,0.017616,True
3,0.406112,0.037044,0.000954,0.270161,0.427768,0.031321,0.003735,0.013724,2.002622,0.000693,True
4,0.059897,0.020438,0.001445,0.207053,0.257596,0.052279,0.001528,0.035104,1.815040,0.082661,True
...,...,...,...,...,...,...,...,...,...,...,...
95,0.248961,0.180636,0.010347,0.160100,0.313106,0.086311,0.001346,0.695376,1.951805,0.157091,True
96,0.251792,0.031922,0.000618,0.169558,0.283201,0.016227,0.000294,0.051882,2.132727,0.056544,True
97,0.138286,0.014832,0.004957,0.040738,0.586498,0.011137,0.002792,1.062896,2.402203,0.193406,True
98,0.168006,0.014622,0.000612,0.049621,0.212054,0.009267,0.001204,1.419807,1.847572,0.161116,True


This tells us that every simulation has a matching row with identical features and index, which means that both feature generation methods produce identical results