In [1]:
import spock
import random
import numpy as np
import rebound
import pandas as pd
from spock import simsetup
from spock import FeatureClassifier

In [9]:
# Location of the cleaned data. It was generated from the original spock initial
# conditions data, filtered according to https://github.com/Ethadhani/SPOCKcleanData.git
datapath = 'cleanData/csvs/resonant/'

# Read the initial conditions and discard the junk 'Unnamed: 0' column right away
initial = pd.read_csv(datapath + 'clean_initial_conditions.csv').drop(columns='Unnamed: 0')
labels = pd.read_csv(datapath + 'clean_labels.csv')

# Pair each set of initial conditions with its labels through the shared
# 'runstring' key, then remove the 'Unnamed: 0' column brought in by the labels
Initialdataset = (
    initial.set_index('runstring')
    .join(labels.set_index('runstring'))
    .drop(columns='Unnamed: 0')
)

We can define a function that, given a row index and a table of initial conditions, returns the corresponding rebound simulation.

In [10]:
def get_sim(row, dataset):
    '''Given a row label and a data sheet containing initial conditions, returns the corresponding simulation

        Arguments:
            row: index label of the system inside dataset; the row is looked up
                with dataset.loc[row]
            dataset: data sheet containing the initial conditions. Each row must
                provide, for every body p0, p1, p2 and p3, the columns
                'p<i>m', 'p<i>x', 'p<i>y', 'p<i>z', 'p<i>vx', 'p<i>vy', 'p<i>vz'

        return: a rebound simulation with G set to 4*pi**2 and the four bodies
            added in the order p0, p1, p2, p3. If the row cannot be read or the
            simulation cannot be built, a message is printed and None is returned.'''
    try:
        data = dataset.loc[row]
        sim = rebound.Simulation()
        sim.G = 4*np.pi**2
        # The four bodies share the same column layout, so build each one from its prefix
        for body in range(4):
            prefix = 'p{0}'.format(body)
            sim.add(**{field: data[prefix + field]
                       for field in ('m', 'x', 'y', 'z', 'vx', 'vy', 'vz')})
        return sim
    except Exception as err:
        # Best-effort: a bad row is reported and skipped. Catching Exception instead
        # of using a bare except lets KeyboardInterrupt/SystemExit stop a long run,
        # and the cause is printed so failures can be diagnosed.
        print("Error reading initial condition {0}: {1!r}".format(row, err))
        return None

We can now generate the set of all simulations

In [11]:
# Row labels of the systems. The count is taken from `initial` because that is
# the frame get_sim indexes below; `initial` keeps the default 0..N-1 index from
# read_csv, whereas the joined frame is indexed by runstring.
systemNum = range(initial.shape[0])
# Build one simulation per row (get_sim returns None for a row it cannot read)
simSet = [get_sim(i, initial) for i in systemNum]

We can note the column names and import the different feature generators

In [12]:
# Names given to the feature columns, in order
col = [
    # "near" features
    'EMcrossnear', 'EMfracstdnear', 'EPstdnear', 'MMRstrengthnear',
    # "far" features
    'EMcrossfar', 'EMfracstdfar', 'EPstdfar', 'MMRstrengthfar',
    # MEGNO features
    'MEGNO', 'MEGNOstd',
]

In [13]:
# Instantiate the feature classifier.
# NOTE(review): this rebinds the name `spock`, so the `spock` module imported at
# the top of the notebook is no longer reachable under that name after this cell.
spock = FeatureClassifier()

In [14]:
def getList(features):
    '''Pull the feature values out of a generate_features-style return.

    The mapping stored at features[0][0] is read and its values are
    returned as a plain list, in the mapping's own order.'''
    featureMap = features[0][0]
    values = featureMap.values()
    return list(values)

We need to use an external Python file in order to make multiprocessing work within a Jupyter notebook.

We can then create the list of all generated features using both methods.

In [16]:
import sys
from multiprocessing import Pool

# Make the directory holding the helper module importable before importing it.
# The worker function is taken from that separate module to get around the
# jupyter notebook bug with multiprocessing.
sys.path.append('spockUpdate/train_models')
import genNewTrainHelper as helper

if __name__ == "__main__":
    with Pool() as pool:
        # One helper.getSpock call per simulation, spread over the worker pool
        features = pool.map(helper.getSpock, simSet)
        pool.close()
        pool.join()

# Turn every result into a plain list of values and label the columns
formattedFeat = pd.DataFrame(
    np.array([getList(entry) for entry in features]),
    columns=col,
)


We can then join the generated features with the corresponding labels

In [17]:
# Attach the labels to the generated features. The join aligns the two frames
# on their index, not on runstring.
# NOTE(review): this presumes labels rows are in the same order as the
# simulations that produced formattedFeat -- confirm.
dataset = formattedFeat.join(labels)

In [23]:
# Show the joined features-and-labels table as the cell output
dataset

Unnamed: 0.1,EMcrossnear,EMfracstdnear,EPstdnear,MMRstrengthnear,EMcrossfar,EMfracstdfar,EPstdfar,MMRstrengthfar,MEGNO,MEGNOstd,Unnamed: 0,runstring,instability_time,shadow_instability_time,Stable
0,0.060234,0.030043,0.000283,0.491222,0.504063,0.000882,0.000330,,1.991877,0.003666,0,0000000.bin,1.545872e+06,3.063700e+06,False
1,0.080547,0.017239,0.000138,0.432211,0.240504,0.008924,0.002141,0.008266,1.994730,0.003807,1,0000001.bin,9.990000e+08,9.990000e+08,True
2,0.129660,0.028785,0.000989,1.000486,1.001981,0.000513,0.000936,0.010448,1.975245,0.015875,2,0000002.bin,9.990000e+08,9.990000e+08,True
3,0.406112,0.038118,0.000942,0.348009,0.427768,0.035020,0.003593,0.012846,2.002438,0.000682,3,0000003.bin,2.287671e+06,8.392234e+06,False
4,0.059897,0.026167,0.001919,0.285146,0.257596,0.052339,0.001386,0.032904,1.805866,0.080916,4,0000004.bin,9.668931e+05,3.380350e+05,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102492,0.089252,0.274872,0.011063,0.050574,0.310694,0.033604,0.000987,1.041204,75.482329,20.204394,113537,9044761.bin,6.303165e+04,6.470086e+04,False
102493,0.082222,0.049940,0.016289,0.614960,0.664222,0.031330,0.003745,,1.998822,0.003453,113538,9045377.bin,6.990387e+05,8.267916e+05,False
102494,0.131799,0.027696,0.001457,0.660450,0.366664,0.067480,0.020412,0.005440,7.135916,1.918209,113540,9045380.bin,1.193822e+07,3.363291e+07,False
102495,0.209454,0.090838,0.010214,1.438882,0.395073,0.102280,0.038515,0.041145,2.117164,0.088393,113541,9045382.bin,2.064407e+08,4.316851e+07,False


We can then save the new training data spreadsheet

In [20]:
# Write the joined table to 'CleanResTrainingData.csv' inside the data directory
dataset.to_csv(datapath+'CleanResTrainingData.csv')