## Data Preperation 

* Many of the columns are just repeated the values, changed the column data type to categorical data type in this way the the rows are just references to these values.

* Using the Parquet file type with PyArrow (or fast Parquet) . The parquet file encodes our data types and it is also lighter and much faster to load. Pandas will now use PyArrow in the backend  now the whole BHB data set can be loaded and manipulated easily.

PyArrow
```Shell
conda install -c conda-forge pyarrow
```

In [1]:
import os
import pandas as pd
from pathlib import Path

cpu_model = !sed '5!d' /proc/cpuinfo
n_cpus=os.cpu_count()

print(cpu_model, '\n CPUS:', n_cpus)

data_folder = Path(r'D:\iorio+22_Zenodo_repository_V2\iorio+22_Zenodo_repository_V2\data_from_simulations\simulation_F')
data_path = Path(data_folder,'BHBHm.csv')
df = pd.read_csv(Path(data_folder,'BHBHm.csv'))



["'sed' is not recognized as an internal or external command,", 'operable program or batch file.'] 
 CPUS: 16


In [2]:
df.columns

Index(['Mass_0', 'Mass_1', 'Z', 'alpha', 'q'], dtype='object')

### This  is innefficent as we don't actually need all of these columns

In [3]:
df=df[["Mass_0","Mass_1","Z","alpha"]]
df['q'] = df['Mass_1'] / df['Mass_0']
df
#add q column to the csv file
df.to_csv(data_path, index=False)


### Better yet to not load the extra columns at all

In [4]:
df=pd.read_csv(data_path, usecols = ['Mass_0','Mass_1','Z','alpha','q'])
df

Unnamed: 0,Mass_0,Mass_1,Z,alpha,q
0,17.806320,15.805690,0.0001,0.5,0.887645
1,33.248700,25.701310,0.0001,0.5,0.773002
2,24.536760,20.099740,0.0001,0.5,0.819168
3,34.415210,25.073250,0.0001,0.5,0.728551
4,18.258180,7.264607,0.0001,0.5,0.397882
...,...,...,...,...,...
159008,10.572550,10.394180,0.0300,3.0,0.983129
159009,9.384505,10.410270,0.0300,3.0,1.109304
159010,7.553980,6.708311,0.0300,5.0,0.888050
159011,10.572550,10.394180,0.0300,5.0,0.983129


Make the repetitive columns categorical

In [5]:
df['alpha'] = df.alpha.astype('category')
df['Z'] = df.Z.astype('category')
df.to_parquet('BHBHm.pq')

In [6]:
old_size=os.stat(data_path).st_size
new_size=os.stat('BHBHm.pq').st_size
print("file now", round((new_size/old_size)*100), '% of the original size')

file now 35 % of the original size


## Reading Parquet Files
```Python
from pyarrow.parquet import ParquetFile
import pyarrow as pa 

pf = ParquetFile('BHNSm.pq') 
first_ten_rows = next(pf.iter_batches(batch_size = 30)) 
df = pa.Table.from_batches([first_ten_rows]).to_pandas() 
df
```


In [7]:
pd.read_parquet('BHBHm.pq')

Unnamed: 0,Mass_0,Mass_1,Z,alpha,q
0,17.806320,15.805690,0.0001,0.5,0.887645
1,33.248700,25.701310,0.0001,0.5,0.773002
2,24.536760,20.099740,0.0001,0.5,0.819168
3,34.415210,25.073250,0.0001,0.5,0.728551
4,18.258180,7.264607,0.0001,0.5,0.397882
...,...,...,...,...,...
159008,10.572550,10.394180,0.0300,3.0,0.983129
159009,9.384505,10.410270,0.0300,3.0,1.109304
159010,7.553980,6.708311,0.0300,5.0,0.888050
159011,10.572550,10.394180,0.0300,5.0,0.983129


In [8]:
def read_rows(file, nrows=10, skiprows=11):  
    from pyarrow.parquet import ParquetFile
    import pyarrow as pa 
    pf = ParquetFile(file) 
    batch_list=list(pf.iter_batches(batch_size = nrows))
    nbatch=len(batch_list)
    nskip=int(skiprows/nrows)
    df = pa.Table.from_batches([batch_list[nskip]]).to_pandas()
    #could write something to open up batches either side and make new df
    #not teh most efficent
    #rounds to closed batch
    return df

read_rows('BHBHm.pq')

Unnamed: 0,Mass_0,Mass_1,Z,alpha,q
0,36.20616,40.82686,0.0001,0.5,1.127622
1,24.29805,20.73639,0.0001,0.5,0.853418
2,14.61868,5.704515,0.0001,0.5,0.390221
3,14.78636,14.88673,0.0001,0.5,1.006788
4,37.78879,28.16973,0.0001,0.5,0.745452
5,30.89322,17.10746,0.0001,0.5,0.553761
6,24.28255,8.73544,0.0001,0.5,0.359741
7,42.18942,44.56454,0.0001,0.5,1.056297
8,18.72427,13.67951,0.0001,0.5,0.730576
9,29.18185,32.11718,0.0001,0.5,1.100588


In [9]:
from pyarrow.parquet import ParquetFile
import pyarrow as pa 

pf = ParquetFile('BHBHm.pq') 
first_ten_rows = next(pf.iter_batches(batch_size = 90)) 
df = pa.Table.from_batches([first_ten_rows]).to_pandas() 
df

Unnamed: 0,Mass_0,Mass_1,Z,alpha,q
0,17.806320,15.805690,0.0001,0.5,0.887645
1,33.248700,25.701310,0.0001,0.5,0.773002
2,24.536760,20.099740,0.0001,0.5,0.819168
3,34.415210,25.073250,0.0001,0.5,0.728551
4,18.258180,7.264607,0.0001,0.5,0.397882
...,...,...,...,...,...
85,9.355383,6.320489,0.0001,0.5,0.675599
86,19.801610,18.253250,0.0001,0.5,0.921806
87,20.328270,21.918940,0.0001,0.5,1.078249
88,25.606950,15.864300,0.0001,0.5,0.619531


In [14]:
#spliting the file into 3 parts for train test and validation
from sklearn.model_selection import train_test_split
df=pd.read_parquet('BHBHm.pq')
train, test = train_test_split(df, test_size=0.2)
train, valid = train_test_split(train, test_size=0.2)
train.to_parquet('train.pq')
test.to_parquet('test.pq')
valid.to_parquet('valid.pq')
