-
Notifications
You must be signed in to change notification settings - Fork 0
/
read_files.py
433 lines (315 loc) · 12.6 KB
/
read_files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
'''
Python Routines for Cosmology and Data I/O (PyRCODIO) v0.2
Edoardo Carlesi (2020)
ecarlesi83@gmail.com
read_files.py: read different file formats and types
'''
import tools as t
import scipy as sp
import halo_utils as hu
import pandas as pd
import numpy as np
import dask.dataframe as dd
import sys
sys.path.append('pygadgetreader/')
from pygadgetreader import *
def read_ahf_halo(file_name, file_mpi=True):
    '''
    Read an AHF halo catalog into a pandas DataFrame.

    Parameters:
        file_name: path to the tab-separated AHF catalog
        file_mpi: if True, assume the slightly different column layout
                  produced by the MPI version of AHF

    Returns:
        A pandas DataFrame with 'ID' and 'HostHalo' columns normalized.
    '''
    halo = pd.read_csv(file_name, sep='\t')
    # NOTE: the original called halo.shift(1, axis=1) here and discarded the
    # result (shift is not in-place), so it was a no-op and has been removed.

    # MPI produced files have a slightly different formatting
    if file_mpi:
        # The first column is read incorrectly as "ID hostID" fused
        # together, so split it into the two real columns
        halo['ID'] = halo['#ID(1)'].apply(lambda x: x.split()[0])
        halo['HostHalo'] = halo['#ID(1)'].apply(lambda x: x.split()[1])

        # There's a NaN unknown column being read at the end of the file;
        # shift the remaining labels and tag the spurious column '0'
        new_columns = halo.columns[2:].insert(len(halo.columns[2:-2]), '0')

        # Drop the fused column, relabel, then drop the spurious column
        halo.drop('#ID(1)', axis=1, inplace=True)
        halo.columns = new_columns
        halo.drop('0', axis=1, inplace=True)
    else:
        halo.rename(columns={"#ID(1)": "ID", "hostHalo(2)": "HostHalo"}, inplace=True)

    # TODO: decide whether to also return a list of hu.Halo objects
    # halo is a DataFrame type
    return halo
def read_rs_halo(read_file=None, header_file=None, with_dask=True):
    '''
    Read a RockStar halo catalog, optionally using dask for large files.
    Example data: /srv/cosmdata/multidark/BigMD_3840_Planck1/ROCKSTAR/catalogs/

    Parameters:
        read_file: path to the RockStar catalog (16 header rows are skipped)
        header_file: optional path to a file whose first line holds the
                     column names; when None, fall back to t.rs_header()
        with_dask: if True, read with dask.dataframe instead of pandas

    Returns:
        A (dask or pandas) DataFrame with the RockStar columns assigned.
    '''
    if header_file is None:
        # Default column names provided by the tools module
        rs_head = t.rs_header()
    else:
        # Only the first line of the header file holds the column names
        # (clearer than the original ''.join(f.readlines(1)) idiom)
        with open(header_file, 'r') as f:
            rs_head = f.readline().split()

    if with_dask:
        rs_df = dd.read_csv(read_file, skiprows=16, header=None, delimiter=' ')
    else:
        rs_df = pd.read_csv(read_file, skiprows=16, header=None, delimiter=' ')

    rs_df.columns = rs_head
    return rs_df
def read_mah_halo(id_list, mah_path, time):
    '''
    Read the full mass accretion history (MAH) of each halo in id_list,
    given the path where the per-halo .allinfo files produced from the
    AHF catalogs live.

    Parameters:
        id_list: iterable of halo IDs; file names are <mah_path><ID>.allinfo
        mah_path: directory prefix for the .allinfo files
        time: a/z/t time data handed to each hu.HaloHistory object

    Returns:
        List of hu.HaloHistory objects, one per readable MAH file.
    '''
    mah_format = '.allinfo'
    all_mah = []
    cols = None
    n_cols = 0
    head_count = 0

    # The first column comes fused as "ID hostID"; split on whitespace and
    # keep the requested part (defined once, not per file as before)
    def _split_col(x, index):
        return [part for part in x.split(' ') if part][index]

    # Read the header from the first existing file. There is some issue with
    # delimiters so the header must be read separately, one time only.
    for ID in id_list:
        file_mah = mah_path + str(ID) + mah_format
        try:
            head = pd.read_csv(file_mah, sep=r'\s+', nrows=0)
            cols = head.columns
            n_cols = len(head.columns)
            head_count = 1
            break
        except FileNotFoundError:
            # This file does not exist, try the next ID
            pass

    if head_count == 0:
        print('There are no suitable MAH files in folder ', mah_path)
        print('Please make sure there are enough files with the right format. Exiting...')
        id_list = []

    for ID in id_list:
        file_mah = mah_path + str(ID) + mah_format
        try:
            # Read the body of the file, skipping the header row
            this_mah = pd.read_csv(file_mah, sep='\t', skiprows=1, header=None)

            new_mah = pd.DataFrame()
            new_mah['ID'] = this_mah.loc[:, 0].apply(lambda x: _split_col(x, 0))
            new_mah['HostHalo'] = this_mah.loc[:, 0].apply(lambda x: _split_col(x, 1))

            # Remaining columns are shifted by one because of the fused pair
            for i in range(2, n_cols):
                new_mah[cols[i]] = this_mah.loc[:, i - 1]

            halo_mah = hu.HaloHistory(time)
            halo_mah.load_full_mah(new_mah)
            all_mah.append(halo_mah)
        except (FileNotFoundError, ValueError):
            # The output file for this halo was not produced (too few
            # particles to be traced, most likely): skip it.
            # FileNotFoundError was previously unhandled in this loop.
            pass

    return all_mah
def read_csv_tree(file_name):
    '''
    Thin wrapper around pd.read_csv, kept for symmetry with the other
    catalog-reading routines in this module.
    '''
    return pd.read_csv(file_name)
def read_time(data_path):
    '''
    Read a redshift file (containing the z info of each snapshot) and a
    tabulated file containing a(t) sampled each 5 Myrs.

    Parameters:
        data_path: directory containing 'redshifts_sims.txt' (one z per
                   snapshot) and 'output_list_5Myr.txt' (a each 5 Myr)

    Returns:
        A (3, n_z) numpy array: row 0 = z, row 1 = a, row 2 = t [Gyr],
        ordered from high redshift to low redshift.
    '''
    f_a2myr = data_path + 'output_list_5Myr.txt'
    f_z = data_path + 'redshifts_sims.txt'
    df_z = pd.read_csv(f_z, header=None)
    df_a2myr = pd.read_csv(f_a2myr, header=None)

    # Scale factor from redshift: a = 1 / (1 + z)
    df_a = df_z.apply(lambda z: 1.0 / (1.0 + z))

    n_t = df_a2myr.shape[0]
    t2a = np.zeros((2, n_t))
    T0 = 13.75    # Time (Gyr) after the big bang
    step = 0.005  # 5 Megayears in Gyr

    t2a[1, :] = df_a2myr.values.T
    # Uniform time axis of n_t steps ending at (approximately) T0
    d0 = abs(T0 - n_t * step)
    for i in range(1, n_t + 1):
        t2a[0, i - 1] = d0 + i * step

    n_z = df_z.shape[0]
    time = np.zeros((3, n_z))

    # Store z and a from high redshift to low (reverse of file order)
    for i in range(0, n_z):
        time[0, i] = df_z.values[n_z - i - 1]
        time[1, i] = df_a.values[n_z - i - 1]

    # Interpolate t(a) once, after the a-row is complete. The original
    # recomputed this inside the loop on a partially filled row; only the
    # final iteration mattered, so the result is identical.
    time[2, :] = np.interp(time[1, :], t2a[1, :], t2a[0, :])
    return time
def read_snap(file_name=None, velocity=False, ids=False, part_types=[1], n_files=1):
    '''
    Return the particle content of a (possibly multi-file) snapshot as a
    pandas DataFrame, read via pygadgetreader's readsnap().

    Parameters:
        file_name: snapshot base path; '.0', '.1', ... appended if n_files > 1
        velocity: also read particle velocities (Vx, Vy, Vz columns)
        ids: also read particle IDs (ID column)
        part_types: particle type or list of particle types to read
        n_files: number of files the snapshot is split across

    Returns:
        DataFrame with the selected particle properties, or an empty
        DataFrame when nothing could be read.
    '''
    # Make this a list if it is not already a list
    if not isinstance(part_types, list):
        part_types = [part_types]

    # Column labels built to match the concatenation order used below:
    # [ID] + X, Y, Z + Type + [Vx, Vy, Vz]. The original labels put 'ID'
    # after 'Z' although the IDs were concatenated first, mislabeling data.
    cols = ['X', 'Y', 'Z', 'Type']
    if ids:
        cols = ['ID'] + cols
    if velocity:
        cols = cols + ['Vx', 'Vy', 'Vz']

    # Accumulates the rows read from all files and particle types
    full_data = None

    # Loop over snapshot files and particle types
    for f in range(0, n_files):
        if n_files > 1:
            this_file = file_name + '.' + str(f)
        else:
            this_file = file_name

        for part_type in part_types:
            try:
                # Read positions only
                print('Reading positions from file: ', this_file)
                particles = readsnap(this_file, 'pos', part_type)

                # Append a column holding the particle type
                ptype = np.zeros((len(particles), 1))
                ptype.fill(part_type)
                block = np.concatenate((particles, ptype), axis=1)

                if ids:
                    # Do we want to read the particle IDs?
                    print('Reading IDs from file: ', this_file)
                    pids = readsnap(this_file, 'pid', part_type)
                    # Reshape the IDs to a column for concatenation
                    pids = pids.reshape((len(pids), 1))
                    # BUGFIX: was np.concatenate((part_ids, ...)) with
                    # 'part_ids' undefined -> silent NameError, IDs lost
                    block = np.concatenate((pids, block), axis=1)

                if velocity:
                    # Do we want to read in velocity data as well
                    print('Reading velocities from file: ', this_file, ' ptype = ', part_type)
                    velocities = readsnap(this_file, 'vel', part_type)
                    block = np.concatenate((block, velocities), axis=1)

                # BUGFIX: 'full_data == None' raises ValueError on ndarrays
                # (swallowed by the old bare except, losing data) -> use 'is'
                if full_data is None:
                    full_data = block
                else:
                    full_data = np.concatenate((full_data, block), axis=0)
            except Exception:
                # TODO: narrow this down once readsnap's failure modes are
                # clear; for now a failed read just skips this type/file
                print('There are no particles of type: ', part_type, ' in file: ', this_file)

    if full_data is None:
        print('Warning! ', file_name, ' could not be exported to a DataFrame.')
        return pd.DataFrame()

    print('Found ', len(full_data), ' particles in total, for type(s): ', part_types, ' ncols: ', len(cols))
    # Return the selected particles' properties in a dataframe
    return pd.DataFrame(data=full_data, columns=cols)
def extract_vweb(file_name=None, center=None, radius=None):
    '''
    Extract the subset of a .csv vweb file lying within a given radius
    of a specified point in space.
    '''
    web_df = pd.read_csv(file_name)
    dist_col = 'radius'
    print(web_df.head())

    # Position columns used for the distance computation
    pos_cols = ['x', 'y', 'z']

    # Check that the units are consistent
    fac = t.check_units(data=web_df, cols=pos_cols)

    # Distance of each grid point from the requested center
    web_df[dist_col] = web_df[pos_cols].T.apply(lambda p: t.distance(p, center))

    # Keep only the points inside the requested radius
    return web_df[web_df[dist_col] < radius]
def read_lg_lgf(TA=False):
    '''
    Read all the LGF / Hestia simulation LG data from the precomputed csv,
    choosing the timing-argument variant when TA is True.
    '''
    data_file = ('/home/edoardo/CLUES/PyRCODIO/output/lg_pairs_512_TA.csv' if TA
                 else '/home/edoardo/CLUES/PyRCODIO/output/lg_pairs_512.csv')
    return pd.read_csv(data_file)
def read_lg_fullbox(file_base='/home/edoardo/CLUES/PyRCODIO/output/lg_fullbox', TA=False):
    '''
    Read all the fullbox LG data for each box (00-04), without VWeb info.

    Parameters:
        file_base: path prefix of the csv files ('_NN.csv' is appended)
        TA: if True, read the single '_TA.csv' file instead

    Returns:
        DataFrame with the five boxes concatenated (or the TA data alone).
    '''
    if TA:
        return pd.read_csv(file_base + '_TA.csv')

    # The original repeated the read five times; a loop does the same
    frames = [pd.read_csv(file_base + '_%02d.csv' % i) for i in range(5)]
    return pd.concat(frames)
def read_lg_rs_fullbox(file_base='/home/edoardo/CLUES/PyRCODIO/output/lg_fullbox_rs', lgf_data=False, lgf_hires_data=False, files=[0, 20], TA=False):
    '''
    Read the fullbox RockStar LG data for a range of box numbers.

    Parameters:
        file_base: path prefix of the csv files ('_NNNN.csv' is appended)
        lgf_data, lgf_hires_data, TA: currently unused; kept for interface
            compatibility with the other reader routines
        files: [first, last) box numbers to read

    Returns:
        DataFrame with all readable boxes concatenated, empty if none found.
    '''
    all_data = []
    for i in range(files[0], files[1]):
        this_data_file = '%s_%04d.csv' % (file_base, i)
        try:
            all_data.append(pd.read_csv(this_data_file))
        except FileNotFoundError:
            # Missing box files are simply skipped
            pass

    # Guard: pd.concat([]) would raise ValueError when no file was found
    if not all_data:
        return pd.DataFrame()

    return pd.concat(all_data)
def read_lg_vweb(grid_size=64, file_base='/home/edoardo/CLUES/PyRCODIO/output/lg_fullbox_vweb_'):
    '''
    Read the vweb data ONLY at the LG position, for boxes 00-04.

    Parameters:
        grid_size: vweb grid size, encoded as a zero-padded 3-digit suffix
        file_base: path prefix of the csv files ('NNN_MM.csv' is appended)

    Returns:
        DataFrame with the five boxes concatenated.
    '''
    prefix = file_base + '%03d' % grid_size
    # The original repeated the read five times; a loop does the same
    frames = [pd.read_csv(prefix + '_%02d.csv' % i) for i in range(5)]
    return pd.concat(frames)
def read_lg_fullbox_vweb(grids = [64]):
    '''
    Read both vweb and LG data and merge them into a single DataFrame.
    grids is a list of cosmic-web grid sizes; each one contributes its
    columns with a '_<grid>' suffix appended.
    '''
    # Full per-LG data first
    data = read_lg_fullbox()

    # Append the cosmic web info at each requested scale
    for grid_size in grids:
        web = read_lg_vweb(grid_size = grid_size)
        # Skip the first two columns, which only contain IDs
        for column in web.columns[2:]:
            data[column + '_' + str(grid_size)] = web[column]

    return data
# Script entry point: no standalone behavior implemented yet
if __name__ == '__main__':
    ''' Locally test some function '''
    pass