In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the whole file into memory and reinterpret its raw bytes as
# native 64-bit floats (the Python ``float`` dtype).
with open('N450M37rho15h120B57_00000', mode='rb') as f:
    b = f.read()

# count defaults to -1, i.e. the entire buffer is consumed.
np_data = np.frombuffer(b, dtype=np.float64)
df = pd.DataFrame(np_data)
df

Unnamed: 0,0
0,1.289516e-309
1,6.087800e+04
2,2.122026e-314
3,5.092824e-313
4,3.048712e+78
...,...
10926408,4.417390e-143
10926409,3.097845e-140
10926410,1.512415e-195
10926411,-2.617107e-131


In [4]:
%load_ext cython

In [6]:
%%cython --cplus
# or optionally %%cython --cplus --annotate which will show information about how
# the code is compiled to C/C++ binary

from cpython cimport Py_buffer
from cpython.buffer cimport PyBUF_SIMPLE, PyBUF_WRITEABLE
from libcpp.vector cimport vector


cdef class SimplestBuffer:
    """Growable byte buffer that exposes its contents via the buffer protocol.

    Bytes are appended with ``extend`` (from Python) or ``add_bytes`` (from
    Cython).  Consumers such as ``np.frombuffer`` obtain a flat, one
    dimensional view of the stored bytes through ``__getbuffer__``.

    Appending may reallocate the underlying C++ vector, which would leave any
    exported view pointing at freed memory.  The class therefore counts the
    views that are currently exported and refuses to grow while the count is
    non-zero.
    """
    cdef:
        vector[char] buf            # C++ vector manages the memory allocation
        Py_ssize_t view_count       # number of buffer views currently exported

    def __cinit__(self):
        self.view_count = 0

    # In Cython, methods defined with 'def' are slower but accessible from Python.
    def extend(self, input_bytes):
        """Append the contents of ``input_bytes`` to the internal storage.

        Raises:
            BufferError: if a buffer view of this object is still exported.
        """
        self.add_bytes(input_bytes, len(input_bytes))

    # Methods defined with 'cdef' may be faster but are accessible only from Cython.
    cdef add_bytes(self, char *b, Py_ssize_t num_bytes):
        """Append ``num_bytes`` bytes starting at ``b``.

        ``num_bytes`` is a Py_ssize_t so lengths of 2 GiB or more are not
        truncated the way a C ``int`` would truncate them.
        """
        if self.view_count > 0:
            # Growing could reallocate the vector and invalidate exported pointers.
            raise BufferError("cannot grow SimplestBuffer while a buffer view is exported")
        if num_bytes > 0:
            self.buf.insert(self.buf.end(), b, b + num_bytes)

    def __getbuffer__(self, Py_buffer *buffer, int flags):
        """Fill ``buffer`` with a flat byte view of the stored data.

        Only PyBUF_SIMPLE requests are accepted, optionally combined with
        PyBUF_WRITEABLE; any other request raises BufferError.
        """
        if flags != PyBUF_SIMPLE and flags != PyBUF_SIMPLE | PyBUF_WRITEABLE:
            raise BufferError

        # data() is well defined for an empty vector, unlike &self.buf[0].
        buffer.buf = self.buf.data()         # points to our buffer memory
        buffer.format = NULL                 # NULL format means bytes
        buffer.internal = NULL               # this is for our own use if needed
        buffer.itemsize = 1                  # size in bytes of a single element
        buffer.len = self.buf.size()
        buffer.ndim = 1
        buffer.obj = self
        buffer.readonly = not (flags & PyBUF_WRITEABLE)
        buffer.shape = NULL                  # none of shape, strides or suboffsets
        buffer.strides = NULL                # are used with PyBUF_SIMPLE
        buffer.suboffsets = NULL
        self.view_count += 1

    # The buffer protocol requires this method; it balances __getbuffer__.
    def __releasebuffer__(self, Py_buffer *buffer):
        self.view_count -= 1

In [7]:
# Read the raw bytes, stage them in the Cython-backed buffer, and view the
# buffer as little-endian 32-bit integers.
with open('N450M37rho15h120B57_00000', mode='rb') as f:
    b = f.read()

sb = SimplestBuffer()
sb.extend(b)    # extend() (implemented above) appends the bytes b to the buffer
df = pd.DataFrame(np.frombuffer(sb, dtype='<i4'))
df

Unnamed: 0,0
0,24
1,60769
2,0
3,1089321408
4,60878
...,...
21852821,394019710
21852822,-1532647622
21852823,-1529703543
21852824,-1537054759


In [8]:
%%cython --cplus

from cpython cimport Py_buffer
from cpython.buffer cimport PyBUF_SIMPLE, PyBUF_WRITEABLE
from libcpp.vector cimport vector
from libc.stdint cimport uint8_t, uint16_t
from libc.string cimport memcpy, strlen
from cython.operator cimport dereference as deref

# this is the same as the class above but we need to repeat it here
cdef class SimplestBuffer:
    """Growable byte buffer that exposes its contents via the buffer protocol.

    Bytes are appended with ``extend`` (from Python) or ``add_bytes`` (from
    Cython).  Consumers such as ``np.frombuffer`` obtain a flat, one
    dimensional view of the stored bytes through ``__getbuffer__``.

    Appending may reallocate the underlying C++ vector, which would leave any
    exported view pointing at freed memory.  The class therefore counts the
    views that are currently exported and refuses to grow while the count is
    non-zero.
    """
    cdef:
        vector[char] buf            # C++ vector manages the memory allocation
        Py_ssize_t view_count       # number of buffer views currently exported

    def __cinit__(self):
        self.view_count = 0

    # In Cython, methods defined with 'def' are slower but accessible from Python.
    def extend(self, input_bytes):
        """Append the contents of ``input_bytes`` to the internal storage.

        Raises:
            BufferError: if a buffer view of this object is still exported.
        """
        self.add_bytes(input_bytes, len(input_bytes))

    # Methods defined with 'cdef' may be faster but are accessible only from Cython.
    cdef add_bytes(self, char *b, Py_ssize_t num_bytes):
        """Append ``num_bytes`` bytes starting at ``b``.

        ``num_bytes`` is a Py_ssize_t so lengths of 2 GiB or more are not
        truncated the way a C ``int`` would truncate them.
        """
        if self.view_count > 0:
            # Growing could reallocate the vector and invalidate exported pointers.
            raise BufferError("cannot grow SimplestBuffer while a buffer view is exported")
        if num_bytes > 0:
            self.buf.insert(self.buf.end(), b, b + num_bytes)

    def __getbuffer__(self, Py_buffer *buffer, int flags):
        """Fill ``buffer`` with a flat byte view of the stored data.

        Only PyBUF_SIMPLE requests are accepted, optionally combined with
        PyBUF_WRITEABLE; any other request raises BufferError.
        """
        if flags != PyBUF_SIMPLE and flags != PyBUF_SIMPLE | PyBUF_WRITEABLE:
            raise BufferError

        # data() is well defined for an empty vector, unlike &self.buf[0].
        buffer.buf = self.buf.data()         # points to our buffer memory
        buffer.format = NULL                 # NULL format means bytes
        buffer.internal = NULL               # this is for our own use if needed
        buffer.itemsize = 1                  # size in bytes of a single element
        buffer.len = self.buf.size()
        buffer.ndim = 1
        buffer.obj = self
        buffer.readonly = not (flags & PyBUF_WRITEABLE)
        buffer.shape = NULL                  # none of shape, strides or suboffsets
        buffer.strides = NULL                # are used with PyBUF_SIMPLE
        buffer.suboffsets = NULL
        self.view_count += 1

    # The buffer protocol requires this method; it balances __getbuffer__.
    def __releasebuffer__(self, Py_buffer *buffer):
        self.view_count -= 1
    
# Walk through input_bytes record by record and copy each record into one or
# the other of two buffers, depending on the record's msg_type field.
def fan_bytes(bytes input_bytes not None,
              SimplestBuffer buf1 not None,
              SimplestBuffer buf2 not None):
    """Split a stream of length-prefixed records between ``buf1`` and ``buf2``.

    As read by this function, each record starts with a 4-byte header: a
    uint16 ``body_len`` followed by a uint16 ``msg_type``, both in host byte
    order, followed by ``body_len`` body bytes.  Whole records (header
    included) with ``msg_type == 1`` are appended to ``buf1``, those with
    ``msg_type == 2`` to ``buf2``; every other type is skipped.

    Walking stops at the first record whose header or body would extend past
    the end of ``input_bytes``, so a truncated or non-conforming tail is
    ignored instead of being read out of bounds.
    """
    # Py_ssize_t rather than int so inputs of 2 GiB or more do not overflow.
    cdef Py_ssize_t num_bytes = len(input_bytes)
    cdef char *b = <char *>input_bytes                # bytes objects can be cast to char *
    cdef Py_ssize_t cursor = 0
    cdef Py_ssize_t record_len
    cdef uint16_t msg_type
    cdef uint16_t body_len

    # A record needs at least its complete 4-byte header to be readable.
    while num_bytes - cursor >= 4:
        # memcpy avoids dereferencing a possibly unaligned uint16_t pointer.
        memcpy(&body_len, b + cursor, sizeof(uint16_t))
        memcpy(&msg_type, b + cursor + 2, sizeof(uint16_t))

        record_len = <Py_ssize_t>body_len + 4         # header plus body
        if record_len > num_bytes - cursor:
            break                                     # truncated record: stop here

        # copy bytes into either buf1 or buf2 depending on msg_type
        if msg_type == 1:
            buf1.add_bytes(b + cursor, record_len)
        elif msg_type == 2:
            buf2.add_bytes(b + cursor, record_len)

        cursor += record_len                          # move forward to start of next record
       

In [9]:
from IPython.display import display

with open('N450M37rho15h120B57_00000', mode='rb') as f:
    b = f.read()

# Two destination buffers: fan_bytes copies msg_type 1 records into sb1
# and msg_type 2 records into sb2.
sb1, sb2 = SimplestBuffer(), SimplestBuffer()
fan_bytes(b, sb1, sb2)

# dtypes used to interpret each buffer
dt1 = np.dtype('S5')
dt2 = np.dtype('<i4')

df1 = pd.DataFrame(np.frombuffer(sb1, dtype=dt1))
df2 = pd.DataFrame(np.frombuffer(sb2, dtype=dt2))

# Render both frames with the rich notebook display.
display(df1, df2)

Unnamed: 0,0


Unnamed: 0,0


In [None]:
# Stream the file in fixed-size chunks rather than loading it all at once,
# so that a very large file does not crash the kernel.

chunksize = 1_000_000  # number of rows per chunk
for chunk in pd.read_csv('N450M37rho15h120B57_00000', chunksize=chunksize):
    print(chunk)

In [None]:
# to delete header lines 

import csv, pandas as pd

# Scan the raw file twice: once to count its rows (so the last row can be
# dropped below) and once to locate the header that follows a "DATA START"
# marker.  newline='' is what the csv module documentation asks for when a
# file object is handed to csv.reader.
with open('N450M37rho15h120B57_00000', newline='') as unsan:
    line_count = 0
    headingrow = 0
    datarow = 0
    safety = 1
    detected_colnames = None   # header row found in the file, if any

    row_count = sum(1 for _ in csv.reader(unsan, delimiter=','))

    # The counting pass consumed the file handle; rewind it, otherwise the
    # scan below would iterate over nothing.
    unsan.seek(0)

    for row in csv.reader(unsan, delimiter=','):

        # Detect data start: the heading row is the line after the marker.
        if "DATA START" in str(row):
            safety = 0
            headingrow = line_count + 1
            datarow = line_count + 3

        # Capture the heading row once the marker has been seen, then stop.
        if safety == 0 and line_count == headingrow:
            detected_colnames = row
            line_count += 1
            break

        line_count += 1

# Rows to drop: the first 12 lines of the file and its very last row.
badrows = [*range(12), row_count - 1]

# Column names written to the output.  These hard-coded names are used
# regardless of what the scan above found; detected_colnames is kept only
# for inspection.
colnames = ['x', 'y [pc]', 'z [pc]', 'particle mass', 'density', 'B_x',
            'B_y', 'B_z', 'v_x', 'v_y', 'v_z', 'pressure', 'alpha', 'div v', 'poten',
            'dt', 'psi', 'div B', 'J_x', 'J_y', 'J_z', 'itype']
df = pd.read_csv('N450M37rho15h120B57_00000', names=colnames, skiprows=badrows, na_filter=False)

# Write the cleaned table without the index column.
df.to_csv('output', index=False, header=True)

In [None]:
# another way to delete header lines of a file

import io

class HeaderSkipCsv(io.TextIOBase):
    """Read-only text stream over a file that skips its leading header block.

    Everything up to and including the first blank line of the file is
    discarded; the remaining text is served through ``read`` so the object
    can be handed to a consumer that expects a file-like object.
    """

    def __init__(self, filename):
        """Create the character iterator for ``filename``."""
        super().__init__()
        self.data = self.yield_csv(filename)

    def readable(self):
        """Report that the stream supports reading (here for compatibility)."""
        return True

    def yield_csv(self, filename):
        """Open ``filename``, read past the first empty line, then yield the
        remaining characters one by one.

        Only one line at a time is held in memory.  If the file contains no
        empty line, nothing is yielded.
        """
        with open(filename) as f:
            # Consume the header block, including the blank separator line.
            for line in f:
                if line.strip() == "":
                    break
            # The same file iterator continues right after the blank line.
            for line in f:
                yield from line

    def read(self, n=None):
        """Return up to ``n`` characters following the previous read.

        With ``n`` omitted, ``None`` or negative, all remaining text is
        returned.  An empty string is returned once the data is exhausted.
        """
        if n is None or n < 0:
            return "".join(self.data)
        # range(n) comes first in zip() so that no character is pulled from
        # the generator (and lost) once n characters have been collected.
        return "".join(char for _, char in zip(range(n), self.data))

    def close(self):
        """Close the character generator (releasing the file) and the stream."""
        data = getattr(self, "data", None)
        if data is not None:
            data.close()
        super().close()

