# File PreProcessing With C and Cython

#### *William Murphy*<br> *General Reinsurance Corporation*

---
*This jupyter notebook sets up an efficent way of extracting data from unstructured data sources.*

In [4]:
# - setup notebook display features
from IPython.display import IFrame
from IPython.display import HTML
from IPython.display import (display_pretty, display_html, display_jpeg, 
                             display_png, display_json, display_latex, display_svg)


html_begin = """

"""

html_framework = """
<body style="font color="blue";>
   <p> File PreProcessing With C and Cython</p>
</body>
"""
css_framework = """
<style>
    body:p {
        color: #007bff;
        font-size: 10px;
        
    }
</style>
"""
html_ = css_framework + html_framework 
print(html_)


HTML(html_)


<style>
    body:p {
        color: #007bff;
        font-size: 10px;
        
    }
</style>

<body style="font color="blue";>
   <p> File PreProcessing With C and Cython</p>
</body>



In [1]:
%load_ext cython

### Fast File I/O
---
*Using __Cython__, we can efficiently read in files for faster data processing.*

*In this notebook, we use __C__ and __Cython__ to:*
- __read in data__
- __perform tokenization__
- __NLP__
- __neural neyworks__

In [17]:
%%cython
# cython: boundscheck = False
import os
import numpy as np
cimport numpy as np
from libc.stdlib cimport malloc, realloc, free
from libc.stdio cimport fopen, fclose, FILE, EOF, fseek, SEEK_END, SEEK_SET
from libc.stdio cimport ftell, fgetc, fgets, getc, gets, feof, fread, getline
from libc.string cimport strlen, memcpy, strcpy, strtok
from cython.parallel import prange, parallel, threadid

cpdef int read_file(char *filename):
    """Efficiently read in a file"""

    cdef FILE *fp = NULL # create a file pointer
    fp = fopen(filename, "r")
    if fp == NULL:
        raise FileNotFoundError(2, "No such file or directory: '%s'" % filename)
    
    
    # file parsing variables
    cdef int c = 0
    cdef int num_bytes = 0
    # allocate 1000 bytes of memory
    #cdef char *s = <char*>malloc(1000*sizeof(char))
    
    # bypass the gil
    with nogil:
        while True:
            c = fgetc(fp)
            if feof(fp):
                break
            num_bytes += 1
    # close the file
    fclose(fp)
    return num_bytes
        
cpdef int cy_read(char *filename):
    """Read in a file"""
    cdef FILE *fp = NULL
    fp = fopen(filename, "r")
    if fp == NULL:
        raise FileNotFoundError(2, "No such file or directory: '%s'" % filename)
    cdef int flen = 0
    with nogil:
        fseek(fp, 0, SEEK_END)
        flen = ftell(fp)
        fclose(fp)
        return flen
    

cpdef cy_file_read(char *filename):
    cdef FILE *fp = NULL
    fp = fopen(filename, "r")
    if fp == NULL:
        raise FileNotFoundError(2, "No such file or directory: '%s'" % filename)
    # get the file length
    cdef long flen = 0
    cdef char *whole_text = NULL
    
    fseek(fp, 0, SEEK_END)
    flen = ftell(fp)
    fseek(fp, 0, SEEK_SET)
    whole_text = <char*>malloc(flen*sizeof(char))
    fread(whole_text, 1, flen, fp)
    fclose(fp)
    return whole_text
        
cpdef int cy_str_tok(char *filename, char *delimiter):
    cdef FILE *fp
    fp = fopen(filename, "r")
    if fp == NULL:
        raise FileNotFoundError(2, "No such file or directory: '%s'" % filename)
    
    # get the file length
    cdef long flen = 0
    cdef char *whole_text = NULL 
    fseek(fp, 0, SEEK_END)
    flen = ftell(fp)
    fseek(fp, 0, SEEK_SET)
    whole_text = <char*>malloc(flen*sizeof(char))
    fread(whole_text, 1, flen, fp)
    fclose(fp)
    
    # get the number of tokens
    cdef int tok_counter = 1
    cdef char *token
    token = strtok(whole_text, delimiter)
    with nogil:
        while token != NULL:
            token = strtok(NULL, delimiter)
            tok_counter += 1
    return tok_counter
    

    
def py_tokenize(filename, delimiter):
    "tokenize an input file"
    with open(filename, "r") as f:
        contents = f.read()
        counter = 1
        for c in contents:
            if c == delimiter:
                counter += 1
        return counter
    

emlFile = b"Y:\\Shared\\USD\\Business Data and Analytics\\Claims_Pipeline_Files\\Mapping_Files\\EmlMappingFile.csv"

# check this version(works as well)
#print(cy_file_read(emlFile))
#print(os.path.getsize(emlFile))
#print(read_file(emlMappingFile))
print(cy_str_tok(emlFile, ','))
print(py_tokenize(emlFile, ','))
#print(cy_file_read(emlFile))



8620
8565


### Using Cython Pointers

In [57]:
%%cython

# General purpose pointer - no type given
cdef void* p

# Store int data in p
cdef int x = 123
p = &x # p points to the memory address of x

# Now cast the pointer to an int
cdef int *iPtr = <int*>p
print(iPtr[0]) # get the value stored

# Utilizing funtion pointers
cdef double f(int x):
    return x / 10.0

# Using cython's ctypedef[ in C, it's the typedef keyword]
ctypedef double(*function_ptr)(int) # funt pointer that takes an int and returns a double

# x now points to f
cdef function_ptr fp
fp = &f
# Print output
print(fp(2))

123
0.2


### Faster File Reading With C and Cython

In [35]:
%%cython
# cython: language_level=3
# cython: embedsignature=True
# cython: profile=True
# cython: boundscheck=False
# coding: utf8
from libc.stdlib cimport malloc, realloc, free
from libc.stdio cimport fopen, fclose, FILE, EOF, fseek, SEEK_END, SEEK_SET
from libc.stdio cimport ftell, fgetc, fgets, getc, gets, feof, fread, getline
from libc.string cimport strlen, memcpy, strcpy, strtok, strchr, strncpy
from cython.parallel import prange, parallel, threadid
from cpython.mem cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free

# preprocessor directive
DEF BUFFER = 100

# template for numeric types
ctypedef fused numeric_var:
    int
    long
    long long
    float
    double

# Token is a c level struct that stores token information
cdef readonly struct Token:
    char **token_array
    int *token_size_array
    
cdef readonly struct Columns:
    char **column_array
    
cdef readonly struct Rows:
    Columns *Col
    
cdef readonly struct DataContainer:
    char ***data_frame
    char *columns
    char **data_array
    

    
cdef class Tokenize:
    """Tokenize the input file/string"""
    cdef Token *Tok
    cdef FILE *fp
    cdef readonly:
        char *filename
        char *column_header
        char *delimiter
        char newline
        char *file_contents
        char current_char
        int iterator
        int c
        int num_columns
        long file_size
        int num_tokens
        bint is_open
        bint EO_STR
        
    def __init__(self, char *delimiter, char *filename):
        self.Tok = <Token*>malloc(sizeof(Token))
        self.delimiter = delimiter
        self.newline = b"\n"
        self.is_open = 0
        self.fp = NULL
        self.filename = filename
        self.column_header = NULL
        self.file_contents = NULL
        self.file_size = 0
        self.num_tokens = 0
        self.iterator = 0
        self.current_char = b" "
        self.c = 0
        self.EO_STR = 0
        self.num_columns = 0
        
    def open_file(self):
        """Open the file for reading"""
        self.fp = fopen(self.filename, "r")
        if self.fp == NULL:
            raise FileNotFoundError(2, "No such file or directory: '%s'" % self.filename)
        else:
            # file is now open
            self.is_open = 1
        
    def  read_in_file(self):
        """get the file size"""
        if self.is_open == 1:
            # get the file length
            fseek(self.fp, 0, SEEK_END)
            self.file_size = ftell(self.fp)
            fseek(self.fp, 0, SEEK_SET)
            # allocate memory for reading the file
            self.file_contents = <char*>malloc(self.file_size*sizeof(char))
            # read in the entire file
            fread(self.file_contents, 1, self.file_size, self.fp)
            # close the file once it's read into a char array
            fclose(self.fp)
            # set is open to 0
            self.is_open = 0
    
    def get_columns(self):
        """set up the column names"""
        if self.file_contents != NULL:
            tmp = 0
            while True:
                print(<str>chr(self.file_contents[self.iterator]))
                if <str>chr(self.file_contents[self.iterator]) == "\n":
                    self.num_columns += 1
                    
                    break
                if <str>chr(self.file_contents[self.iterator])== "\0":
                    self.EO_STR = 0 # end of string reached
                    break
                if <str>chr(self.file_contents[self.iterator])== ",":
                    self.num_columns += 1  
                    tmp = self.iterator
                self.iterator += 1
            #print("iterator position: {}".format(self.iterator))
            #print("tmp = {}".format(tmp))
            
            # allocate memory for the column information
            self.column_header = <char*>malloc(self.iterator*sizeof(char))
            strncpy(self.column_header, self.file_contents, self.iterator)
            
            #self.current_char = self.file_contents[0]
            #self.iterator += 1
            #while self.current_char != self.newline:
            #    self.current_char = self.file_contents[self.iterator]
            #    self.iterator += 1
            #self.column_header = strncpy(self.column_header, self.file_contents, self.iterator)
                        
    def __dealloc__(self):
        """Deallocate memory when finished"""
        pass
    
    cdef void reallocate_mem(self, size_t mem):
        """reallocate memory"""
        pass
    

        
       
        

# test the class
emlFile = b"Y:\\Shared\\USD\\Business Data and Analytics\\Claims_Pipeline_Files\\Mapping_Files\\EmlMappingFile.csv"
# create an instance of the tokenizer class
tokenizer = Tokenize(b',', emlFile)
tokenizer.open_file()
tokenizer.read_in_file()
tokenizer.get_columns()
print(tokenizer.file_size)
print(tokenizer.is_open)
print(tokenizer.column_header)
#print(tokenizer.file_contents)

class PyTokenize(Tokenize):
    """
    Python wrapper around the Cython
    Tokenize class.
    """
    def __init__(self):
        super().__init__(b',', emlFile)
       
    
    
    
    
    
    
    
    
    
    

c
l
a
i
m
_
i
d
,
f
i
l
e
s


527391
False
b'claim_id,filess\\Public'


In [41]:
import codecs
import os
emlMappingFile = r'Y:\Shared\USD\Business Data and Analytics\Claims_Pipeline_Files\Mapping_Files\EmlMappingFile.csv'
encodedFile = codecs.encode(emlMappingFile, "utf-8")

def py_read(filename):
    with open(filename, "r") as f:
        return f.read()



pytok = PyTokenize()
pytok.open_file()
%timeit pytok.read_in_file() # 90.3ns = 0.0000903 ms
%timeit py_read(emlFile) # 3.24ms = 3240000 ns
# speed increase:  35880398671.1%!!



90.3 ns ± 3.18 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)
3.24 ms ± 93 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [27]:
import codecs
emlMappingFile = r'Y:\Shared\USD\Business Data and Analytics\Claims_Pipeline_Files\Mapping_Files\EmlMappingFile.csv'
#print(read_file(emlMappingFile.encode('utf-8')))
#os.path.getsize(emlMappingFile)
encodedFile = codecs.encode(emlMappingFile, "utf-8")

%timeit cy_read(encodedFile) # cy_read is much faster than os.path.getsize
%timeit os.path.getsize(encodedFile)

# 186 µs ± 2.04 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)

188 µs ± 3.07 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
656 µs ± 102 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [42]:
import codecs
emlMappingFile = r'Y:\Shared\USD\Business Data and Analytics\Claims_Pipeline_Files\Mapping_Files\EmlMappingFile.csv'
encodedFile = codecs.encode(emlMappingFile, "utf-8")

%timeit cy_file_read(encodedFile)
%timeit py_file_read(encodedFile)

1.62 ms ± 45.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
3.04 ms ± 191 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
