In [1]:
%load_ext Cython
import numpy as np




# cdef & cpdef

In [2]:
def function(x, y):
    return x**y

In [3]:
%timeit function(5, 6)

The slowest run took 11.16 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 5: 311 ns per loop


In [4]:
%%cython

cdef cdef_funtion(x, y):
    return x**y



In [5]:
cdef_funtion(5, 6)

NameError: ignored

Cdef functions are not callable from non cython cells.

In [6]:
%%cython

cpdef cpdef_function(x,y):
    return x**y

In [7]:
%timeit cpdef_function(5,6)

The slowest run took 17.24 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 5: 293 ns per loop


Not really faster. So introduce datatypes!

In [8]:
%%cython

cpdef int cpdef_dtypes_function(int x, int y):
    return x**y

In [9]:
%timeit cpdef_dtypes_function(5,6)

The slowest run took 26.16 times longer than the fastest. This could mean that an intermediate result is being cached.
10000000 loops, best of 5: 106 ns per loop


# Cython and Numpy Arrays

In [10]:
%%cython
import numpy as np

arr = np.arange(9, dtype=np.dtype("i")).reshape((3, 3))
print(arr)

cdef int [:, :] arr_view = arr
print(np.asarray(arr_view))

[[0 1 2]
 [3 4 5]
 [6 7 8]]
[[0 1 2]
 [3 4 5]
 [6 7 8]]


In [11]:
%%cython 
import numpy as np

cdef int [:, :] arr_view = np.arange(9, dtype=np.dtype("i")).reshape((3, 3))
print(np.asarray(arr_view))
print(np.asarray(arr_view[1,1]))
arr_view[1,1] = 10
print(np.asarray(arr_view[1,1]))
arr_view[:,:] = 5
print(np.asarray(arr_view))


[[0 1 2]
 [3 4 5]
 [6 7 8]]
4
10
[[5 5 5]
 [5 5 5]
 [5 5 5]]


## Looping over arrays

In [12]:
x = np.arange(100, dtype=np.dtype("i")).reshape((10, 10))
y = np.full((10,10), 2, dtype=np.dtype("i"))

In [13]:
def array_op(x,y):
    result = np.zeros(x.shape)
    for i in range(x.shape[0]):
        for j in range(x.shape[1]):
            result[i,j] = x[i,j] * y[i,j] + i
    return result
    

In [14]:
%timeit result = array_op(x,y)

1000 loops, best of 5: 303 µs per loop


In [15]:
%%cython 
import numpy as np

cpdef cdef_array_op(x, y):
    result = np.zeros(x.shape)
    for i in range(x.shape[0]):
        for j in range(x.shape[1]):
            result[i,j] = x[i,j] * y[i,j] + i
    return result

In [16]:
%timeit cdef_array_op(x,y)

The slowest run took 4.85 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 5: 281 µs per loop


In [17]:
%%cython 
import numpy as np

cpdef int[:,:] typed_array_op(int[:,:] x, int[:,:] y):
    cdef int[:,:] result = np.zeros((x.shape[0], x.shape[1]), dtype = np.dtype("i"))
    for i in range(x.shape[0]):
        for j in range(x.shape[1]):
            result[i,j] = x[i,j] * y[i,j] + i
    return result



In [18]:
%timeit typed_array_op(x,y)

The slowest run took 11.11 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 5: 5.72 µs per loop


In [19]:
%%cython 
import numpy as np
cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef int[:,:] typed_unsafe_array_op(int[:,:] x, int[:,:] y):
    cdef int[:,:] result = np.zeros((x.shape[0], x.shape[1]), dtype = np.dtype("i"))
    for i in range(x.shape[0]):
        for j in range(x.shape[1]):
            result[i,j] = x[i,j] * y[i,j] + i
    return result


In [20]:
%timeit typed_unsafe_array_op(x,y)

The slowest run took 11.65 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 5: 5.51 µs per loop


# Cython and Pandas

We downloaden eerst wat data...

In [21]:
!curl -L -c cookies.txt 'https://docs.google.com/uc?export=download&id=151gCztjHR_D2uIoebxfi52DZWGLabOQd' | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1/p' > confirm.txt    
!curl -L -b cookies.txt -o 'weatherAUS.zip' 'https://docs.google.com/uc?export=download&id=151gCztjHR_D2uIoebxfi52DZWGLabOQd&confirm='$(<confirm.txt)
!unzip weatherAUS.zip
!rm -f confirm.txt cookies.txt weatherAUS.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   388    0   388    0     0    177      0 --:--:--  0:00:02 --:--:--   177
100 3781k    0 3781k    0     0  1504k      0 --:--:--  0:00:02 --:--:-- 1504k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   408    0   408    0     0    173      0 --:--:--  0:00:02 --:--:--   173
  0     0    0     0    0     0      0      0 --:--:--  0:00:02 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:02 --:--:--     0
100 3781k    0 3781k    0     0  1493k      0 --:--:--  0:00:02 --:--:-- 1493k
Archive:  weatherAUS.zip
  inflating: weatherAUS.csv          


In [22]:
import pandas as pd

In [23]:
df = pd.read_csv('weatherAUS.csv')

In [25]:
df.head()
# TODO: NaN's aanvullen

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,WNW,20.0,24.0,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,WSW,4.0,22.0,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,WSW,19.0,26.0,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,E,11.0,9.0,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,NW,7.0,20.0,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [43]:
def average_value(values):
  return np.mean(values)


In [65]:
%timeit df.head(10).apply(lambda x: average_value(x[11:13]), axis=1)


100 loops, best of 5: 3.02 ms per loop


In [66]:
%%cython
import numpy as np

cpdef c_average_value(values):
  return np.mean(values)


In [67]:
%timeit df.head(10).apply(lambda x: c_average_value(x[11:13]), axis=1)

100 loops, best of 5: 3.48 ms per loop


In [68]:
%%cython
import numpy as np

cpdef c_average_value_typed(int[:] values):
  return np.mean(values)

In [69]:
%timeit df.head(10).apply(lambda x: c_average_value_typed(x[11:13].to_numpy(dtype=np.dtype('i'))), axis=1)

100 loops, best of 5: 1.52 ms per loop


In [None]:
# TODO: Met for loop ipv .apply
