<a href="https://colab.research.google.com/github/BaseKan/aiday_training_resources/blob/main/Cython/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
%load_ext Cython
import numpy as np
from math import sin,tan,cos

# Fibonacci

In [2]:
%%cython

def c_fib_typed(int n):
  """Return the Fibonacci numbers that are smaller than n as a Python list.

  The series starts at 1, 1, 2, ... and stops before the first value >= n,
  so n <= 1 yields an empty list.
  """
  # a and b are 64-bit on purpose: n is a C int, so the last stored value is
  # below 2**31 and the next sum a + b is below 2**32. With plain `int` that sum
  # overflowed for large n, b became negative, `b < n` stayed true and the loop
  # kept writing past the end of r.
  cdef long long a, b
  cdef int len_r
  # At most 46 Fibonacci numbers fit below the C int maximum, so 1000 slots
  # is always enough now that the loop is guaranteed to terminate.
  cdef int r[1000]
  a = 0
  b = 1
  len_r = 0
  while b < n:
    r[len_r] = <int>b  # safe narrowing: b < n <= INT_MAX
    len_r += 1
    a, b = b, a + b

  # Copy the filled prefix of the C array into a Python list.
  r_as_list = [item for item in r[:len_r]]
  return r_as_list

In [5]:
%%timeit
x = c_fib_typed(2000)

The slowest run took 10.58 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 5: 312 ns per loop


Make it work, make it right, make it fast.

# Cython and Numpy Arrays

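Een *typed memoryview* geeft Cython directe toegang tot de onderliggende buffer van een NumPy-array, zonder de data te kopiëren. Je declareert er een met het elementtype en één `:` per dimensie, bijvoorbeeld `cdef int [:, :] arr_view = arr` voor een tweedimensionale array van C-ints. Met `np.asarray(arr_view)` maak je er weer een NumPy-array van.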


In [None]:
%%cython
import numpy as np

# Build a 3x3 array whose dtype "i" is the C `int` type, so it matches the
# `int` element type of the memoryview declared below.
arr = np.arange(9, dtype=np.dtype("i")).reshape((3, 3))
print(arr)

# Declare a 2-D typed memoryview (one `:` per dimension) over the array's buffer.
cdef int [:, :] arr_view = arr
# np.asarray turns the memoryview back into a NumPy array for printing.
print(np.asarray(arr_view))

In [None]:
%%cython 
import numpy as np

# A typed memoryview can be created directly from a freshly built C-int array.
cdef int [:, :] arr_view = np.arange(9, dtype=np.dtype("i")).reshape((3, 3))
print(np.asarray(arr_view))
# Indexing with one index per dimension reads a single element.
print(np.asarray(arr_view[1,1]))
# Elements can be assigned through the view ...
arr_view[1,1] = 10
print(np.asarray(arr_view[1,1]))
# ... and assigning a scalar to a full slice sets every element to that value.
arr_view[:,:] = 5
print(np.asarray(arr_view))


## Arrays en for loops

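In pure Python is elke iteratie van een for-loop relatief duur, omdat de interpreter bij iedere stap met Python-objecten werkt. Cython kan een loop over `range(...)` naar een gewone C-loop vertalen, maar dat levert pas echt winst op als de types van de gebruikte variabelen bekend zijn. Hieronder voeren we dezelfde bewerking stap voor stap met meer type-informatie uit en meten we telkens het verschil.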

In [None]:
# Both demo inputs share the C `int` dtype so they can be passed to the
# `int[:, :]` typed memoryview arguments used further down.
c_int = np.dtype("i")
x = np.arange(100, dtype=c_int).reshape((10, 10))  # values 0..99, row-major
y = np.full((10, 10), 2, dtype=c_int)              # constant grid of twos

In [None]:
def array_op(x, y):
  """Pure-Python baseline: element-wise ``x * y`` plus the row index.

  Returns a new float array with the shape of ``x`` where entry ``[i, j]``
  equals ``x[i, j] * y[i, j] + i``.
  """
  out = np.zeros(x.shape)
  for row in range(x.shape[0]):
      for col in range(x.shape[1]):
          product = x[row, col] * y[row, col]
          out[row, col] = product + row
  return out
    

In [None]:
%timeit result = array_op(x,y)

In [None]:
%%cython 
import numpy as np

def c_array_op(x, y):
    """Untyped Cython version of the element-wise ``x * y + row index`` loop.

    The body is ordinary Python compiled by Cython without any type
    declarations. Returns a new float array shaped like ``x``.
    """
    out = np.zeros(x.shape)
    for r in range(x.shape[0]):
        for c in range(x.shape[1]):
            scaled = x[r, c] * y[r, c]
            out[r, c] = scaled + r
    return out

In [None]:
%timeit c_array_op(x,y)

In [None]:
%%cython 
import numpy as np

def typed_array_op(int[:,:] x, int[:,:] y):
    """Typed version of the element-wise ``x * y + row index`` loop.

    Parameters are 2-D C-int memoryviews. The result is returned as a C-int
    memoryview with the shape of ``x``; wrap it in ``np.asarray`` to get a
    NumPy array. All arithmetic is done in C ``int``.
    Bounds checking is still enabled here, so a ``y`` that is smaller than
    ``x`` raises IndexError.
    """
    # Declare the loop indices explicitly. Left untyped, `i` takes part in
    # arithmetic (`+ i`), so Cython's safe type inference keeps it a Python
    # object and every iteration goes through the interpreter.
    cdef Py_ssize_t i, j
    cdef int[:,:] result = np.zeros((x.shape[0], x.shape[1]), dtype = np.dtype("i"))
    for i in range(x.shape[0]):
        for j in range(x.shape[1]):
            result[i,j] = x[i,j] * y[i,j] + <int>i
    return result

In [None]:
%timeit typed_array_op(x,y)

In [None]:
%%cython 
import numpy as np
cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
def typed_unsafe_array_op(int[:,:] x, int[:,:] y):
    """Typed ``x * y + row index`` loop with bounds/wraparound checks disabled.

    Parameters are 2-D C-int memoryviews. The result is returned as a C-int
    memoryview with the shape of ``x``; wrap it in ``np.asarray`` to get a
    NumPy array.

    Raises:
        ValueError: if ``y`` has fewer rows or columns than ``x``.
    """
    # With boundscheck off, indexing never raises, so a too-small `y` would be
    # read out of bounds. Validate once up front instead; this costs nothing
    # inside the loop.
    if y.shape[0] < x.shape[0] or y.shape[1] < x.shape[1]:
        raise ValueError("y must be at least as large as x in both dimensions")
    # Typed indices keep the whole loop in C (an untyped `i` used in `+ i`
    # would stay a Python object).
    cdef Py_ssize_t i, j
    cdef int[:,:] result = np.zeros((x.shape[0], x.shape[1]), dtype = np.dtype("i"))
    for i in range(x.shape[0]):
        for j in range(x.shape[1]):
            result[i,j] = x[i,j] * y[i,j] + <int>i
    return result


In [None]:
%timeit typed_unsafe_array_op(x,y)

# Opdracht: np.vectorize naar Cython

De onderstaande code is de oplossing van één van de opdrachten van de Optimalisatie kennissessie. Met np.vectorize is die al een stuk sneller gemaakt. 

In [18]:
def complicated_calculation(x, y):
  """Piecewise scalar function of x and y.

  The branches are tested in order and the first match wins:
    1. x > 0.5*y and y < 0.3  -> sin(x - y)
    2. x < 0.5*y              -> tan(x + y)
    3. x > 0.2*y              -> sin(x) * np.sin(y)
    4. otherwise              -> cos(x / (0.1 + abs(y)))
  """
  half_y = 0.5*y
  if x > half_y and y < 0.3:
      return sin(x-y)
  if x < half_y:
      return tan(x+y)
  if x > 0.2*y:
      return sin(x)*np.sin(y)
  return cos(x/(0.1+abs(y)))

In [19]:
def get_results_fast(x, y):
  """Apply complicated_calculation element-wise to x and y via np.vectorize."""
  vectorized = np.vectorize(complicated_calculation)
  return vectorized(x, y)

In [20]:
# Use a seeded local generator so every run benchmarks the same inputs and the
# timings below are comparable between runs; the global NumPy RNG state is
# left untouched.
rng = np.random.default_rng(42)
x = rng.standard_normal(int(1e6))
y = rng.standard_normal(int(1e6))

In [21]:
%timeit res_fast = get_results_fast(x, y)

1 loop, best of 5: 592 ms per loop


Het kan echter nog een stuk sneller in Cython. Vul de onderstaande code aan. In plaats van np.vectorize kun je een for loop gebruiken in Cython. Ook kan het een stuk beter door types toe te voegen. 

Een eerste stap is al gemaakt door de sin, cos, tan en abs operaties te vervangen door equivalente operaties in C.

In [None]:
%%cython --annotate
cimport cython
import numpy as np
cimport numpy as np
from libc.math cimport sin, cos, tan, fabs

def complicated_calculation(x, y):
  """Piecewise scalar function of x and y using the C math functions.

  The branches are tested in order and the first match wins:
    1. x > 0.5*y and y < 0.3  -> sin(x - y)
    2. x < 0.5*y              -> tan(x + y)
    3. x > 0.2*y              -> sin(x) * sin(y)
    4. otherwise              -> cos(x / (0.1 + fabs(y)))
  """
  half_y = 0.5*y
  if x > half_y and y < 0.3:
      return sin(x-y)
  if x < half_y:
      return tan(x+y)
  if x > 0.2*y:
      return sin(x)*sin(y)
  return cos(x/(0.1+fabs(y)))

def c_get_results_fast(x, y):
  """Exercise stub: Cython replacement for the np.vectorize-based version.

  Presumably meant to loop over x and y and collect the per-element results
  in `res` (see the exercise text in the notebook) - to be filled in.
  """
  # TODO: add the for loop
  # NOTE(review): `res` is not assigned anywhere in this stub yet, so the
  # function cannot work until the loop above is written.
  return res

In [None]:
%timeit res_fast = c_get_results_fast(x, y)

# Cython and Pandas

We downloaden eerst wat data...

In [7]:
# Step 1: request the file once, store the cookies, and extract the value of the
# `confirm=` parameter from the response into confirm.txt.
!curl -L -c cookies.txt 'https://docs.google.com/uc?export=download&id=151gCztjHR_D2uIoebxfi52DZWGLabOQd' | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1/p' > confirm.txt
# Step 2: repeat the request with the saved cookies and the confirm token to
# download the actual archive.
!curl -L -b cookies.txt -o 'weatherAUS.zip' 'https://docs.google.com/uc?export=download&id=151gCztjHR_D2uIoebxfi52DZWGLabOQd&confirm='$(<confirm.txt)
# -o overwrites an existing weatherAUS.csv without asking; without it unzip
# stops at an interactive "replace?" prompt when the cell is re-run.
!unzip -o weatherAUS.zip
# Remove the temporary download artefacts; only the extracted CSV is kept.
!rm -f confirm.txt cookies.txt weatherAUS.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   388    0   388    0     0    162      0 --:--:--  0:00:02 --:--:--   162
100 3781k    0 3781k    0     0  1352k      0 --:--:--  0:00:02 --:--:-- 54.7M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   408    0   408    0     0    241      0 --:--:--  0:00:01 --:--:--   241
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
100 3781k    0 3781k    0     0  1879k      0 --:--:--  0:00:02 --:--:-- 1879k
Archive:  weatherAUS.zip
replace weatherAUS.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: weatherAUS.csv          


In [8]:
import pandas as pd

In [30]:
# Load the weather data extracted by the download cell into a DataFrame.
df = pd.read_csv('weatherAUS.csv')

In [31]:
def _fill_float_gaps(column):
  """Return `column` with missing values replaced by its mean if it is float64, else unchanged."""
  if column.dtype == 'float64':
    return column.fillna(column.mean())
  return column

# Impute the float columns column by column, leaving all other dtypes untouched.
df = df.apply(_fill_float_gaps, axis=0)
# Parse the dates so the frame can be ordered chronologically.
df['Date'] = pd.to_datetime(df['Date'])

df = df.sort_values('Date').reset_index(drop=True)
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2007-11-01,Canberra,8.0,24.3,0.0,3.4,6.3,NW,30.0,SW,NW,6.0,20.0,68.0,29.0,1019.7,1015.0,7.0,7.0,14.4,23.6,No,Yes
1,2007-11-02,Canberra,14.0,26.9,3.6,4.4,9.7,ENE,39.0,E,W,4.0,17.0,80.0,36.0,1012.4,1008.4,5.0,3.0,17.5,25.7,Yes,Yes
2,2007-11-03,Canberra,13.7,23.4,3.6,5.8,3.3,NW,85.0,N,NNE,6.0,6.0,82.0,69.0,1009.5,1007.2,8.0,7.0,15.4,20.2,Yes,Yes
3,2007-11-04,Canberra,13.3,15.5,39.8,7.2,9.1,NW,54.0,WNW,W,30.0,24.0,62.0,56.0,1005.5,1007.0,2.0,7.0,13.5,14.1,Yes,Yes
4,2007-11-05,Canberra,7.6,16.1,2.8,5.6,10.6,SSE,50.0,SSE,ESE,20.0,28.0,68.0,49.0,1018.3,1018.5,7.0,7.0,11.1,15.4,Yes,No


In [11]:
def average_value(values):
  """Return the arithmetic mean of `values`, as computed by np.mean."""
  mean = np.mean(values)
  return mean


In [12]:
%timeit df.head(100).apply(lambda x: average_value(x[11:13]), axis=1)


10 loops, best of 5: 23.8 ms per loop


In [18]:
%%cython
import numpy as np

def c_average_value(values):
  """Untyped Cython version: return the mean of `values` via np.mean."""
  mean = np.mean(values)
  return mean


In [19]:
%timeit df.head(100).apply(lambda x: c_average_value(x[11:13]), axis=1)

10 loops, best of 5: 19.2 ms per loop


In [20]:
%%cython
import numpy as np
cimport numpy as np

def c_average_value_typed(double[:] values):
  """Return the mean of a 1-D double memoryview, still computed by np.mean."""
  mean = np.mean(values)
  return mean

In [21]:
%timeit df.head(100).apply(lambda x: c_average_value_typed(x[11:13].to_numpy(dtype=np.dtype('d'))), axis=1)

100 loops, best of 5: 7.43 ms per loop


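Met `%%cython --annotate` laat Cython per regel zien hoeveel interactie met de Python-interpreter nodig is: hoe geler een regel, hoe meer Python-overhead. Zo zie je snel welke regels nog niet naar pure C worden vertaald.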

In [22]:
%%cython --annotate
cimport cython
cimport numpy as np
import numpy as np

# C-level helper: mean of one row, still delegated to np.mean.
# `except? -1` declares an error return value so that an exception raised
# inside np.mean propagates to the caller. Without it, older Cython versions
# print the error and carry on with a bogus result. -1 is only a hint: Cython
# additionally checks whether an exception is actually set, so a genuine mean
# of -1 is still returned normally.
cdef double c_average_value_typed(double[:] values) except? -1:
  return np.mean(values)

def c_apply_average_value(double[:,:] df_cols):
  """Return the mean of every row of `df_cols`.

  `df_cols` is a 2-D double memoryview. The result is a 1-D double memoryview
  with one entry per row; wrap it in ``np.asarray`` to get a NumPy array.
  """
  # Explicit C types for the row count and the loop index keep the loop in C.
  cdef Py_ssize_t i
  cdef Py_ssize_t n = df_cols.shape[0]
  cdef double[:] res = np.empty(n, dtype=np.dtype('d'))
  for i in range(n):
    # df_cols[i] is the i-th row as a 1-D memoryview.
    res[i] = c_average_value_typed(df_cols[i])

  return res

In [23]:
%timeit c_apply_average_value(df.head(100)[df.columns[11:13]].to_numpy(dtype=np.dtype('d'))) 

1000 loops, best of 5: 1.76 ms per loop


# Opdracht: classify_temperature naar Cython

Zet de snelle code van Lisa in Cython.

In [35]:
# Add an AvgTemp column holding the row-wise mean of MinTemp and MaxTemp.
df['AvgTemp'] = df[['MinTemp', 'MaxTemp']].mean(axis=1)

In [36]:
def classify_temperature(df):
  """Label every row as 'cold', 'average' or 'hot' based on its AvgTemp.

  The bin edges are the minimum, 25% quantile, 75% quantile and maximum of
  df['AvgTemp']. The labels are written to a new 'TempType' column.

  Note: `df` is modified in place and also returned.
  """
  # describe() is computed once instead of once per bin edge.
  stats = df['AvgTemp'].describe()
  bins = [stats['min'], stats['25%'], stats['75%'], stats['max']]
  labels = ['cold', 'average', 'hot']
  # pd.cut bins are right-closed, so without include_lowest=True the first bin
  # is (min, q25] and the row(s) holding the minimum temperature would get NaN
  # instead of 'cold'.
  df['TempType'] = pd.cut(df['AvgTemp'], bins=bins, labels=labels, include_lowest=True)
  return df

In [37]:
%%timeit
classify_temperature(df)

10 loops, best of 5: 31.3 ms per loop


Een deel is al gedaan voor je.

Hint: Het type van een python string is `object`.

In [None]:
%%cython
# A %%cython cell is compiled as its own module, so it needs its own NumPy
# import; the notebook-level `np` is not visible in here.
import numpy as np

def c_classify_temperature(double[:] avg_temp):
  """Exercise stub: Cython version of classify_temperature.

  `avg_temp` is a 1-D double memoryview of average temperatures. The 25% and
  75% quantiles are already computed as the `cold` and `hot` thresholds; the
  classification itself still has to be written.
  """
  cold = np.quantile(avg_temp, 0.25)
  hot = np.quantile(avg_temp, 0.75)
  # YOUR CODE HERE
  