<a href="https://colab.research.google.com/github/BaseKan/aiday_training_resources/blob/harvest-talent-presents/Cython/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
%load_ext Cython
from math import sin,tan,cos

import numpy as np
import pandas
import pandas as pd

# Primes

In [None]:
def primes(nb_primes):
  """Return the first `nb_primes` prime numbers as a list, in ascending order.

  Each candidate is tested by trial division against every prime found so
  far. Asking for zero (or fewer) primes yields an empty list.
  """
  found = []
  candidate = 2
  while len(found) < nb_primes:
    # Does any already-known prime divide the candidate?
    has_divisor = False
    for known in found:
      if candidate % known == 0:
        has_divisor = True
        break

    # No divisor among the smaller primes: the candidate is prime.
    if not has_divisor:
      found.append(candidate)
    candidate += 1
  return found

In [None]:
# Quick check: display the first 10 primes.
primes(10)

In [None]:
# Baseline: time the pure-Python implementation for 1000 primes.
%timeit primes(1000)

In [None]:
%%cython
def primes_compiled(nb_primes):
  """Return the first `nb_primes` prime numbers as a list, in ascending order.

  Same trial-division algorithm as the pure-Python version, without any type
  declarations; only the compilation step differs.
  """
  found = []
  candidate = 2
  while len(found) < nb_primes:
    # Does any already-known prime divide the candidate?
    has_divisor = False
    for known in found:
      if candidate % known == 0:
        has_divisor = True
        break

    # No divisor among the smaller primes: the candidate is prime.
    if not has_divisor:
      found.append(candidate)
    candidate += 1
  return found

In [None]:
# Time the Cython-compiled (but untyped) version for comparison.
%timeit primes_compiled(1000)

In [None]:
%%cython --annotate
def primes_compiled(nb_primes):
  """Return the first `nb_primes` prime numbers as a list, in ascending order.

  Untyped trial division; this cell is compiled with --annotate so the
  generated report can be inspected.
  """
  found = []
  candidate = 2
  while len(found) < nb_primes:
    # Does any already-known prime divide the candidate?
    has_divisor = False
    for known in found:
      if candidate % known == 0:
        has_divisor = True
        break

    # No divisor among the smaller primes: the candidate is prime.
    if not has_divisor:
      found.append(candidate)
    candidate += 1
  return found

In [None]:
%%cython --annotate
def primes_cython(int nb_primes):
  """Return the first `nb_primes` primes as a Python list (at most 1000).

  The primes are collected in a fixed-size C array of 1000 ints, so any
  larger request is silently capped at 1000.
  """
  cdef int candidate, divisor, count
  cdef int found[1000]
  cdef bint has_divisor

  # Never write past the end of the C array.
  if nb_primes > 1000:
    nb_primes = 1000

  count = 0  # Number of slots of `found` that are filled so far.
  candidate = 2
  while count < nb_primes:
    # Does any already-known prime divide the candidate?
    has_divisor = False
    for divisor in found[:count]:
      if candidate % divisor == 0:
        has_divisor = True
        break

    # No divisor among the smaller primes: store the candidate.
    if not has_divisor:
      found[count] = candidate
      count += 1
    candidate += 1

  # Copy the filled part of the C array into a Python list.
  return [prime for prime in found[:count]]

In [None]:
%%cython --annotate
cimport cython

@cython.cdivision(True)
def primes_cython(int nb_primes):
  """Return the first `nb_primes` primes as a Python list (at most 1000).

  The primes are collected in a fixed-size C array of 1000 ints, so any
  larger request is silently capped at 1000. The cdivision(True) directive
  makes `%` use plain C semantics.
  """
  cdef int candidate, divisor, count
  cdef int found[1000]
  cdef bint has_divisor

  # Never write past the end of the C array.
  if nb_primes > 1000:
    nb_primes = 1000

  count = 0  # Number of slots of `found` that are filled so far.
  candidate = 2
  while count < nb_primes:
    # Does any already-known prime divide the candidate?
    has_divisor = False
    for divisor in found[:count]:
      if candidate % divisor == 0:
        has_divisor = True
        break

    # No divisor among the smaller primes: store the candidate.
    if not has_divisor:
      found[count] = candidate
      count += 1
    candidate += 1

  # Copy the filled part of the C array into a Python list.
  return [prime for prime in found[:count]]

In [None]:
# Time the typed Cython implementation.
%timeit primes_cython(1000)

In [None]:
# Re-time the pure-Python baseline for a side-by-side comparison.
%timeit primes(1000)

# Opdracht: np.vectorize naar Cython

De onderstaande code doet een berekening over paren van waardes, afhankelijk van een conditie.

In [None]:
def complicated_calculation(x,y):
  """Evaluate a piecewise trigonometric function for one pair of scalars.

  Args:
    x: First scalar value.
    y: Second scalar value.

  Returns:
    A Python float, selected by the first rule that matches:
      * x > 0.5*y and y < 0.3  ->  sin(x - y)
      * x < 0.5*y              ->  tan(x + y)
      * x > 0.2*y              ->  sin(x) * sin(y)
      * otherwise              ->  cos(x / (0.1 + |y|))
  """
  if x > 0.5*y and y < 0.3:
    return sin(x-y)
  if x < 0.5*y:
    return tan(x+y)
  if x > 0.2*y:
    # Use math.sin for both factors so every branch returns a plain float
    # (mixing in np.sin made this branch return np.float64 instead).
    return sin(x)*sin(y)
  # Only reached on the boundary x == 0.5*y with y <= 0, or when a
  # comparison involves NaN.
  return cos(x/(0.1+abs(y)))

In [None]:
def get_results(x,y):
  """Apply `complicated_calculation` element-wise to `x` and `y`.

  Args:
    x: Array-like of first operands.
    y: Array-like of second operands, broadcast against `x`.

  Returns:
    A float NumPy array holding one result per (x, y) pair.
  """
  # Declaring the output type up front lets np.vectorize accept empty inputs
  # and skips the extra probing call it otherwise makes on the first element
  # to infer the result dtype.
  vectorized = np.vectorize(complicated_calculation, otypes=[float])
  return vectorized(x, y)

In [None]:
# Two independent arrays of one million standard-normal samples each,
# used as benchmark input.
x = np.random.randn(int(1e6))
y = np.random.randn(int(1e6))

In [None]:
# Baseline timing of the np.vectorize version.
%timeit get_results(x, y)

Het kan echter een stuk sneller in Cython. Vul de onderstaande code aan. In plaats van np.vectorize kun je een for loop gebruiken in Cython. Ook kan het een stuk beter door types toe te voegen. 

Een eerste stap is al gemaakt door de sin, cos, tan en abs operaties te vervangen door equivalente operaties in C.

In [None]:
%%cython --annotate
cimport cython
import numpy as np
cimport numpy as np
from libc.math cimport sin, cos, tan, fabs

def complicated_calculation(x,y):
  """Evaluate a piecewise trigonometric function for one pair of scalars.

  The first matching rule decides the result:
    * x > 0.5*y and y < 0.3  ->  sin(x - y)
    * x < 0.5*y              ->  tan(x + y)
    * x > 0.2*y              ->  sin(x) * sin(y)
    * otherwise              ->  cos(x / (0.1 + |y|))
  """
  half_y = 0.5*y
  if x > half_y and y < 0.3:
    return sin(x-y)
  if x < half_y:
    return tan(x+y)
  if x > 0.2*y:
    return sin(x)*sin(y)
  return cos(x/(0.1+fabs(y)))

def c_get_results_fast(x, y):
  # Exercise stub: meant to replace the np.vectorize call with an explicit
  # loop over the (x, y) pairs.
  # NOTE(review): `res` is not assigned anywhere yet; the loop has to produce
  # it before this cell can work.
  # TODO: add a for loop
  return res

In [None]:
# Time the Cython version (c_get_results_fast must be implemented first).
%timeit c_get_results_fast(x, y)

# Opdracht: Pandas

We downloaden eerst wat data. Ook vullen we wat missende waardes in en voegen we een kolom toe.

In [None]:
# Google Drive download in two steps: the first request stores the session
# cookies and extracts the `confirm` token from the response into confirm.txt;
# the second request reuses both to fetch the zip archive.
!curl -L -c cookies.txt 'https://docs.google.com/uc?export=download&id=151gCztjHR_D2uIoebxfi52DZWGLabOQd' | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1/p' > confirm.txt    
!curl -L -b cookies.txt -o 'weatherAUS.zip' 'https://docs.google.com/uc?export=download&id=151gCztjHR_D2uIoebxfi52DZWGLabOQd&confirm='$(<confirm.txt)
# Unpack the archive, then delete the temporary download files.
!unzip weatherAUS.zip
!rm -f confirm.txt cookies.txt weatherAUS.zip

In [None]:
# Load the weather CSV into a DataFrame.
df = pd.read_csv('weatherAUS.csv')

In [None]:
# Fill missing values in every float64 column with that column's mean;
# columns of any other dtype are left untouched.
df = df.apply(lambda x: x.fillna(x.mean()) if x.dtype == 'float64' else x,
              axis=0)
# Parse the Date column into datetimes so the rows can be ordered by date.
df.Date = pd.to_datetime(df.Date)

# Sort by date and renumber the index from 0.
df = df.sort_values('Date').reset_index(drop=True)
df.head()

In [None]:
# New column: per-row mean of MinTemp and MaxTemp.
df['AvgTemp'] = df[['MinTemp', 'MaxTemp']].mean(axis=1)

Zet de volgende Pandas code om naar Cython.

In [None]:
def classify_temperature(df):
  """Label every row's AvgTemp as 'cold', 'average' or 'hot'.

  The bin edges are the minimum, 25th percentile, 75th percentile and maximum
  of the `AvgTemp` column; each interval is closed on the right.

  Args:
    df: DataFrame with a numeric `AvgTemp` column. It is modified in place.

  Returns:
    The same DataFrame, with a categorical `TempType` column added
    (or overwritten).
  """
  # Compute the summary statistics once instead of once per bin edge.
  stats = df['AvgTemp'].describe()
  bins = [stats['min'], stats['25%'], stats['75%'], stats['max']]
  labels = ['cold','average','hot']
  # include_lowest=True keeps the minimum itself inside the first bin;
  # without it the coldest row(s) would get NaN instead of 'cold'.
  df['TempType'] = pd.cut(df['AvgTemp'], bins=bins, labels=labels,
                          include_lowest=True)
  return df

In [None]:
# Baseline timing of the pandas implementation.
%timeit classify_temperature(df)

Een deel is al gedaan voor je.

Hint: In NumPy en pandas is het dtype van een kolom met Python strings `object`.

In [None]:
%%cython
def c_classify_temperature_col(double[:] avg_temp):
  # Exercise skeleton: computes the two thresholds (25th and 75th percentile
  # of `avg_temp`); the classification itself still has to be written.
  #
  # A %%cython cell is compiled as its own module and cannot see names from
  # the notebook namespace, so numpy has to be imported here.
  import numpy as np
  cold = np.quantile(avg_temp, 0.25)
  hot = np.quantile(avg_temp, 0.75)
  # YOUR CODE HERE

def c_classify_temperature(df):
  # Exercise placeholder: the Cython counterpart of classify_temperature(df)
  # is to be written here. The function has no body until that is done.
  # YOUR CODE HERE


# Extra materiaal

Cython documentatie: https://cython.readthedocs.io/en/latest/

Pandas optimalisatie: https://pandas.pydata.org/pandas-docs/stable/user_guide/enhancingperf.html

Just in time compiler: https://numba.pydata.org/

