In [2]:
import pandas as pd
import numpy as np
import re
# ^^^ pyforest auto-imports - don't write above this line
import tqdm
import numpy as np
import pandas as pd


# Functional Paradigm Intro

What other paradigms have we experienced?

> <b> Procedural Programming </b>
- Instructions are procedures.
- Side effects are its core.

> <b> Object-Oriented Programming </b>
- Instructions are grouped as part of a state of an object.

> <b> Functional Programming </b>
- No state exists. Just a series of functions being evaluated.
- No side effects.
- The solution obtained is entirely based on the input. Like in math where <code>f(x) = y</code>
- This idea leads to the fact that you can also <b>pass functions as arguments</b>. And this helps a lot.


In [1]:
def add_one(x):
    """Return ``x`` incremented by one."""
    incremented = x + 1
    return incremented

In [2]:
x = 2

In [3]:
add_one(5)

6

In [5]:
f = add_one

In [7]:
f(7)

8

In [8]:
# functions can be thought of as variables as well (!)
# add_one is just a name

f = add_one

In [9]:
# now f receives add_one 

f(10)

11

In [10]:
def add_two(x):
    """Return ``x`` incremented by two."""
    incremented = x + 2
    return incremented

In [12]:
add_two(5)

7

In [13]:
# so, if it can be thought of as a variable,
# can it be passed as an argument like any other variable? YES!

def add_any(f, x):
    """Call ``f`` with ``x`` and return whatever ``f`` returns.

    Demonstrates that a function can be passed around as an argument
    just like any other value.
    """
    result = f(x)
    return result

In [17]:
add_any(add_two, 5)

7

# Function definition

```python
def function_name(arg1):
    something = arg1 + 10
    return something
```

# Mapping concept

In [18]:
# Simple list 
example_list = [10, 12, 34, 23, 2, 6, 7]

In [19]:
# define a function that performs any operation: 

def half(x):
    """Return ``x`` divided by two (true division, so an int input yields a float)."""
    halved = x / 2
    return halved

In [20]:
half(10)

5.0

## How to apply that function to all elements of this list?

In [21]:
# You can't simply pass the whole list: `/` is not defined between a list and an int.
# The expected error is caught and printed so that "Run All" keeps going past this cell.
try:
    half(example_list)
except TypeError as error:
    print(f'TypeError: {error}')

TypeError: unsupported operand type(s) for /: 'list' and 'int'

In [22]:
example_list

[10, 12, 34, 23, 2, 6, 7]

In [23]:
# using a for loop: call half() on each element and collect the results
# in a new list (example_list itself is left untouched)
new_list = []

for item in example_list:
    new_list.append(half(item))
    
new_list 

[5.0, 6.0, 17.0, 11.5, 1.0, 3.0, 3.5]

In [24]:
# using list comprehensions
[half(item) for item in example_list]

[5.0, 6.0, 17.0, 11.5, 1.0, 3.0, 3.5]

In [25]:
example_list

[10, 12, 34, 23, 2, 6, 7]

In [26]:
# using mapping:

map(half, example_list)

# what it does when you map a function onto a list is the below: 

# [half(10), half(12), half(34), half(23), half(2), half(6), half(7)]

<map at 0x246898f5e80>

`map` is `lazy`. When you run `map(function, my_list)`, nothing is executed yet; it only stores what it needs in order to compute. The results are produced only when you consume it, for example by looping over it or wrapping it in `list(...)`.

In [27]:
list(map(half, example_list))

[5.0, 6.0, 17.0, 11.5, 1.0, 3.0, 3.5]

# Lazy evaluation

Functional programming allows the idea of not computing all the results at once.

These functions return only a `python object` (an iterator). Nothing has been calculated yet. As soon as you ask for the results, they are computed.

In [28]:
map(half, example_list)

<map at 0x246898c8a58>

In [29]:
list(map(half, example_list))

[5.0, 6.0, 17.0, 11.5, 1.0, 3.0, 3.5]

In [30]:
for item in map(half, example_list):
    print(item)

5.0
6.0
17.0
11.5
1.0
3.0
3.5


In [32]:
tuple(map(half, example_list))

(5.0, 6.0, 17.0, 11.5, 1.0, 3.0, 3.5)

In [39]:
import re

def correct_andre(x):
    """Return ``x`` with every accepted spelling of the name rewritten as 'Andre Aguiar'.

    Accepted spellings: first letter of each word in upper or lower case, and
    the first name ending in 'e', 'é', 'É' or 'E'. Anything else is left as is.
    """
    name_pattern = re.compile('[Aa]ndr[eéÉE] [Aa]guiar')
    return name_pattern.sub('Andre Aguiar', x)
    

In [34]:
correct_andre('andré aguiar')

'Andre Aguiar'

In [35]:
my_list = ['Andre Aguiar','andre aguiar','andré aguiar','andrÉ Aguiar']

In [37]:
map(correct_andre, my_list)

<map at 0x24689925438>

In [40]:
list(map(correct_andre, my_list))

['Andre Aguiar', 'Andre Aguiar', 'Andre Aguiar', 'Andre Aguiar']

# Filter

`filter` helps remove elements from a list (or any iterable, i.e. anything you can loop over) by passing a function that returns `True` or `False`. Like `map`, `filter` returns a lazy `python object`; when you ask it for the results, it filters out every item for which your function returned `False`.

In [None]:
# Calling filter() with no arguments raises TypeError (it needs a function and an
# iterable), which would stop "Run All". Show the builtin's documentation instead.
print(filter.__doc__)

In [41]:
def check_if_even(x):
    """Return True if x is even, else return False."""
    remainder = x % 2
    return remainder == 0

In [44]:
example_list

[10, 12, 34, 23, 2, 6, 7]

In [45]:
filter(check_if_even, example_list)

<filter at 0x24689943f60>

In [46]:
list(filter(check_if_even, example_list))

[10, 12, 34, 2, 6]

In [47]:
list(map(check_if_even, example_list))

[True, True, True, False, True, True, False]

In [48]:
[item for item in example_list if check_if_even(item)]

[10, 12, 34, 2, 6]

# Reduce

Reduce brings the idea of an `accumulator`. Imagine you have a function that performs a `sum` of a pair of arguments. `reduce` (from the library `functools`) treats the first argument of your function as the `accumulator` and runs through your iterable, repeatedly applying your function to the accumulator and the next item.

For example, for the list `[1,4,6,8]`

If you perform the following function:
```python
def sum_two_elements(a,b):
    return a+b
```

as 
```python
reduce( sum_two_elements, [1,4,6,8] )
```

The steps it will perform are:
```python
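# no initial value is passed, so the first element (1) seeds the accumulator
# (reduce(sum_two_elements, [1,4,6,8], 0) would instead start from a = 0, b = 1)

a = 1 # accumulator
b = 4 # value
a + b = 5 # so the accumulator receives this cumulative sum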
...
a = 5 # accumulator
b = 6 # value 
a + b = 11
...
a = 11 # accumulator
b = 8 # value
a + b = 19

return 19
```

In [49]:
from functools import reduce

In [50]:
def sum_two_elements(a, b):
    """Print both operands, then return ``a + b``.

    The print makes every step taken by ``reduce`` visible: ``a`` is the
    accumulated value so far and ``b`` is the next item.
    """
    print(f'a = {a}, b={b}')
    total = a + b
    return total

In [51]:
reduce( sum_two_elements, [1,4,6,8])

a = 1, b=4
a = 5, b=6
a = 11, b=8


19

In [52]:
reduce( sum_two_elements, ['Andre ','Ribeiro ', 'de ', 'Barros ', 'Aguiar'])

a = Andre , b=Ribeiro 
a = Andre Ribeiro , b=de 
a = Andre Ribeiro de , b=Barros 
a = Andre Ribeiro de Barros , b=Aguiar


'Andre Ribeiro de Barros Aguiar'

In [53]:
''.join(['Andre ','Ribeiro ', 'de ', 'Barros ', 'Aguiar'])

'Andre Ribeiro de Barros Aguiar'

In [54]:
def my_sum(acc, value):
    """Reducer that keeps adding only while the accumulator is even.

    Prints ``acc`` and ``value`` on every call. If ``acc`` is odd it is
    returned unchanged; otherwise ``acc + value`` is returned.
    """
    print(acc, value)
    if acc % 2 != 0:
        # accumulator already odd: ignore the incoming value
        return acc
    return acc + value

In [55]:
example_list

[10, 12, 34, 23, 2, 6, 7]

In [56]:
# keep summing until the running total becomes odd
reduce(my_sum, example_list)

10 12
22 34
56 23
79 2
79 6
79 7


79

# Mapping on Pandas

> <code> df['col_name'].apply() </code>

In [84]:
n = 100

In [85]:
df = pd.DataFrame(np.random.random(n), columns=['number'])

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [86]:
df

Unnamed: 0,number
0,0.869719
1,0.238852
2,0.738304
3,0.456564
4,0.394341
...,...
95,0.964062
96,0.531452
97,0.125024
98,0.863772


In [87]:
def greater_than_half(x):
    """Return 'ANDRE' when ``x`` is strictly greater than 0.5, otherwise 'DIEGO'."""
    return 'ANDRE' if x > 0.5 else 'DIEGO'

In [88]:
# Passing the whole Series does not work: `x > 0.5` yields a Series of booleans,
# which `if` cannot reduce to a single True/False.
# The expected error is caught and printed so that "Run All" keeps going past this cell.
try:
    greater_than_half(df['number'])
except ValueError as error:
    print(f'ValueError: {error}')

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [89]:
# Manual alternative: visit every row label, read its 'number' value with .loc,
# classify it with greater_than_half and collect the labels in a plain list.
new_list = []

# df.shape[0] is the number of rows; i is used as the row label passed to .loc
for i in range(df.shape[0]):
    new_list.append(greater_than_half(df.loc[i, 'number']))

In [95]:
df['number']

0     0.869719
1     0.238852
2     0.738304
3     0.456564
4     0.394341
        ...   
95    0.964062
96    0.531452
97    0.125024
98    0.863772
99    0.192944
Name: number, Length: 100, dtype: float64

In [94]:
df['number'].map(greater_than_half)

0     ANDRE
1     DIEGO
2     ANDRE
3     DIEGO
4     DIEGO
      ...  
95    ANDRE
96    ANDRE
97    DIEGO
98    ANDRE
99    DIEGO
Name: number, Length: 100, dtype: object

In [96]:
df['number'].apply(greater_than_half)

0     ANDRE
1     DIEGO
2     ANDRE
3     DIEGO
4     DIEGO
      ...  
95    ANDRE
96    ANDRE
97    DIEGO
98    ANDRE
99    DIEGO
Name: number, Length: 100, dtype: object

> Pandas Series have both `map` and `apply`. The most used, though, is the `apply` method. 

In [97]:
df['is_greater_than_half'] = df['number'].apply(greater_than_half)

In [98]:
df

Unnamed: 0,number,is_greater_than_half
0,0.869719,ANDRE
1,0.238852,DIEGO
2,0.738304,ANDRE
3,0.456564,DIEGO
4,0.394341,DIEGO
...,...,...
95,0.964062,ANDRE
96,0.531452,ANDRE
97,0.125024,DIEGO
98,0.863772,ANDRE


---

In [99]:
import re

In [108]:
names = ['andre', 'Andre', 'André','ANDRE','ANDRÉ', 'Joao', 'João','Carlos', 'Maria', 'Jose']
df = pd.DataFrame(np.random.choice(names, n), columns=['names'])
df

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,names
0,ANDRÉ
1,ANDRE
2,ANDRÉ
3,ANDRÉ
4,André
...,...
95,Carlos
96,Jose
97,Carlos
98,Maria


In [109]:
df['names'].value_counts()

Maria     15
ANDRE     12
Joao      12
Carlos    11
João      10
ANDRÉ     10
andre      9
Jose       8
André      7
Andre      6
Name: names, dtype: int64

In [110]:
## task: replace all occurrences of my name with Andre

In [116]:
def change_names(name):
    """Normalise the spelling variants of two names.

    ``name`` is first transliterated with ``unidecode`` and lower-cased.
    If the result contains 'andre' it is rewritten as 'Andre'; otherwise, if it
    contains 'joao' it is rewritten as 'João'. Any other value is returned in
    its transliterated, lower-cased form.
    """
    import unidecode

    normalized = unidecode.unidecode(name).lower()

    # (pattern, canonical spelling) pairs, checked in order; the first match wins
    replacements = (
        ('andr[eé]', 'Andre'),
        ('jo[ãa]o', 'João'),
    )
    for pattern, canonical in replacements:
        if re.search(pattern, normalized):
            return re.sub(pattern, canonical, normalized, flags=re.IGNORECASE)

    return normalized

In [117]:
df['names'].apply(change_names)

0      Andre
1      Andre
2      Andre
3      Andre
4      Andre
       ...  
95    carlos
96      jose
97    carlos
98     maria
99      João
Name: names, Length: 100, dtype: object

In [118]:
df['names'] = df['names'].apply(change_names)
df['names']

0      Andre
1      Andre
2      Andre
3      Andre
4      Andre
       ...  
95    carlos
96      jose
97    carlos
98     maria
99      João
Name: names, Length: 100, dtype: object

In [119]:
df['names'].value_counts()

Andre     44
João      22
maria     15
carlos    11
jose       8
Name: names, dtype: int64

# Apply functions with arguments.

In [120]:
def my_replace(x, index):
    """
    Turn the underscores of ``x`` into spaces, split on whitespace and
    return the piece at position ``index``.

    For values shaped like 'Andre_LT':
    index = 0 returns the name
    index = 1 returns the profession
    """
    pieces = x.replace('_', ' ').split()
    return pieces[index]

In [121]:
example_df = pd.DataFrame({'names': ['Andre_LT','Matheus_TA','Joao_Student','Jose_Student']})

<IPython.core.display.Javascript object>

In [122]:
example_df

Unnamed: 0,names
0,Andre_LT
1,Matheus_TA
2,Joao_Student
3,Jose_Student


In [124]:
my_replace('Matheus_TA', 1)

'TA'

In [None]:
example_df['profissao'] = example_df['names'].apply(my_replace, index=1)
example_df['nome'] = example_df['names'].apply(my_replace, index=0)

# Apply in axis = 1

Whenever you apply a function to a pandas dataframe using `axis=1`, your function receives the rows of the dataframe one at a time, so it can read several columns of the same row.

In [None]:
df = pd.DataFrame()
df['type'] = example_df['names'].apply(my_replace, index=1)
df['name'] = example_df['names'].apply(my_replace, index=0)
df['score'] = [6, 7, 8, 7]

In [None]:
df

In [None]:
def has_passed(row):
    """Return 'pass' or 'fail' for one row, based on its 'type' and 'score'.

    Rows whose 'type' is 'Student' pass with a score above 7; every other
    type passes with a score above 6.
    """
    minimum_score = 7 if row['type'] == 'Student' else 6
    return 'pass' if row['score'] > minimum_score else 'fail'

In [None]:
df.apply(has_passed, axis=1)

In [None]:
import pandas as pd

pd.DataFrame([[1],[2]])

----

# Performance Test

In [127]:
def create_sample_dataframe(n_rows=1000000, n_cols=1, seed=None):
    """
    Create a pandas dataframe containing n_rows rows and n_cols random float
    columns (named column_0, column_1, ...) plus an integer CPF column,
    and mess up with it by turning every value into a string and changing
    the dots (.) by commas (,).

    Parameters
    ----------
    n_rows : int
        Number of rows to generate.
    n_cols : int
        Number of random float columns to generate (the CPF column is extra).
    seed : int, optional
        When given, the values are drawn from a private
        ``np.random.RandomState(seed)`` so the result is reproducible.
        The default (None) keeps drawing from NumPy's global random state.

    Returns
    -------
    pandas.DataFrame
        A dataframe in which every cell is a string containing no dots.
    """
    # np.random (module) and RandomState expose the same randint/random_sample API
    rng = np.random if seed is None else np.random.RandomState(seed)

    cpf = rng.randint(1, 999999999, size=n_rows)
    variables = {f'column_{col_number}': rng.random_sample(n_rows) for col_number in range(n_cols)}
    variables.update({'CPF': cpf})

    def _dot_to_comma(value):
        # str() first so that floats and ints are handled alike
        return str(value).replace('.', ',')

    # DataFrame.applymap is deprecated (pandas >= 2.1); mapping each column with
    # Series.map performs the same element-wise conversion on every pandas version.
    return pd.DataFrame(variables).apply(lambda column: column.map(_dot_to_comma))

df = create_sample_dataframe()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [129]:
df

Unnamed: 0,column_0,CPF
0,018463551197385952,915509537
1,022692152218076,266951655
2,09929964437050065,181059964
3,07772411801646969,526169546
4,029472763406432456,466835610
...,...,...
999995,06080139601167653,473562112
999996,04261518535971709,938844972
999997,05250324280473356,637749423
999998,05430580196907607,546378994


In [126]:
def my_replace(x):
    """Parse a decimal-comma string such as '0,25' into a float.

    Note: this redefines ``my_replace`` and therefore replaces the
    two-argument version defined earlier in this notebook.
    """
    dotted = x.replace(',', '.')
    return float(dotted)

In [130]:
df.column_0

0         0,18463551197385952
1            0,22692152218076
2          0,9929964437050065
3          0,7772411801646969
4         0,29472763406432456
                 ...         
999995     0,6080139601167653
999996     0,4261518535971709
999997     0,5250324280473356
999998     0,5430580196907607
999999     0,9563293394015341
Name: column_0, Length: 1000000, dtype: object

In [131]:
df.column_0.apply(my_replace)

0         0.184636
1         0.226922
2         0.992996
3         0.777241
4         0.294728
            ...   
999995    0.608014
999996    0.426152
999997    0.525032
999998    0.543058
999999    0.956329
Name: column_0, Length: 1000000, dtype: float64

In [132]:
from tqdm.auto import tqdm

# Row-by-row version of the conversion: for every row label, read the string with
# .loc, convert it with my_replace and write the float back with .loc.
# tqdm wraps the range only to display a progress bar.
# NOTE: the recorded run of this cell ended in a KeyboardInterrupt (see its output);
# compare with the .apply / .progress_apply versions of the same conversion.
for i in tqdm(range(df.shape[0])):
    df.loc[i, 'column_0'] = my_replace(df.loc[i, 'column_0'])

HBox(children=(FloatProgress(value=0.0, max=1000000.0), HTML(value='')))




KeyboardInterrupt: 

In [133]:
df = create_sample_dataframe()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [135]:
from tqdm.auto import tqdm
tqdm.pandas(desc="Applying transformation")
# now you can use the method .progress_apply (a progress-bar version of .apply)

df.column_0.progress_apply(my_replace)

HBox(children=(FloatProgress(value=0.0, description='Applying transformation', max=1000000.0, style=ProgressSt…




0         0.940297
1         0.705018
2         0.926407
3         0.694293
4         0.256158
            ...   
999995    0.953718
999996    0.696772
999997    0.309967
999998    0.247160
999999    0.121357
Name: column_0, Length: 1000000, dtype: float64