In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the tips dataset from the working directory and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='./tips.csv')
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251


In [3]:
# A bare anonymous function that doubles its argument; it is not bound to a name,
# so the cell only displays the function object itself.
lambda num: num*2

<function __main__.<lambda>(num)>

In [4]:
# Double each total_bill value by applying an anonymous function element-wise.
df['total_bill'].apply(lambda bill: bill * 2)

0      33.98
1      20.68
2      42.02
3      47.36
4      49.18
       ...  
239    58.06
240    54.36
241    45.34
242    35.64
243    37.56
Name: total_bill, Length: 244, dtype: float64

In [5]:
def quality(total_bill, tip, threshold=0.25):
  """Label a tip as generous or stingy relative to the bill.

  Parameters
  ----------
  total_bill : number
    Amount of the bill.
  tip : number
    Amount of the tip.
  threshold : float, optional
    Tip-to-bill ratio that must be strictly exceeded for the tip to
    count as generous. Defaults to 0.25.

  Returns
  -------
  str
    'Generous' when tip / total_bill > threshold, otherwise
    'Stingy person'. A zero bill is always labelled 'Stingy person'.
  """
  # A zero bill has no defined tip ratio; return the default label instead of
  # raising ZeroDivisionError (or producing inf/NaN with NumPy scalars).
  if total_bill == 0:
    return 'Stingy person'
  if tip / total_bill > threshold:
    return 'Generous'
  return 'Stingy person'

In [6]:
# Quick sanity check: a 5.00 tip on a 16.99 bill is well above the 25% cut-off.
quality(total_bill=16.99, tip=5)

'Generous'

## Tomando 2 columnas del DataFrame para usarlas con el método `apply` de pandas

In [7]:
# Select only the two columns the function needs, then call quality once per row:
# axis=1 passes each row to the lambda as a Series indexed by column name.
df['Quality'] = df[['total_bill', 'tip']].apply(
    lambda row: quality(row['total_bill'], row['tip']),
    axis=1,
)
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,Quality
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959,Stingy person
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608,Stingy person
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458,Stingy person
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260,Stingy person
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251,Stingy person


## Tomando 2 columnas, pero esta vez con NumPy

In [8]:
# np.vectorize wraps quality so it can be called with whole columns; the wrapped
# function is evaluated for each (total_bill, tip) pair and the labels are
# stored in the new 'Quality' column.
df['Quality'] = np.vectorize(quality)(
    df['total_bill'],
    df['tip'],
)
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,Quality
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959,Stingy person
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608,Stingy person
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458,Stingy person
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260,Stingy person
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251,Stingy person


## Haciendo un benchmark para ver si `apply` con lambda o `np.vectorize` es más eficiente

In [9]:
import timeit

In [10]:
# Setup code for timeit: it runs once before the timed statements, so loading
# the CSV and defining quality are excluded from the measurement.
# The labels and file path match the ones used in the rest of the notebook.
setup = '''
import numpy as np
import pandas as pd
df = pd.read_csv('./tips.csv')
def quality(total_bill, tip):
  if tip / total_bill > 0.25:
    return "Generous"
  else:
    return "Stingy person"
'''

In [11]:
# Timed statement 1: label every row with DataFrame.apply(axis=1), calling
# quality through a lambda. Kept as a string so timeit can execute it repeatedly.
stmt_one = '''
df['Quality'] =  df[['total_bill', 'tip']].apply(lambda df: quality(df['total_bill'], df['tip']), axis=1)
'''

In [12]:
# Timed statement 2: label every row by calling the np.vectorize-wrapped quality
# on the two columns. Kept as a string so timeit can execute it repeatedly.
stmt_two = '''
df['Quality'] =  np.vectorize(quality)(df['total_bill'], df['tip'])
'''

In [13]:
# Run the benchmark to see which version is faster (1000 executions each).
# Total seconds for 1000 executions of the DataFrame.apply version.
timeit.timeit(stmt=stmt_one, setup=setup, number=1000)

1.6839781760008918

In [14]:
# Total seconds for 1000 executions of the np.vectorize version; in the recorded
# run this was considerably faster than the DataFrame.apply version.
timeit.timeit(stmt=stmt_two, setup=setup, number=1000)

0.13038935100121307