In [1]:
# 自定义函数学习目标
# 1、掌握apply的用法
# 2、知道如何创建向量化函数

In [25]:
# 导入Pandas包
import pandas as pd

# 导入numpy包
import numpy as np

In [4]:
# Build a small two-column demo frame used by the apply examples below.
data = dict(a=[10, 20, 30], b=[20, 30, 40])
df = pd.DataFrame(data)
df

In [5]:
# A custom function that takes a single argument.
def my_sq(x):
    """
    Raise a value to the second power.

    :param x: value to square; anything that supports the ``**`` operator
    :return: ``x`` squared
    """
    squared = pow(x, 2)
    return squared


In [8]:
# Series.apply calls my_sq on every value of column 'b'; the result is stored
# as a new 'square' column.
df['square'] = df['b'].apply(my_sq)
df

In [9]:
# A custom function that takes two arguments.
def my_power(base, index):
    """
    Raise ``base`` to the power ``index``.

    :param base: the value being raised
    :param index: the exponent
    :return: ``base ** index``
    """
    result = pow(base, index)
    return result


In [10]:
# Extra keyword arguments given to Series.apply are forwarded to the function,
# so every value of 'a' is passed as `base` together with index=2.
df['power_two'] = df['a'].apply(my_power, index=2)
df

In [11]:
# Same call as for 'power_two', but with the exponent set to 3.
df['power_three'] = df['a'].apply(my_power, index=3)
df

In [13]:
# The DataFrame.apply method: with the default axis=0, my_sq receives each
# column as a Series.
df.apply(my_sq)


In [18]:
def my_avg(col):
    """
    Compute the arithmetic mean of the values in a column.

    Unlike a version that reads exactly three positions, this works for a
    column of any length, so it stays correct when the frame has more or
    fewer than three rows.

    :param col: sized iterable of numbers (list, tuple, pandas Series, ...)
    :return: mean of all values in ``col``; NaN when ``col`` is empty
    """
    count = len(col)
    if count == 0:
        # Nothing to average; report NaN rather than dividing by zero.
        return float("nan")
    # Built-in sum iterates over the values, so no index labels are involved.
    return sum(col) / count

In [19]:
# Apply my_avg column by column (axis=0 is the default for DataFrame.apply).
df.apply(my_avg)

In [22]:
def my_avg_row(row):
    """
    Average the first two values of a row.

    Access is strictly positional. When ``row`` is a pandas Series (as passed
    by ``DataFrame.apply(..., axis=1)``) its index holds the column labels, so
    ``row[0]`` would rely on the deprecated integer-key-as-position fallback;
    ``.iloc`` is used instead. Plain sequences are indexed directly.

    :param row: pandas Series or any sequence with at least two numeric items
    :return: mean of the items at positions 0 and 1
    """
    # Objects without .iloc (list, tuple, ndarray) already index by position.
    positional = getattr(row, "iloc", row)
    first = positional[0]
    second = positional[1]
    return (first + second) / 2

In [21]:
# axis=1 makes DataFrame.apply pass each row (instead of each column) to the function.
df.apply(my_avg_row, axis=1)

In [24]:
# apply use case

# 1. Load the Titanic dataset
titanic = pd.read_csv('../data/titanic.csv')
titanic.info()
# Author's note on the info() output: 891 rows, 15 columns; age, embarked,
# deck and embark_town contain missing values

In [35]:
def count_missing(vector):
    """
    Count how many entries of ``vector`` are missing.

    :param vector: values to inspect, e.g. one column or one row of a DataFrame
    :return: number of entries that ``pd.isnull`` flags as null
    """
    # Summing the boolean null-mask counts its True entries.
    return np.sum(pd.isnull(vector))

In [31]:
def prop_missing(vector):
    """
    Compute the fraction of entries in ``vector`` that are missing.

    :param vector: object exposing ``.size`` (e.g. a pandas Series)
    :return: missing-entry count divided by the total number of entries
    """
    # Missing count (via count_missing) over the total element count.
    return count_missing(vector) / vector.size

In [32]:
def prop_complete(vector):
    """
    Compute the fraction of entries in ``vector`` that are not missing.

    :param vector: object accepted by ``prop_missing``
    :return: one minus the missing fraction
    """
    # Complement of the missing proportion.
    return 1 - prop_missing(vector)

In [36]:
# Number of missing values in each column (default axis=0).
titanic.apply(count_missing)

In [37]:
# Fraction of missing values in each column.
titanic.apply(prop_missing)

In [38]:
# Fraction of non-missing values in each column.
titanic.apply(prop_complete)


In [39]:
# Number of missing values in each row (axis=1 passes rows to the function).
titanic.apply(count_missing, axis=1)


In [40]:
# Fraction of missing values in each row.
titanic.apply(prop_missing, axis=1)


In [41]:
# Fraction of non-missing values in each row.
titanic.apply(prop_complete, axis=1)


In [42]:
# Tally how many rows have each per-row missing-value count.
titanic.apply(count_missing, axis=1).value_counts()


In [44]:
# Vectorized functions: start from a fresh two-column demo frame.
data = dict(a=[10, 20, 30], b=[20, 30, 40])

df2 = pd.DataFrame(data)
df2

In [45]:
def my_avg2(x, y):
    """
    Average two scalars, returning NaN when ``x`` equals 20.

    The ``if`` test only works on scalar inputs, so this function has to be
    wrapped (e.g. with ``np.vectorize``) before it can be called on whole
    vectors.

    :param x: first scalar value; the value 20 makes the result NaN
    :param y: second scalar value
    :return: ``(x + y) / 2``, or NaN when ``x == 20``
    """
    if x == 20:
        # np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
        return np.nan
    return (x + y) / 2

In [46]:
# 1. Vectorize the scalar function so it is evaluated element by element
avg_mod_vector = np.vectorize(my_avg2)

# 2. Call the vectorized average on the two columns
avg_mod_vector(df2['a'], df2['b'])

In [47]:
@np.vectorize
def my_avg_decorator(x, y):
    """
    Average two values element-wise, returning NaN wherever ``x`` equals 20.

    The ``np.vectorize`` decorator evaluates the scalar body once per pair of
    elements, so the function can be called directly on vectors.

    :param x: first value(s); elements equal to 20 make the result NaN
    :param y: second value(s)
    :return: ``(x + y) / 2`` per element, or NaN where ``x == 20``
    """
    if x == 20:
        # np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
        return np.nan
    return (x + y) / 2

In [48]:
# Call the function that was vectorized with the np.vectorize decorator
my_avg_decorator(df2['a'], df2['b'])

In [49]:
# lambda函数

In [51]:
# An anonymous function passed to DataFrame.apply: adds 1 to each column.
df2.apply(lambda x: x+1)
