# 6 Pandas的函数应用

In [2]:
import pandas as pd
import numpy as np

# Numpy ufunc 函数，randn跟的是维数
df = pd.DataFrame(np.random.randn(5, 4) - 1)  # -1使得均值不再是零而是负一
print(df)

print(np.abs(df))  #绝对值

          0         1         2         3
0 -1.430866 -2.384597 -2.749806 -1.218744
1 -0.569369 -3.131028  0.939248  0.080419
2 -0.057630 -0.902494 -0.817345 -2.141111
3 -2.147976  0.627762  0.598381 -0.768369
4 -0.545082 -1.531376 -1.012408  0.118032
          0         1         2         3
0  1.430866  2.384597  2.749806  1.218744
1  0.569369  3.131028  0.939248  0.080419
2  0.057630  0.902494  0.817345  2.141111
3  2.147976  0.627762  0.598381  0.768369
4  0.545082  1.531376  1.012408  0.118032


In [3]:
# apply默认作用在列上,x是每一列,因为axis=0
print(df.apply(lambda x: x.max()))

0   -0.057630
1    0.627762
2    0.939248
3    0.118032
dtype: float64


In [4]:
#apply作用在行上
print(df.apply(lambda x: x.max(), axis=1))

0   -1.218744
1    0.939248
2   -0.057630
3    0.627762
4    0.118032
dtype: float64


In [5]:
# 使用applymap应用到每个数据
print(df.map(lambda x: '%.2f' % x))
df.dtypes

       0      1      2      3
0  -1.43  -2.38  -2.75  -1.22
1  -0.57  -3.13   0.94   0.08
2  -0.06  -0.90  -0.82  -2.14
3  -2.15   0.63   0.60  -0.77
4  -0.55  -1.53  -1.01   0.12


0    float64
1    float64
2    float64
3    float64
dtype: object

In [8]:
print(type('%.2f' % 1.3456))    # <class 'str'>
print('%.2f' % 1.3456)

<class 'str'>
1.35


## 6.4 索引排序（不重要）

In [11]:
# Series
print(np.random.randint(5, size=5))
print('-' * 50)
s4 = pd.Series(range(10, 15), index=np.random.randint(5, size=5))  #索 引随机生成
print(s4)
print('-' * 50)

# 索引排序,sort_index返回一个新的排好索引的series
print(s4.sort_index())
print(s4)

# s4.loc[0:3]  # loc索引值不唯一时直接报错
print(s4.iloc[0:3])
s4[0:3]  #默认用的位置索引

[0 1 2 3 3]
--------------------------------------------------
0    10
0    11
0    12
0    13
2    14
dtype: int64
--------------------------------------------------
0    10
0    11
0    12
0    13
2    14
dtype: int64
0    10
0    11
0    12
0    13
2    14
dtype: int64
0    10
0    11
0    12
dtype: int64


0    10
0    11
0    12
dtype: int64

In [12]:
s4.loc[1:2] # loc索引值唯一时可以切片

2    14
dtype: int64

In [15]:
# DataFrame
df4 = pd.DataFrame(np.random.randn(5, 5),
                   index=np.random.randint(5, size=5),
                   columns=np.random.randint(5, size=5))
print(df4)

#轴零是行索引排序
df4_isort = df4.sort_index(axis=0, ascending=False)
print(df4_isort)

          3         0         3         0         0
2 -0.291031  1.701857  1.266753 -0.634195 -0.575971
2 -0.404674  0.821214  0.042082  1.076502 -0.122473
4 -1.901006  0.995158 -0.004688 -1.702281  0.297635
0 -1.190299 -1.892105  1.038141 -2.351130 -0.011702
1 -0.469885  1.735051  0.257079  0.663088 -0.520622
          3         0         3         0         0
4 -1.901006  0.995158 -0.004688 -1.702281  0.297635
2 -0.291031  1.701857  1.266753 -0.634195 -0.575971
2 -0.404674  0.821214  0.042082  1.076502 -0.122473
1 -0.469885  1.735051  0.257079  0.663088 -0.520622
0 -1.190299 -1.892105  1.038141 -2.351130 -0.011702


In [16]:
# 轴1是列索引排序
df4_isort = df4.sort_index(axis=1, ascending=True)
print(df4_isort)

          0         0         0         3         3
2  1.701857 -0.634195 -0.575971 -0.291031  1.266753
2  0.821214  1.076502 -0.122473 -0.404674  0.042082
4  0.995158 -1.702281  0.297635 -1.901006 -0.004688
0 -1.892105 -2.351130 -0.011702 -1.190299  1.038141
1  1.735051  0.663088 -0.520622 -0.469885  0.257079


# 6.5 按值排序（机器学习，深度学习不重要，数据分析才需要）

In [17]:
# 按值排序,by后是column的值
import random

l = [random.randint(0, 100) for i in range(24)]  # 生成24个随机数
df4 = pd.DataFrame(np.array(l).reshape(6, 4))  # 生成6行4列的dataframe
print(df4)
print('-' * 50)

#按轴零排序，by后是列名,交换的是行
df4_vsort = df4.sort_values(by=3, axis=0, ascending=False)  # 寻找的是columns里的3,重要
print(df4_vsort)

    0   1   2   3
0  77  21  57  88
1  98  44  80  96
2  12   9  83   0
3  66  72  49  90
4  10  61  55  31
5  78  61  85  96
--------------------------------------------------
    0   1   2   3
1  98  44  80  96
5  78  61  85  96
3  66  72  49  90
0  77  21  57  88
4  10  61  55  31
2  12   9  83   0


In [18]:
# 按轴1排序，by后行索引名，交换的是列
df4_vsort = df4.sort_values(by=3, axis=1, ascending=False)  # 寻找的是index里的3
print(df4_vsort)

    3   1   0   2
0  88  21  77  57
1  96  44  98  80
2   0   9  12  83
3  90  72  66  49
4  31  61  10  55
5  96  61  78  85


# 6.6 处理缺失数据（重要）

In [19]:
df_data = pd.DataFrame([np.random.randn(3), [1., 2., np.nan],
                        [np.nan, 4., np.nan], [1., 2., 3.]])
print(df_data.head())

          0         1       2
0 -0.696842 -0.651774 -0.7424
1  1.000000  2.000000     NaN
2       NaN  4.000000     NaN
3  1.000000  2.000000  3.0000


In [20]:
df_data.iloc[2, 0]

np.float64(nan)

In [21]:
# isnull来判断是否有空的数据
print(df_data.isnull())

       0      1      2
0  False  False  False
1  False  False   True
2   True  False   True
3  False  False  False


In [22]:
#帮我计算df_data缺失率
print(df_data.isnull().sum() / len(df_data))

0    0.25
1    0.00
2    0.50
dtype: float64


## 删除缺失数据

In [24]:
# 默认一个样本，任何一个特征缺失，就删除
# inplace True是修改的是原有的df
# subset=[0]是指按第一列来删除,第一列有空值就删除对应的行
print(df_data.dropna(subset=[0]))

          0         1       2
0 -0.696842 -0.651774 -0.7424
1  1.000000  2.000000     NaN
3  1.000000  2.000000  3.0000


In [25]:
df_data

Unnamed: 0,0,1,2
0,-0.696842,-0.651774,-0.7424
1,1.0,2.0,
2,,4.0,
3,1.0,2.0,3.0


In [26]:
# 用的不多，用在某个特征缺失太多时，才会进行删除
print(df_data.dropna(axis=1))  # 某列有nan就删除该列

          1
0 -0.651774
1  2.000000
2  4.000000
3  2.000000


In [27]:
df_data

Unnamed: 0,0,1,2
0,-0.696842,-0.651774,-0.7424
1,1.0,2.0,
2,,4.0,
3,1.0,2.0,3.0


## 填充缺失数据

In [None]:
# 均值，中位数，众数填充

In [28]:
# 给零列的空值填为-100，按特征（按列）去填充
print(df_data.iloc[:, 0].fillna(-100.))
df_data

0     -0.696842
1      1.000000
2   -100.000000
3      1.000000
Name: 0, dtype: float64


Unnamed: 0,0,1,2
0,-0.696842,-0.651774,-0.7424
1,1.0,2.0,
2,,4.0,
3,1.0,2.0,3.0


In [29]:
# 依次拿到每一列
for i in df_data.columns:
    print(df_data.loc[:, i])

0   -0.696842
1    1.000000
2         NaN
3    1.000000
Name: 0, dtype: float64
0   -0.651774
1    2.000000
2    4.000000
3    2.000000
Name: 1, dtype: float64
0   -0.7424
1       NaN
2       NaN
3    3.0000
Name: 2, dtype: float64


In [34]:
df_data.iloc[:, 0].fillna(-100., inplace=True)  # inplace=True后面会被删除

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_data.iloc[:, 0].fillna(-100., inplace=True)  # inplace=True后面会被删除


In [35]:
df_data.iloc[:, 2] = df_data.iloc[:, 2].fillna(df_data.iloc[:, 2].mean())  #用均值填充空值

In [36]:
df_data

Unnamed: 0,0,1,2
0,-0.696842,-0.651774,-0.7424
1,1.0,2.0,1.1288
2,-100.0,4.0,1.1288
3,1.0,2.0,3.0
