# pandas函数应用

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.DataFrame(np.random.randn(5,4) - 1)
print(df)

print(np.abs(df)) #绝对值

          0         1         2         3
0 -1.408781  2.576409 -1.622139 -1.366035
1 -0.210130  0.717216 -1.832954 -0.810010
2 -0.447475 -0.114799 -2.307896  0.561387
3 -0.811377 -0.641894 -2.306490  2.480790
4 -0.511282 -1.159709 -1.156054  0.624016
          0         1         2         3
0  1.408781  2.576409  1.622139  1.366035
1  0.210130  0.717216  1.832954  0.810010
2  0.447475  0.114799  2.307896  0.561387
3  0.811377  0.641894  2.306490  2.480790
4  0.511282  1.159709  1.156054  0.624016


In [4]:
# apply应用于每一列
print(df.apply(lambda x: np.max(x)))
print(df.apply(lambda x: np.mean(x)))

0   -0.210130
1    2.576409
2   -1.156054
3    2.480790
dtype: float64
0   -0.677809
1    0.275445
2   -1.845107
3    0.298030
dtype: float64


In [5]:
# 应用于行
print(df.apply(lambda x : x.max(), axis=1))

0    2.576409
1    0.717216
2    0.561387
3    2.480790
4    0.624016
dtype: float64


In [8]:
print(df.map(lambda x : '%.5f' % x))
df.dtypes

          0         1         2         3
0  -1.40878   2.57641  -1.62214  -1.36603
1  -0.21013   0.71722  -1.83295  -0.81001
2  -0.44747  -0.11480  -2.30790   0.56139
3  -0.81138  -0.64189  -2.30649   2.48079
4  -0.51128  -1.15971  -1.15605   0.62402


0    float64
1    float64
2    float64
3    float64
dtype: object

## 6.4 索引排序（不重要）

In [9]:
# Series
print(np.random.randint(5, size=5))
print('-'*50)
s4 = pd.Series(range(10, 15), index = np.random.randint(5, size=5)) #索引随机生成
print(s4)
print('-'*50)
# 索引排序,sort_index返回一个新的排好索引的series
print(s4.sort_index())
print(s4)
# s4.loc[0:3]  loc索引值不唯一时直接报错
print(s4.iloc[0:3])
s4[0:3]  #默认用的位置索引

[4 3 3 0 3]
--------------------------------------------------
1    10
1    11
1    12
3    13
2    14
dtype: int64
--------------------------------------------------
1    10
1    11
1    12
2    14
3    13
dtype: int64
1    10
1    11
1    12
3    13
2    14
dtype: int64
1    10
1    11
1    12
dtype: int64


1    10
1    11
1    12
dtype: int64

In [10]:
s4.loc[1:2] #loc索引值唯一时可以切片

1    10
1    11
1    12
3    13
2    14
dtype: int64

In [13]:
# DataFrame
df4 = pd.DataFrame(np.random.randn(5, 5),
                   index=np.random.randint(5, size=5),
                   columns=np.random.randint(5, size=5))
print(df4)
#轴零是行索引排序
df4_isort = df4.sort_index(axis=1, ascending=False)
print(df4_isort)


          2         4         4         2         0
2  3.108992 -0.181214 -0.908408 -0.800330  0.508796
1 -1.100145 -0.154757  0.567607 -0.069871  0.794691
2  0.182736  0.798992  0.165390 -1.991710 -0.168656
2 -1.869802  0.459232 -0.103952  0.569686  0.687849
3  1.047680  0.700847 -2.172016  0.115370  0.125460
          4         4         2         2         0
2 -0.181214 -0.908408  3.108992 -0.800330  0.508796
1 -0.154757  0.567607 -1.100145 -0.069871  0.794691
2  0.798992  0.165390  0.182736 -1.991710 -0.168656
2  0.459232 -0.103952 -1.869802  0.569686  0.687849
3  0.700847 -2.172016  1.047680  0.115370  0.125460


# 6.5 按值排序（机器学习，深度学习不重要，数据分析才需要）

In [22]:
l = np.random.randint(0, 100,size=24)
print(l)
df4 = pd.DataFrame(l.reshape((4, 6)))
print('-' * 50)
print(df4)
print('-' * 50)
df4_vsort = df4.sort_values(by=5,axis=0, ascending=False) #寻找的是columns里的3,重要
print(df4_vsort)

[37 17 55 78 86 50 51 61 77 64 95 43 11 24 85 57 78 50 71 52 20 62 73 53]
--------------------------------------------------
    0   1   2   3   4   5
0  37  17  55  78  86  50
1  51  61  77  64  95  43
2  11  24  85  57  78  50
3  71  52  20  62  73  53
--------------------------------------------------
    0   1   2   3   4   5
3  71  52  20  62  73  53
0  37  17  55  78  86  50
2  11  24  85  57  78  50
1  51  61  77  64  95  43


In [23]:
df4.sort_values(5, ascending=False)

Unnamed: 0,0,1,2,3,4,5
3,71,52,20,62,73,53
0,37,17,55,78,86,50
2,11,24,85,57,78,50
1,51,61,77,64,95,43


In [24]:
df_data = pd.DataFrame([np.random.randn(3), [1., 2., np.nan],
                       [np.nan, 4., np.nan], [1., 2., 3.]])
print(df_data.head())

          0         1         2
0 -1.258593 -1.220174  0.997644
1  1.000000  2.000000       NaN
2       NaN  4.000000       NaN
3  1.000000  2.000000  3.000000


In [29]:
print(df_data.isnull())

       0      1      2
0  False  False  False
1  False  False   True
2   True  False   True
3  False  False  False


In [30]:
print(df_data.isnull().sum())

0    1
1    0
2    2
dtype: int64


# 删除数据

In [44]:
#默认一个样本，任何一个特征缺失，就删除
#inplace True是修改的是原有的df
#subset=[0]是指按第一列来删除,第一列有空值就删除对应的行
print(df_data)
print('-'*50)
# print(df_data.dropna(subset=[0]))
# axis = 0时subset指定的是列，axis = 1时指定的是行
print(df_data.dropna(axis=0, subset=[0]))

          0         1         2
0 -1.258593 -1.220174  0.997644
1  1.000000  2.000000       NaN
2       NaN  4.000000       NaN
3  1.000000  2.000000  3.000000
--------------------------------------------------
          0         1         2
0 -1.258593 -1.220174  0.997644
1  1.000000  2.000000       NaN
3  1.000000  2.000000  3.000000


# 填充数据

In [47]:
#给零列的空值填为-100，按特征（按列）去填充
print(df_data.iloc[:,0].fillna(-100.))
df_data

0     -1.258593
1      1.000000
2   -100.000000
3      1.000000
Name: 0, dtype: float64


Unnamed: 0,0,1,2
0,-1.258593,-1.220174,0.997644
1,1.0,2.0,
2,,4.0,
3,1.0,2.0,3.0


In [53]:
df_data.iloc[:,2]=df_data.iloc[:,2].fillna(df_data.iloc[:,2].mean())#用均值填充空值
print(df_data)
# df_data.iloc[:,2] = df_data.iloc[:,2].fillna(df_data.iloc[:,2].mean())

          0         1         2
0 -1.258593 -1.220174  0.997644
1  1.000000  2.000000  1.998822
2       NaN  4.000000  1.998822
3  1.000000  2.000000  3.000000
