# Pandas数据操作

In [2]:
import pandas as pd

* Series索引

In [3]:
# Build a five-element Series labelled 'a'..'e' and preview it
ser_obj = pd.Series(data=range(5), index=list('abcde'))
print(ser_obj.head(n=5))

a    0
b    1
c    2
d    3
e    4
dtype: int32


In [4]:
# Single-element access on a Series
# Label-based lookup through the index label
print(ser_obj['a'])
# Position-based lookup: use .iloc explicitly, because treating the integer key
# in ser_obj[0] as a position is deprecated when the index holds string labels
print(ser_obj.iloc[0])

0
0


In [5]:
# Slice indexing
# The integer slice is positional (end excluded); the label slice includes both ends
print(ser_obj[1:3], ser_obj['b':'d'], sep='\n')

b    1
c    2
dtype: int32
b    1
c    2
d    3
dtype: int32


In [6]:
# Non-contiguous selection
# A list of integer positions goes through .iloc: ser_obj[[0, 2, 4]] relies on the
# deprecated positional fallback because the index holds string labels
print(ser_obj.iloc[[0, 2, 4]])
# A list of labels selects by label
print(ser_obj[['a', 'e']])

a    0
c    2
e    4
dtype: int32
a    0
e    4
dtype: int32


In [7]:
# Boolean indexing
# The element-wise comparison yields a boolean mask with the same index as ser_obj
ser_bool = ser_obj.gt(2)
print(ser_bool, ser_obj[ser_bool], sep='\n')

# The mask can also be built inline
print(ser_obj[ser_obj.gt(2)])

a    False
b    False
c    False
d     True
e     True
dtype: bool
d    3
e    4
dtype: int32
d    3
e    4
dtype: int32


* DataFrame索引

In [20]:
import numpy as np

# Build a 5x4 frame of standard-normal samples with columns 'a', 'b', 'c', 'd'
df_obj = pd.DataFrame(np.random.randn(5, 4), columns=['a', 'b', 'c', 'd'])
print(df_obj.head())

          a        ob         c         d
0  0.182068  0.096596  0.748497  0.544490
1 -0.953892  0.471331  0.890098 -1.136070
2  0.139373  0.167147 -1.154112 -1.010311
3 -1.057192  0.304619 -0.349571  1.739527
4 -0.993317  0.774059 -1.411880  0.079991


In [17]:
# Positional selection with iloc: row slice 2:3 and the columns at positions 1 and 3

print(df_obj.iloc[2:3, [1, 3]])

          b         d
2  0.476726 -0.602308


* 三种索引方式

In [18]:
# Show the Series and the DataFrame, one after the other
print(ser_obj, df_obj, sep='\n')

a    0
b    1
c    2
d    3
e    4
dtype: int32
          a         b         c         d
0  0.195143 -1.335159  1.449059 -0.599455
1 -1.072482  0.065156 -2.385252  0.249004
2  1.299029  0.476726  1.012382 -0.602308
3 -0.052435  1.016986 -1.398035 -1.107173
4 -0.863136 -0.028784  0.073985  0.440905


In [21]:
# Label-based indexing with loc
# Series (examples left disabled)
# print(ser_obj['b':'d'])
# print(ser_obj.loc['b':'d'])

# DataFrame: the whole column 'a', then the rows labelled 0 through 2 of column 'a'
# (a loc slice includes its end label)
print(df_obj['a'], df_obj.loc[0:2, 'a'], sep='\n')

0    0.182068
1   -0.953892
2    0.139373
3   -1.057192
4   -0.993317
Name: a, dtype: float64
0    0.182068
1   -0.953892
2    0.139373
Name: a, dtype: float64


In [14]:
# Display the current contents of ser_obj
print(ser_obj)

a    0
b    1
c    2
d    3
e    4
dtype: int32


In [1]:
# Integer-position indexing with iloc
# Series: positions 1 and 2 (the end position is excluded). .loc[1:3] does not work
# here because ser_obj is labelled with strings, not integers.
# print(ser_obj['b':'d'])
print(ser_obj.iloc[1:3])

# DataFrame
# print(df_obj.iloc[0:2, 0])  # note the difference from df_obj.loc[0:2, 'a']

NameError: name 'ser_obj' is not defined

In [19]:
# Display the current contents of ser_obj
print(ser_obj)

a    0
b    1
c    2
d    3
e    4
dtype: int32


In [None]:
# Mixed indexing: the .ix accessor has been removed from pandas, so each lookup is
# written with the explicit accessor that .ix would have resolved to.
# Series: an integer slice on a string-labelled index is positional
print(ser_obj.iloc[1:3])
# Series: a label slice includes both end labels
print(ser_obj.loc['b':'c'])

# DataFrame: .ix tried label-based lookup first and then position-based lookup, so
# the row slice 0:2 matched the integer row labels and column 0 was taken by position
print(df_obj.loc[0:2, df_obj.columns[0]])

* 运算与对齐

In [23]:
# Two integer Series of different lengths, used by the alignment examples
s1 = pd.Series(data=range(10, 20), index=range(10))
s2 = pd.Series(data=range(20, 25), index=range(5))

# Print both, separated by an empty line
print('s1: ', s1, '', 's2: ', s2, sep='\n')

s1: 
0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int32

s2: 
0    20
1    21
2    22
3    23
4    24
dtype: int32


In [21]:
# Series arithmetic aligns on index labels; a label present in only one operand gives NaN
s1.add(s2)

0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
dtype: float64

In [24]:
import numpy as np

# Two all-ones frames of different shapes, used by the alignment examples
df1 = pd.DataFrame(np.ones(shape=(2, 2)), columns=list('ab'))
df2 = pd.DataFrame(np.ones(shape=(3, 3)), columns=list('abc'))

# Print both, separated by an empty line
print('df1: ', df1, '', 'df2: ', df2, sep='\n')

df1: 
     a    b
0  1.0  1.0
1  1.0  1.0

df2: 
     a    b    c
0  1.0  1.0  1.0
1  1.0  1.0  1.0
2  1.0  1.0  1.0


In [25]:
# DataFrame arithmetic aligns on both row and column labels
df1.add(df2)

Unnamed: 0,a,b,c
0,2.0,2.0,
1,2.0,2.0,
2,,,


In [26]:
# Arithmetic with a fill value substituted for labels present in only one operand
print(s1, s2, sep='\n')

s1.add(s2, fill_value=-1)

0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int32
0    20
1    21
2    22
3    23
4    24
dtype: int32


0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5    14.0
6    15.0
7    16.0
8    17.0
9    18.0
dtype: float64

In [None]:
# Subtract df2 from df1, passing 2.0 as the fill value for unaligned entries
df1.sub(df2, fill_value=2.0)

In [None]:
# Filling NaN: first produce a result that contains NaN through an unaligned addition
s3 = s1.add(s2)
print(s3)

In [None]:
# Replace every NaN in s3 with -1, leaving s3 itself untouched
s3_filled = s3.fillna(value=-1)
print(s3_filled)

In [None]:
# Unaligned DataFrame addition
df3 = df1.add(df2)
print(df3)

In [None]:
# Replace every NaN in df3 with 100, modifying df3 itself
df3.fillna(value=100, inplace=True)
print(df3)

* 函数应用

In [23]:
# NumPy ufuncs can be applied directly to a DataFrame
# 5x4 frame of standard-normal samples shifted down by 1
df = pd.DataFrame(data=np.random.randn(5, 4) - 1)
print(df)

# Element-wise absolute value
print(np.absolute(df))

          0         1         2         3
0 -1.993081  0.152272 -0.668028 -0.251798
1 -2.289843 -0.840904 -0.734763 -1.558692
2 -1.639470 -1.561671 -0.780086 -2.040554
3 -1.703188 -3.163751 -1.459251 -2.578217
4 -1.395551 -0.506156 -0.380774  0.262559
          0         1         2         3
0  1.993081  0.152272  0.668028  0.251798
1  2.289843  0.840904  0.734763  1.558692
2  1.639470  1.561671  0.780086  2.040554
3  1.703188  3.163751  1.459251  2.578217
4  1.395551  0.506156  0.380774  0.262559


In [27]:
# Use apply to run a function over rows or columns
def f(x):
    """Return x.max(); equivalent lambda form: f = lambda x: x.max()."""
    return x.max()

# With no axis given, apply is called here without specifying a direction
print(df.apply(func=f))

0   -1.395551
1    0.152272
2   -0.380774
3    0.262559
dtype: float64


In [25]:
# Specify the axis direction: with axis=1 the function receives one row at a time
print(df.apply(lambda row: row.max(), axis='columns'))

0    0.152272
1   -0.734763
2   -0.780086
3   -1.459251
4    0.262559
dtype: float64


In [None]:
# Apply a function to every individual element
f2 = lambda x : '%.2f' % x
# DataFrame.applymap is deprecated in favour of DataFrame.map; use map when this
# pandas version provides it and fall back to applymap otherwise
apply_elementwise = df.map if hasattr(df, 'map') else df.applymap
print(apply_elementwise(f2))

* 排序

In [28]:
# Series whose index holds five random integers drawn from 0..4
s4 = pd.Series(data=range(10, 15), index=np.random.randint(low=0, high=5, size=5))
print(s4)

2    10
3    11
0    12
3    13
4    14
dtype: int32


In [30]:
# Sort by index labels, in descending order
s4.sort_index(axis=0, ascending=False)

4    14
3    13
3    11
2    10
0    12
dtype: int32

In [32]:
# 3x4 frame of standard-normal samples with random integer row and column labels
df4 = pd.DataFrame(
    data=np.random.randn(3, 4),
    index=np.random.randint(3, size=3),
    columns=np.random.randint(4, size=4),
)
print(df4)

          3         3         0         2
2 -0.291998  0.312256 -0.240259  0.242550
0  0.782116  2.009527  2.286111  1.720111
0  1.024127  2.458063 -1.683704 -0.671771


In [33]:
# Sort by column labels (the row-label variant is left disabled)
# df4.sort_index(ascending=False)
df4.sort_index(axis='columns')

Unnamed: 0,0,2,3,3.1
2,-0.240259,0.24255,-0.291998,0.312256
0,2.286111,1.720111,0.782116,2.009527
0,-1.683704,-0.671771,1.024127,2.458063


In [28]:
# Sort by values
# df4.sort_values(by=0)
# Print the built-in documentation of DataFrame.sort_values
help(pd.DataFrame.sort_values)

Help on function sort_values in module pandas.core.frame:

sort_values(self, by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')
    Sort by the values along either axis
    
    .. versionadded:: 0.17.0
    
    Parameters
    ----------
    by : str or list of str
        Name or list of names which refer to the axis items.
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Axis to direct sorting
    ascending : bool or list of bool, default True
         Sort ascending vs. descending. Specify list for multiple sort
         orders.  If this is a list of bools, must match the length of
         the by.
    inplace : bool, default False
         if True, perform operation in-place
    kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort'
         Choice of sorting algorithm. See also ndarray.np.sort for more
         information.  `mergesort` is the only stable algorithm. For
         DataFrames, this option is only applied when sort

* 处理缺失数据

In [None]:
# 4x3 frame: one row of random values followed by three rows that contain NaN
df_data = pd.DataFrame([
    np.random.randn(3),
    [1.0, np.nan, np.nan],
    [4.0, np.nan, np.nan],
    [1.0, np.nan, 2.0],
])
df_data.head()

In [None]:
# isnull: boolean frame marking the missing entries
pd.isnull(df_data)

In [None]:
# dropna: discard the rows that contain missing values
df_data.dropna(axis='index')
# Column-wise variant (disabled): df_data.dropna(axis=1)

In [None]:
# fillna: replace every missing entry with -100.0
df_data.fillna(value=-100.0)