## Pandas基础 01


In [2]:
import warnings

In [3]:
warnings.filterwarnings("ignore")

In [1]:
import numpy as np
import pandas as pd

### 01 数据结构 Series

Series 是带有标签的一维数组，可以保存任何数据类型（整数，字符串，浮点数，Python对象等）,轴标签统称为索引

In [7]:
s = pd.Series(np.random.rand(5))

In [9]:
print(s)

0    0.314839
1    0.952569
2    0.767409
3    0.581640
4    0.643048
dtype: float64


In [12]:
print(type(s))

<class 'pandas.core.series.Series'>


In [14]:
print(s.index,type(s.index))

RangeIndex(start=0, stop=5, step=1) <class 'pandas.core.indexes.range.RangeIndex'>


In [16]:
print(s.values,type(s.values))

[0.31483888 0.95256939 0.76740901 0.58163991 0.64304793] <class 'numpy.ndarray'>


【**笔记**】

+ series相比于ndarray，是一个自带索引index的数组 → 一维数组 + 对应索引
+ 所以当只看series的值的时候，就是一个ndarray
+ series和ndarray较相似，索引切片功能差别不大
+ series和dict相比，series更像一个有顺序的字典（dict本身不存在顺序），其索引原理与字典相似（一个用key，一个用index）

#### Series的创建

method1 :字典创建，字典中的内容与其对应 key->index  value->value

In [18]:
dic = {'a':1 ,'b':2,'c':3}
s = pd.Series(dic)
print(s)

a    1
b    2
c    3
dtype: int64


In [19]:
dic2 = {'a':1 ,'b':'hello' , 'c':3, '4':4, '5':5}
ss = pd.Series(dic2)
print(ss)

a        1
b    hello
c        3
4        4
5        5
dtype: object


method2: 一维数组创建

In [20]:
arr = np.random.randn(5)
s = pd.Series(arr)

[ 2.67729885  0.9331024   1.51171995 -2.34931528  0.74816148]


In [21]:
print(arr)

[ 2.67729885  0.9331024   1.51171995 -2.34931528  0.74816148]


In [22]:
print(s)

0    2.677299
1    0.933102
2    1.511720
3   -2.349315
4    0.748161
dtype: float64


method3：由标量创建

In [23]:
s = pd.Series(10,index = range(4))

In [24]:
print(s)

0    10
1    10
2    10
3    10
dtype: int64


In [26]:
s1 = pd.Series(np.random.randn(5))

In [28]:
print(s1)

0   -0.525796
1   -0.404097
2   -1.374942
3    1.151221
4   -0.984049
dtype: float64


In [31]:
s2 = pd.Series(np.random.randn(5),name = 'test')

In [32]:
print(s2)

0   -0.662761
1   -0.452724
2    1.841427
3    0.465002
4   -1.201470
Name: test, dtype: float64


对比s1和s2，name为Series的一个参数，即创建一个数组的名称。


#### Series的索引

实际就是数据结构Series的索引，包括位置下标/标签索引/切片索引/布尔型索引

In [34]:
# 位置下标

s = pd.Series(np.random.rand(5))
print(s)
print(s[0],type(s[0]),s[0].dtype)
print(float(s[0]),type(float(s[0])))

0    0.455100
1    0.264361
2    0.830603
3    0.250908
4    0.303198
dtype: float64
0.45510026145066396 <class 'numpy.float64'> float64
0.45510026145066396 <class 'float'>


In [4]:
# 标签索引

s = pd.Series(np.random.rand(5), index = ['a','b','c','d','e'])
print(s)
print(s['a'],type(s['a']),s['a'].dtype)
# 方法类似下标索引，用[]表示，内写上index，注意index是字符串

sci = s[['a','b','e']]
print(sci,type(sci))
# 如果需要选择多个标签的值，用[[]]来表示（相当于[]中包含一个列表）
# 多标签索引结果是新的数组

a    0.092309
b    0.895398
c    0.034547
d    0.205735
e    0.725185
dtype: float64
0.09230915189898226 <class 'numpy.float64'> float64
a    0.092309
b    0.895398
e    0.725185
dtype: float64 <class 'pandas.core.series.Series'>


In [11]:
# 切片索引

s1 = pd.Series(np.random.rand(5))
s2 = pd.Series(np.random.rand(5), index = ['a','b','c','d','e'])
print(s1[1:4])

1    0.889591
2    0.203752
3    0.661439
dtype: float64


In [12]:
s1 = pd.Series(np.random.rand(5))
s2 = pd.Series(np.random.rand(5), index = ['a','b','c','d','e'])
print(s1[1:4],s1[4])

1    0.491673
2    0.671560
3    0.215403
dtype: float64 0.9809169069537103


In [13]:
# 注意那个末端

In [8]:
print(s2['a':'c'],s2['c'])

a    0.272013
b    0.939355
c    0.517336
dtype: float64 0.5173364800417259


In [9]:
print(s2[0:3],s2[3])
# 注意：用index做切片是末端包含

a    0.272013
b    0.939355
c    0.517336
dtype: float64 0.11326109254905448


In [10]:
print(s2[:-1])
print(s2[::2])
# 下标索引做切片，和list写法一样

a    0.272013
b    0.939355
c    0.517336
d    0.113261
dtype: float64
a    0.272013
c    0.517336
e    0.669565
dtype: float64


In [16]:
# 布尔型索引

s = pd.Series(np.random.rand(3)*100)
s[4] = None  # 添加一个空值
print(s)
bs1 = s > 50
bs2 = s.isnull()
bs3 = s.notnull()


0    40.714665
1     73.09234
2    61.494283
4         None
dtype: object


In [17]:
print(bs1, type(bs1), bs1.dtype)
print(bs2, type(bs2), bs2.dtype)
print(bs3, type(bs3), bs3.dtype)
print('-----')
# 数组做判断之后，返回的是一个由布尔值组成的新的数组
# .isnull() / .notnull() 判断是否为空值 (None代表空值，NaN代表有问题的数值，两个都会识别为空值)

print(s[s > 50])
print(s[bs3])
# 布尔型索引方法：用[判断条件]表示，其中判断条件可以是 一个语句，或者是 一个布尔型数组！

0    False
1     True
2     True
4    False
dtype: bool <class 'pandas.core.series.Series'> bool
0    False
1    False
2    False
4     True
dtype: bool <class 'pandas.core.series.Series'> bool
0     True
1     True
2     True
4    False
dtype: bool <class 'pandas.core.series.Series'> bool
-----
1     73.09234
2    61.494283
dtype: object
0    40.714665
1     73.09234
2    61.494283
dtype: object


#### Series基本技巧

数据查看 / 重新索引 / 对齐 / 添加、修改、删除值

In [19]:
# 数据查看

s = pd.Series(np.random.rand(50))
print(s.head(10))
print(s.tail())
# .head()查看头部数据
# .tail()查看尾部数据
# 默认查看5条

0    0.116239
1    0.499781
2    0.233781
3    0.670263
4    0.187421
5    0.219444
6    0.001195
7    0.577055
8    0.812759
9    0.135811
dtype: float64
45    0.610651
46    0.935571
47    0.659877
48    0.227119
49    0.202160
dtype: float64


In [20]:
# 重新索引reindex
# .reindex将会根据索引重新排序，如果当前索引不存在，则引入缺失值

s = pd.Series(np.random.rand(3), index = ['a','b','c'])
print(s)
s1 = s.reindex(['c','b','a','d'])
print(s1)
# .reindex()中也是写列表
# 这里'd'索引不存在，所以值为NaN

s2 = s.reindex(['c','b','a','d'], fill_value = 0)
print(s2)
# fill_value参数：填充缺失值的值

a    0.543052
b    0.557972
c    0.011613
dtype: float64
c    0.011613
b    0.557972
a    0.543052
d         NaN
dtype: float64
c    0.011613
b    0.557972
a    0.543052
d    0.000000
dtype: float64


In [21]:
# Series对齐

s1 = pd.Series(np.random.rand(3), index = ['Jack','Marry','Tom'])
s2 = pd.Series(np.random.rand(3), index = ['Wang','Jack','Marry'])
print(s1)
print(s2)
print(s1+s2)
# Series 和 ndarray 之间的主要区别是，Series 上的操作会根据标签自动对齐
# index顺序不会影响数值计算，以标签来计算
# 空值和任何值计算结果扔为空值

Jack     0.927868
Marry    0.576595
Tom      0.143847
dtype: float64
Wang     0.092869
Jack     0.700244
Marry    0.239781
dtype: float64
Jack     1.628112
Marry    0.816376
Tom           NaN
Wang          NaN
dtype: float64


In [22]:
# 删除：.drop

s = pd.Series(np.random.rand(5), index = list('ngjur'))
print(s)
s1 = s.drop('n')
s2 = s.drop(['g','j'])
print(s1)
print(s2)
print(s)
# drop 删除元素之后返回副本(inplace=False)

n    0.794806
g    0.191842
j    0.594468
u    0.569662
r    0.166968
dtype: float64
g    0.191842
j    0.594468
u    0.569662
r    0.166968
dtype: float64
n    0.794806
u    0.569662
r    0.166968
dtype: float64
n    0.794806
g    0.191842
j    0.594468
u    0.569662
r    0.166968
dtype: float64


In [23]:
# 添加

s1 = pd.Series(np.random.rand(5))
s2 = pd.Series(np.random.rand(5), index = list('ngjur'))
print(s1)
print(s2)
s1[5] = 100
s2['a'] = 100
print(s1)
print(s2)
print('-----')
# 直接通过下标索引/标签index添加值

s3 = s1.append(s2)
print(s3)
print(s1)
# 通过.append方法，直接添加一个数组
# .append方法生成一个新的数组，不改变之前的数组

0    0.806326
1    0.186228
2    0.344511
3    0.339121
4    0.329818
dtype: float64
n    0.415032
g    0.105471
j    0.497815
u    0.019201
r    0.757879
dtype: float64
0      0.806326
1      0.186228
2      0.344511
3      0.339121
4      0.329818
5    100.000000
dtype: float64
n      0.415032
g      0.105471
j      0.497815
u      0.019201
r      0.757879
a    100.000000
dtype: float64
-----
0      0.806326
1      0.186228
2      0.344511
3      0.339121
4      0.329818
5    100.000000
n      0.415032
g      0.105471
j      0.497815
u      0.019201
r      0.757879
a    100.000000
dtype: float64
0      0.806326
1      0.186228
2      0.344511
3      0.339121
4      0.329818
5    100.000000
dtype: float64


In [24]:
# 修改

s = pd.Series(np.random.rand(3), index = ['a','b','c'])
print(s)
s['a'] = 100
s[['b','c']] = 200
print(s)
# 通过索引直接修改，类似序列


a    0.920579
b    0.189254
c    0.194963
dtype: float64
a    100.0
b    200.0
c    200.0
dtype: float64


### 02 数据结构 Dataframe 

+ "二维数组"Dataframe：是一个表格型的数据结构，包含一组有序的列，其列的值类型可以是数值、字符串、布尔值等。


+ Dataframe中的数据以一个或多个二维块存放，不是列表、字典或一维数组结构。

In [25]:
# Dataframe 数据结构
# Dataframe是一个表格型的数据结构，“带有标签的二维数组”。
# Dataframe带有index（行标签）和columns（列标签）

data = {'name':['Jack','Tom','Mary'],
        'age':[18,19,20],
       'gender':['m','m','w']}
frame = pd.DataFrame(data)

In [26]:
print(frame)

   name  age gender
0  Jack   18      m
1   Tom   19      m
2  Mary   20      w


In [27]:
print(type(frame))

<class 'pandas.core.frame.DataFrame'>


In [28]:
print(frame.index,'\n该数据类型为：',type(frame.index))
print(frame.columns,'\n该数据类型为：',type(frame.columns))
print(frame.values,'\n该数据类型为：',type(frame.values))

RangeIndex(start=0, stop=3, step=1) 
该数据类型为： <class 'pandas.core.indexes.range.RangeIndex'>
Index(['name', 'age', 'gender'], dtype='object') 
该数据类型为： <class 'pandas.core.indexes.base.Index'>
[['Jack' 18 'm']
 ['Tom' 19 'm']
 ['Mary' 20 'w']] 
该数据类型为： <class 'numpy.ndarray'>


#### Dataframe的创建

method1 : 由数组、list组成的字典

In [29]:
data1 = {'a':[1,2,3],
        'b':[3,4,5],
        'c':[5,6,7]}
print(data1)

{'a': [1, 2, 3], 'b': [3, 4, 5], 'c': [5, 6, 7]}


In [30]:
data2 = {'one':np.random.rand(3),
        'two':np.random.rand(3)}
print(data2)

{'one': array([0.72589076, 0.02795186, 0.47533738]), 'two': array([0.96797203, 0.15516058, 0.51717338])}


In [31]:
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
print(df1)
print(df2)

   a  b  c
0  1  3  5
1  2  4  6
2  3  5  7
        one       two
0  0.725891  0.967972
1  0.027952  0.155161
2  0.475337  0.517173


In [32]:
df1 = pd.DataFrame(data1, columns = ['b','c','a','d'])
print(df1)

   b  c  a    d
0  3  5  1  NaN
1  4  6  2  NaN
2  5  7  3  NaN


In [33]:
df1 = pd.DataFrame(data1, columns = ['b','c'])
print(df1)

   b  c
0  3  5
1  4  6
2  5  7


columns参数：格式为list，如果现有数据中没有该列（比如'd'），则产生NaN值

如果columns重新指定时候，列的数量可以少于原数据

In [35]:
# 重新定义index，格式为list，长度必须保持一致

df2 = pd.DataFrame(data2, index = ['f1','f2','f3'])  # 这里如果尝试  index = ['f1','f2','f3','f4'] 会怎么样？
print(df2)

         one       two
f1  0.725891  0.967972
f2  0.027952  0.155161
f3  0.475337  0.517173


method2: 由Series组成的字典

In [37]:
data1 = {'one':pd.Series(np.random.rand(2)),
        'two':pd.Series(np.random.rand(3))}  # 没有设置index的Series
print(data1)

{'one': 0    0.479867
1    0.873099
dtype: float64, 'two': 0    0.154829
1    0.951907
2    0.123529
dtype: float64}


In [38]:
data2 = {'one':pd.Series(np.random.rand(2), index = ['a','b']),
        'two':pd.Series(np.random.rand(3),index = ['a','b','c'])}  # 设置了index的Series
print(data2)

{'one': a    0.777681
b    0.397571
dtype: float64, 'two': a    0.788676
b    0.964475
c    0.003853
dtype: float64}


In [39]:
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
print(df1)
print(df2)

        one       two
0  0.479867  0.154829
1  0.873099  0.951907
2       NaN  0.123529
        one       two
a  0.777681  0.788676
b  0.397571  0.964475
c       NaN  0.003853


+ 由Seris组成的字典 创建Dataframe，columns为字典key，index为Series的标签（如果Series没有指定标签，则是默认数字标签）

+ Series可以长度不一样，生成的Dataframe会出现NaN值

method3: 通过二维数组直接创建

In [40]:
ar = np.random.rand(9).reshape(3,3)
print(ar)

[[0.91267531 0.33060085 0.04661333]
 [0.36317646 0.74714315 0.65677234]
 [0.45403952 0.68846609 0.64304694]]


In [43]:
df1 = pd.DataFrame(ar)
print(df1)

          0         1         2
0  0.912675  0.330601  0.046613
1  0.363176  0.747143  0.656772
2  0.454040  0.688466  0.643047


In [44]:
df2 = pd.DataFrame(ar, index = ['a', 'b', 'c'], columns = ['one','two','three'])  # 可以尝试一下index或columns长度不等于已有数组的情况
print(df2)

        one       two     three
a  0.912675  0.330601  0.046613
b  0.363176  0.747143  0.656772
c  0.454040  0.688466  0.643047


method4: 由字典组成的列表

In [45]:
data = [{'one': 1, 'two': 2}, {'one': 5, 'two': 10, 'three': 20}]
print(data)

[{'one': 1, 'two': 2}, {'one': 5, 'two': 10, 'three': 20}]


In [46]:
df1 = pd.DataFrame(data)
df2 = pd.DataFrame(data, index = ['a','b'])
df3 = pd.DataFrame(data, columns = ['one','two'])
print(df1)
print(df2)
print(df3)

   one  two  three
0    1    2    NaN
1    5   10   20.0
   one  two  three
a    1    2    NaN
b    5   10   20.0
   one  two
0    1    2
1    5   10


method5 :由字典组成的字典

In [47]:
data = {'Jack':{'math':90,'english':89,'art':78},
       'Marry':{'math':82,'english':95,'art':92},
       'Tom':{'math':78,'english':67}}
df1 = pd.DataFrame(data)
print(df1)

         Jack  Marry   Tom
math       90     82  78.0
english    89     95  67.0
art        78     92   NaN


In [48]:
df2 = pd.DataFrame(data, columns = ['Jack','Tom','Bob'])
df3 = pd.DataFrame(data, index = ['a','b','c'])
print(df2)
print(df3)

         Jack   Tom  Bob
math       90  78.0  NaN
english    89  67.0  NaN
art        78   NaN  NaN
   Jack  Marry  Tom
a   NaN    NaN  NaN
b   NaN    NaN  NaN
c   NaN    NaN  NaN


In [49]:
# columns参数可以增加和减少现有列，如出现新的列，值为NaN
# index在这里和之前不同，并不能改变原有index，如果指向新的标签，值为NaN （非常重要！）

#### Dataframe的索引

Dataframe既有行索引也有列索引，可以被看做由Series组成的字典（共用一个索引）

选择列 / 选择行 / 切片 / 布尔判断

In [2]:
# 选择行与列

df = pd.DataFrame(np.random.rand(12).reshape(3,4)*100,
                   index = ['one','two','three'],
                   columns = ['a','b','c','d'])
print(df)

               a          b          c          d
one    86.666374  52.107573  14.960190  20.194788
two    98.814542  23.369987  35.698623  43.422784
three  14.768358  12.312797  14.851012  42.170000


In [6]:
data1 = df['a']
print(data1,type(data1))

one      86.666374
two      98.814542
three    14.768358
Name: a, dtype: float64 <class 'pandas.core.series.Series'>


**注： 按照列名选择列，只选择一列输出Series，选择多列输出Dataframe**

In [4]:
data2 = df[['a','c']]
print(data2,type(data2))

               a          c
one    86.666374  14.960190
two    98.814542  35.698623
three  14.768358  14.851012 <class 'pandas.core.frame.DataFrame'>


In [5]:
# 选择行

In [10]:
data3 = df.loc['one']
print(data2,type(data3))

               a          c
one    86.666374  14.960190
two    98.814542  35.698623
three  14.768358  14.851012 <class 'pandas.core.series.Series'>


In [8]:
data4 = df.loc[['one','two']]
print(data3,type(data4))

a    86.666374
b    52.107573
c    14.960190
d    20.194788
Name: one, dtype: float64 <class 'pandas.core.frame.DataFrame'>


In [11]:
# 选择列

In [12]:
df = pd.DataFrame(np.random.rand(12).reshape(3,4)*100,
                   index = ['one','two','three'],
                   columns = ['a','b','c','d'])
print(df)

               a          b          c          d
one    27.644397  10.539288  23.454660  14.599124
two    85.672771  91.853922  12.725357  14.880208
three   8.433687  59.707479  47.901989  76.770330


In [14]:
data1 = df['a']
print(data1)

one      27.644397
two      85.672771
three     8.433687
Name: a, dtype: float64


注： df[]默认选择列，[]中写列名

In [15]:
# df.loc[] - 按index选择行 

In [22]:
df1 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                   index = ['one','two','three','four'],
                   columns = ['a','b','c','d'])
df2 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                   columns = ['a','b','c','d'])
print(df1)
print('-----------------------------------------------------')
print(df2)

               a          b          c          d
one    52.120804   7.372969  84.210659  89.123187
two    66.300147  92.252027  88.810194  96.869046
three  30.914314  90.274615  95.834611  45.432602
four   85.687115  42.676543  44.675636  25.828348
-----------------------------------------------------
           a          b          c          d
0  60.922635   8.977961  41.384896  54.299280
1  57.064524  35.341301  48.379129   6.314973
2  31.981022  32.601077  73.657284  39.477255
3  69.001291  66.796767  90.374862  16.600172


In [24]:
data1 = df1.loc['one']
data2 = df2.loc[1]
print(data1)
print('--------------------------')
print(data2)

a    52.120804
b     7.372969
c    84.210659
d    89.123187
Name: one, dtype: float64
--------------------------
a    57.064524
b    35.341301
c    48.379129
d     6.314973
Name: 1, dtype: float64


In [28]:
data6 = df2.loc[1:3]
print(data6)

           a          b          c          d
1  57.064524  35.341301  48.379129   6.314973
2  31.981022  32.601077  73.657284  39.477255
3  69.001291  66.796767  90.374862  16.600172


In [29]:
# 布尔型索引
# 和Series原理相同

In [31]:
df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                   index = ['one','two','three','four'],
                   columns = ['a','b','c','d'])
print(df)

               a          b          c          d
one    92.252871  76.862801  45.086171  39.848602
two    75.776145  65.875447  30.129789  55.600124
three  79.890361  77.905079  15.578123  70.682528
four   97.594618  89.987797  47.110704  42.403193


In [36]:
b1 = df < 20
print(b1,type(b1))
print('-----------------------------------------')
print(df[b1])

           a      b      c      d
one    False  False  False  False
two    False  False  False  False
three  False  False   True  False
four   False  False  False  False <class 'pandas.core.frame.DataFrame'>
-----------------------------------------
        a   b          c   d
one   NaN NaN        NaN NaN
two   NaN NaN        NaN NaN
three NaN NaN  15.578123 NaN
four  NaN NaN        NaN NaN


In [39]:
b2 = df['a'] > 50
print(b2,type(b2))
print('--------------------------------------------')
print(df[b2])  # 也可以书写为 df[df['a'] > 50]
print('---------------------------------------------')

one      True
two      True
three    True
four     True
Name: a, dtype: bool <class 'pandas.core.series.Series'>
--------------------------------------------
               a          b          c          d
one    92.252871  76.862801  45.086171  39.848602
two    75.776145  65.875447  30.129789  55.600124
three  79.890361  77.905079  15.578123  70.682528
four   97.594618  89.987797  47.110704  42.403193
---------------------------------------------


In [41]:
# 多重索引：比如同时索引行和列
# 先选择列再选择行 —— 相当于对于一个数据，先筛选字段，再选择数据量

In [42]:
df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                   index = ['one','two','three','four'],
                   columns = ['a','b','c','d'])
print(df)

               a          b          c          d
one    70.935502  88.812694  30.854324  16.096996
two    11.634851  10.220804  70.427860  85.766037
three   9.308033  51.045388  28.332174  20.540751
four   98.671074  28.137303   8.558584  37.311170


In [43]:
print(df['a'].loc[['one','three']])   # 选择a列的one，three行

one      70.935502
three     9.308033
Name: a, dtype: float64


In [45]:
print(df[['b','c','d']].iloc[::2])
#选择b，c，d列的one，three行

               b          c          d
one    88.812694  30.854324  16.096996
three  51.045388  28.332174  20.540751


In [46]:
print(df[df['a'] < 50].iloc[:2])   
# 选择满足判断索引的前两行数据

               a          b          c          d
two    11.634851  10.220804  70.427860  85.766037
three   9.308033  51.045388  28.332174  20.540751


#### DataFrame基本技巧

+ 数据查看、转置 / 添加、修改、删除值 / 对齐 / 排序

In [48]:
# 数据查看、转置

df = pd.DataFrame(np.random.rand(16).reshape(8,2)*100,
                   columns = ['a','b'])
print(df.head(2))
print('-----------------------------')
print(df.tail())
# .head()查看头部数据
# .tail()查看尾部数据
# 默认查看5条

print(df.T)
# .T 转置

           a          b
0  71.606402  34.608073
1  97.664953  96.640189
-----------------------------
           a          b
3   6.921770  32.642198
4  75.596959  83.457750
5  92.834388  27.336828
6  28.463524   3.583658
7  60.447622  24.384674
           0          1          2          3          4          5  \
a  71.606402  97.664953  83.811585   6.921770  75.596959  92.834388   
b  34.608073  96.640189  82.381001  32.642198  83.457750  27.336828   

           6          7  
a  28.463524  60.447622  
b   3.583658  24.384674  


In [50]:
# 添加与修改

df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                   columns = ['a','b','c','d'])
print(df)

df['e'] = 10
df.loc[4] = 20
print(df)

           a          b          c          d
0  33.644492  38.093835  25.969211  97.417244
1  96.438622  31.915684  27.877477  18.169081
2  65.937267  86.672535  95.657034  54.072658
3  25.559000  84.935437  90.733421   0.582163
           a          b          c          d   e
0  33.644492  38.093835  25.969211  97.417244  10
1  96.438622  31.915684  27.877477  18.169081  10
2  65.937267  86.672535  95.657034  54.072658  10
3  25.559000  84.935437  90.733421   0.582163  10
4  20.000000  20.000000  20.000000  20.000000  20


In [52]:
# 删除  del / drop()

df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                   columns = ['a','b','c','d'])
print(df)

del df['a']
print(df)

           a          b          c          d
0  23.792019  84.521330  75.768658  13.320024
1  22.574855  81.461973   7.193157  41.336640
2  26.268877   2.697603  98.225363  72.909128
3  62.153761  26.903136  53.121309   6.358380
           b          c          d
0  84.521330  75.768658  13.320024
1  81.461973   7.193157  41.336640
2   2.697603  98.225363  72.909128
3  26.903136  53.121309   6.358380


In [53]:
print(df.drop(0))

           b          c          d
1  81.461973   7.193157  41.336640
2   2.697603  98.225363  72.909128
3  26.903136  53.121309   6.358380


In [54]:
print(df.drop([1,2]))

           b          c          d
0  84.521330  75.768658  13.320024
3  26.903136  53.121309   6.358380


In [55]:
print(df)

           b          c          d
0  84.521330  75.768658  13.320024
1  81.461973   7.193157  41.336640
2   2.697603  98.225363  72.909128
3  26.903136  53.121309   6.358380


In [56]:
# 对齐

df1 = pd.DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D'])
df2 = pd.DataFrame(np.random.randn(7, 3), columns=['A', 'B', 'C'])
print(df1 + df2)
# DataFrame对象之间的数据自动按照列和索引（行标签）对齐

          A         B         C   D
0  0.843350 -0.050046  1.333332 NaN
1 -0.339286  0.828764 -1.307052 NaN
2  0.557021  0.951325  1.283861 NaN
3 -1.193624 -1.175859  1.047327 NaN
4 -1.523750 -0.430467 -0.115423 NaN
5  1.959141 -1.359097 -0.701307 NaN
6 -0.954429 -1.291686  2.013389 NaN
7       NaN       NaN       NaN NaN
8       NaN       NaN       NaN NaN
9       NaN       NaN       NaN NaN


In [57]:
# 排序1 - 按值排序 .sort_values
# 同样适用于Series

df1 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                   columns = ['a','b','c','d'])
print(df1)

           a          b          c          d
0  15.699211  57.382051  68.531862  27.622906
1  36.697559  75.187689  25.172645  24.916907
2  87.382660  17.152470  92.523605  77.527907
3  64.538470  38.507963  28.161702  66.102875


In [59]:
print('-----------------------升序------------------')
print(df1.sort_values(['a'], ascending = True))  # 升序
print('-----------------------降序-----------------')
print(df1.sort_values(['a'], ascending = False))  # 降序

-----------------------升序------------------
           a          b          c          d
0  15.699211  57.382051  68.531862  27.622906
1  36.697559  75.187689  25.172645  24.916907
3  64.538470  38.507963  28.161702  66.102875
2  87.382660  17.152470  92.523605  77.527907
-----------------------降序-----------------
           a          b          c          d
2  87.382660  17.152470  92.523605  77.527907
3  64.538470  38.507963  28.161702  66.102875
1  36.697559  75.187689  25.172645  24.916907
0  15.699211  57.382051  68.531862  27.622906


**ascending参数：设置升序降序，默认升序**

In [60]:
df2 = pd.DataFrame({'a':[1,1,1,1,2,2,2,2],
                  'b':list(range(8)),
                  'c':list(range(8,0,-1))})
print(df2)

   a  b  c
0  1  0  8
1  1  1  7
2  1  2  6
3  1  3  5
4  2  4  4
5  2  5  3
6  2  6  2
7  2  7  1


In [61]:
print(df2.sort_values(['a','c']))
# 多列排序，按列顺序排序

   a  b  c
3  1  3  5
2  1  2  6
1  1  1  7
0  1  0  8
7  2  7  1
6  2  6  2
5  2  5  3
4  2  4  4


In [64]:
# 排序2 - 索引排序 .sort_index

df1 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                  index = [5,4,3,2],
                   columns = ['a','b','c','d'])
df2 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
                  index = ['h','s','x','g'],
                   columns = ['a','b','c','d'])
print(df1)
print('------------------按索引排序------------------')
print(df1.sort_index())
print(df2)
print(df2.sort_index())
# 按照index排序
# 默认 ascending=True, inplace=False

           a          b          c          d
5  98.925322  37.946651  46.961225  29.832217
4  21.062018  95.184815  76.635587  30.953178
3  46.754811  55.276697   3.763255  95.706695
2  28.580988  26.254512  23.404527  52.248219
------------------按索引排序------------------
           a          b          c          d
2  28.580988  26.254512  23.404527  52.248219
3  46.754811  55.276697   3.763255  95.706695
4  21.062018  95.184815  76.635587  30.953178
5  98.925322  37.946651  46.961225  29.832217
           a          b          c          d
h   9.523900   2.736344  51.681189  51.986340
s  29.591364  21.192754  89.597373  30.115471
x  25.971183   1.390069  99.909598  26.281789
g  57.935728  81.019542  33.556127  42.709470
           a          b          c          d
g  57.935728  81.019542  33.556127  42.709470
h   9.523900   2.736344  51.681189  51.986340
s  29.591364  21.192754  89.597373  30.115471
x  25.971183   1.390069  99.909598  26.281789
