# Pandas学习教程

**Author Github: @deeptrial**

2021-04-01

East China Normal University

In [33]:
import pandas as pd
import numpy as np
import math

In [None]:
def sperate():
    """Print a horizontal rule of dashes to visually separate output sections."""
    divider = "--------------------------------"
    print(divider)

## 1. Pandas的三大数据结构

Pandas具有3种数据结构，分别是Series（系列），DataFrame（数据帧），Panel（面板）。三种数据结构均建立在numpy基础上，因此具有较好的执行效率。这些数据结构的特点如下:

- Series表示1维同构数组（所有元素类型相同），数组尺寸**不可变**，数据值可变
- DataFrame表示2维异构数组，数组尺寸可变，数据值可变
- ~~Panel表示3维异构数组，数组尺寸可变，数据值可变~~ 新版本已使用MultiIndex代替Panel

因此三种数据结构间的关系是：Series可组成DataFrame，DataFrame可组成Panel

### 1.1 Series

Pandas系列可以使用以下构造函数创建：

```python
pandas.Series( data, index, dtype, copy)
```
- data: 数据形式，可以是numpy array，list, dictionary， constant
- index：索引值，默认np.arange(n)
- dtype: 数据类型，默认将自动推断
- copy：表示是否复制数据，默认False

![image.png](https://pic1.zhimg.com/v2-3c3e48377d299270579be01f6d9fb79c_b.jpg)

#### 创建Series

In [89]:
# Three ways to construct a Series.
# 1) From a list with explicit labels; the index mixes strings with the int 4.
mixed_labels = ['a', 'b', 'c', 'd', 4]
sp = pd.Series(data=[11, 3.5, 2, 5, 12], index=mixed_labels)
print(sp)

# 2) From a constant: the single value is repeated for every index label.
s = pd.Series(data='', index=mixed_labels)
print(s)

# 3) From a dict: keys become the index, values become the data.
s = pd.Series(dict(a=1, b=2, c=3, d=4))
print(s)

a    11.0
b     3.5
c     2.0
d     5.0
4    12.0
dtype: float64
a    
b    
c    
d    
4    
dtype: object
a    1
b    2
c    3
d    4
dtype: int64


In [90]:
# Inspect the Series: count(), dtype, and the describe() summary,
# printed on one line separated by spaces.
summary = (s.count(), s.dtype, s.describe())
print(*summary)

4 int64 count    4.000000
mean     2.500000
std      1.290994
min      1.000000
25%      1.750000
50%      2.500000
75%      3.250000
max      4.000000
dtype: float64


In [91]:
# Read the underlying values and the index, then replace the index labels.
print(s.values, s.index)
s.index = ['a', 'c', 'b', 'd']
print(s)

# Access data. Single integer positions go through .iloc: a bare s[0] / s[-1]
# on a string-labelled Series relies on the deprecated positional fallback of
# Series.__getitem__. The integer slices s[:3] and s[-3:] are left unchanged.
print(s.iloc[0], s['a'], s.iloc[-1], s[:3], s[-3:])
print(s.a)                  # attribute-style access by label
print(s[['a', 'b', 'd']])   # select several labels at once

# Modify data: one element by position, then several labels at once.
s.iloc[0] = 6
print(s)
s[['a', 'b', 'd']] = 7
print(s)

# Name the Series and its index.
s.name = "no means"
s.index.name = "myindex"
print(s)

[1 2 3 4] Index(['a', 'b', 'c', 'd'], dtype='object')
a    1
c    2
b    3
d    4
dtype: int64
1 1 4 a    1
c    2
b    3
dtype: int64 c    2
b    3
d    4
dtype: int64
1
a    1
b    3
d    4
dtype: int64
a    6
c    2
b    3
d    4
dtype: int64
a    7
c    2
b    7
d    7
dtype: int64
myindex
a    7
c    2
b    7
d    7
Name: no means, dtype: int64


In [92]:
# Operations on a Series.
# Boolean comparison, evaluated element by element.
above_five = s > 5
print(above_five)

# Arithmetic with a scalar.
plus_five = s + 5
doubled = s * 2
print(plus_five)
print(doubled)

# NumPy functions can be applied to a Series directly.
# The math module cannot be used the same way, e.g. math.exp(s).
print(np.exp(s))

# Adding two Series: values with the same index label are summed, and labels
# present in only one of the two Series become NaN.
aligned_sum = s + sp
print(sp, s, aligned_sum)

# Membership test. This prints False even though 7 occurs among the values,
# because `in` on a Series checks the index labels.
print(7 in s)

# Sort by index label.
print(s.sort_index())

myindex
a     True
c    False
b     True
d     True
Name: no means, dtype: bool
myindex
a    12
c     7
b    12
d    12
Name: no means, dtype: int64
myindex
a    14
c     4
b    14
d    14
Name: no means, dtype: int64
myindex
a    1096.633158
c       7.389056
b    1096.633158
d    1096.633158
Name: no means, dtype: float64
a    11.0
b     3.5
c     2.0
d     5.0
4    12.0
dtype: float64 myindex
a    7
c    2
b    7
d    7
Name: no means, dtype: int64 4     NaN
a    18.0
b    10.5
c     4.0
d    12.0
dtype: float64
False
myindex
a    7
b    7
c    2
d    7
Name: no means, dtype: int64


### 1.2 DataFrame

pandas中的DataFrame可以使用以下构造函数创建:
```python
pandas.DataFrame( data, index, columns, dtype, copy)
```
- data: 数据形式，可以是numpy array，list, map,dictionary， constant,dataframe
- index：行标签，默认np.arange(n)
- columns：列标签，默认np.arange(n)
- dtype: 数据类型，默认将自动推断
- copy：表示是否复制数据，默认False

![image.png](https://pic1.zhimg.com/v2-b03baaccf0ca7ec26c97a979fc6540f0_b.jpg)

In [130]:
# Create a DataFrame from a list of rows with explicit column names.
df = pd.DataFrame([[1, 'a'], [2, 'b'], [3, 'c'], [4, 'd']], columns=['kk', 'dd'])
print(df)

# Create a DataFrame from a dict of Series. 'one' has no value for label 'd',
# so that cell is NaN in the resulting frame.
d = {
    'one': pd.Series([1, 2, 3], index=['a', 'b', 'c']),
    'two': pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']),
}
df = pd.DataFrame(d)
print(df)

# Column selection: a single label, then a list of labels.
print('column select:', df['one'])
print(df[['one', 'two']])

# Add columns: from a plain list, and from a Series aligned on the row index.
df["three"] = [1, 2, 3, 4]
df['four'] = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
print(df)

# Delete columns: with `del`, and with pop() (the popped column is discarded).
del df['four']
print(df)

df.pop('one')
print(df)

# Row selection: by label with .loc, by position with .iloc, and by slice.
print('select row', df.loc['a'], df.loc[['a'], ['three', 'two']], type(df.loc['a']))
print('select row-2', df.iloc[0], df.iloc[:, 0], df.iloc[[0, 2], 0], type(df.iloc[0]))
print(df[0:3])

# Append rows. DataFrame.append was removed in pandas 2.0, so use pd.concat;
# it returns a new frame, so the result must be assigned back.
addrow = pd.DataFrame([[5, 5], [6, 6]], columns=['two', 'three'], index=['e', 'f'])
df = pd.concat([df, addrow])
print(df)

# Delete a row by label. Unlike pop(), drop() returns a new frame, so the
# result must be assigned back.
df = df.drop('a')
print(df)

   kk dd
0   1  a
1   2  b
2   3  c
3   4  d
   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4
column select: a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64
   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4
   one  two  three  four
a  1.0    1      1  10.0
b  2.0    2      2  20.0
c  3.0    3      3  30.0
d  NaN    4      4   NaN
   one  two  three
a  1.0    1      1
b  2.0    2      2
c  3.0    3      3
d  NaN    4      4
   two  three
a    1      1
b    2      2
c    3      3
d    4      4
select row two      1
three    1
Name: a, dtype: int64    three  two
a      1    1 <class 'pandas.core.series.Series'>
select row-2 two      1
three    1
Name: a, dtype: int64 a    1
b    2
c    3
d    4
Name: two, dtype: int64 a    1
c    3
Name: two, dtype: int64 <class 'pandas.core.series.Series'>
   two  three
a    1      1
b    2      2
c    3      3
   two  three
a    1      1
b    2      2
c    3      3
d    4      4
e    5      5
f    6      6
   two  three
b    2      2
c    3      3
d    4      4
e    5      5
f    6      6

In [131]:
# Inspect the DataFrame: per-column count(), dtypes, and the describe()
# summary, printed on one line separated by spaces.
overview = [df.count(), df.dtypes, df.describe()]
print(*overview)

two      5
three    5
dtype: int64 two      int64
three    int64
dtype: object             two     three
count  5.000000  5.000000
mean   4.000000  4.000000
std    1.581139  1.581139
min    2.000000  2.000000
25%    3.000000  3.000000
50%    4.000000  4.000000
75%    5.000000  5.000000
max    6.000000  6.000000


In [136]:
# Show the current row index and column labels, then relabel both:
# columns are renamed in place and the rows get a fresh 0..n-1 integer index.
print(df.index, df.columns)
new_names = {'two': 'oo', 'three': 'tt'}
df.rename(columns=new_names, inplace=True)
row_count = df.shape[0]
df.index = np.arange(row_count)
print(df)

# Conditional access: select the rows whose 'oo' value is at least 5.
choose = df['oo'] >= 5
print(df.loc[choose, :])

# Overwrite single cells: by label with .loc, by position with .iloc.
df.loc[3, 'oo'] = 10
df.iloc[3, 1] = 15
print(df)

Int64Index([0, 1, 2, 3, 4], dtype='int64') Index(['oo', 'tt'], dtype='object')
   oo  tt
0   2   2
1   3   3
2   4   4
3  10   5
4   6   6
   oo  tt
3  10   5
4   6   6
   oo  tt
0   2   2
1   3   3
2   4   4
3  10  15
4   6   6


### 1.3 ~~Panel~~ MultiIndex

Panel主要用于三维数据，但在实际中，高维数据常使用多层级索引MultiIndex表示，操作更加灵活，可以表示3维，甚至更高维的数据。因此在新版Pandas中，panel已被移除

In [147]:
# Build a 4x4 table of random integers in [50, 100) with two-level labels on
# both axes: (subject, term) for the columns and (class, student) for the rows.
scores = np.random.randint(50, 100, size=(4, 4))
subject_terms = pd.MultiIndex.from_product([["math", "physics"], ["term1", "term2"]])
class_students = pd.MultiIndex.from_tuples([
    ("class1", "LiLei"),
    ("class1", "HanMeimei"),
    ("class2", "DaChun"),
    ("class2", "RuHua"),
])
p = pd.DataFrame(scores, columns=subject_terms, index=class_students)
print(p)

# Name the two levels of the row index.
p.index.names = ["class", "names"]
print(p)

# MultiIndex selection: outer then inner column level, and an outer row label
# through .loc.
print(p["math"]["term1"])
print(p.loc["class1"])

                  math       physics      
                 term1 term2   term1 term2
class1 LiLei        68    67      83    73
       HanMeimei    76    83      70    97
class2 DaChun       84    95      55    72
       RuHua        81    75      55    56
                  math       physics      
                 term1 term2   term1 term2
class  names                              
class1 LiLei        68    67      83    73
       HanMeimei    76    83      70    97
class2 DaChun       84    95      55    72
       RuHua        81    75      55    56
class   names    
class1  LiLei        68
        HanMeimei    76
class2  DaChun       84
        RuHua        81
Name: term1, dtype: int32
           math       physics      
          term1 term2   term1 term2
names                              
LiLei        68    67      83    73
HanMeimei    76    83      70    97
