Pandas入门文档

## 生成对象

In [2]:
# Build a Series from a plain list of values. No index is supplied, so
# pandas generates the default integer index automatically.
import numpy as np
import pandas as pd

s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
print(s)

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64


In [6]:
# First step toward a DataFrame with a datetime index: create six
# consecutive dates starting at 2013-01-01 to serve as the row labels.
dates = pd.date_range(start="20130101", periods=6)
print(dates)

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')


In [10]:
# Create a 6-row by 4-column frame of random values: the rows are labelled
# by `dates` and the columns by A, B, C, D.
# No random seed is set here, so the numbers differ on every run.
df = pd.DataFrame(
    data=np.random.rand(6, 4),
    index=dates,
    columns=list("ABCD"),
)
df

Unnamed: 0,A,B,C,D
2013-01-01,0.193888,0.768638,0.9885,0.972023
2013-01-02,0.575941,0.528082,0.169029,0.290966
2013-01-03,0.75027,0.881129,0.063214,0.344896
2013-01-04,0.183613,0.226129,0.226496,0.547569
2013-01-05,0.374761,0.799724,0.746518,0.158145
2013-01-06,0.557271,0.89636,0.313586,0.529015


In [15]:
# Inspect the data.
# df.head(n) returns the first n rows and df.tail(n) the last n rows;
# n defaults to 5 in both cases.
print(df.head(n=2), df.tail(n=1), sep="\n")

                   A         B         C         D
2013-01-01  0.193888  0.768638  0.988500  0.972023
2013-01-02  0.575941  0.528082  0.169029  0.290966
                   A        B         C         D
2013-01-06  0.557271  0.89636  0.313586  0.529015


In [18]:
# Transpose the frame: row labels become column labels and vice versa.
df.transpose()

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,0.193888,0.575941,0.75027,0.183613,0.374761,0.557271
B,0.768638,0.528082,0.881129,0.226129,0.799724,0.89636
C,0.9885,0.169029,0.063214,0.226496,0.746518,0.313586
D,0.972023,0.290966,0.344896,0.547569,0.158145,0.529015


In [22]:
# Sort along an axis: axis=0 ("index") orders by row labels,
# axis=1 ("columns") orders by column labels.
# Here the rows are sorted by their labels in descending order.
df.sort_index(axis="index", ascending=False)

Unnamed: 0,A,B,C,D
2013-01-06,0.557271,0.89636,0.313586,0.529015
2013-01-05,0.374761,0.799724,0.746518,0.158145
2013-01-04,0.183613,0.226129,0.226496,0.547569
2013-01-03,0.75027,0.881129,0.063214,0.344896
2013-01-02,0.575941,0.528082,0.169029,0.290966
2013-01-01,0.193888,0.768638,0.9885,0.972023


In [24]:
# Sort by values: order the rows by column "B", largest value first.
df.sort_values("B", ascending=False)

Unnamed: 0,A,B,C,D
2013-01-06,0.557271,0.89636,0.313586,0.529015
2013-01-03,0.75027,0.881129,0.063214,0.344896
2013-01-05,0.374761,0.799724,0.746518,0.158145
2013-01-01,0.193888,0.768638,0.9885,0.972023
2013-01-02,0.575941,0.528082,0.169029,0.290966
2013-01-04,0.183613,0.226129,0.226496,0.547569


## 按标签选择

`loc[]`根据DataFrame的行标和列标进行数据的筛选, 其接受两个参数: 行标和列标。当列标被省略时, 默认获取整行数据; 当行标用`:`表示时, 则获取所选列的全部数据

In [35]:
# Select a single row by its label (the first date); the explicit `:`
# keeps every column of that row.
df.loc[dates[0], :]

A    0.193888
B    0.768638
C    0.988500
D    0.972023
Name: 2013-01-01 00:00:00, dtype: float64

In [36]:
# Select several columns by label; the `:` row selector keeps all rows.
columns_to_keep = ["A", "B"]
df.loc[:, columns_to_keep]

Unnamed: 0,A,B
2013-01-01,0.193888,0.768638
2013-01-02,0.575941,0.528082
2013-01-03,0.75027,0.881129
2013-01-04,0.183613,0.226129
2013-01-05,0.374761,0.799724
2013-01-06,0.557271,0.89636


## 按位置选择

In [34]:
# Select by integer position: the row at position 3 (the fourth row),
# with every column kept.
df.iloc[3, :]

Unnamed: 0,A,B,C,D
