# Pandas

In [1]:
import pandas as pd
import numpy as np

## 1 生成对象

In [2]:
# Build a Series from a plain Python list.
# The NaN entry forces the whole Series to float64.
s = pd.Series(data=[1, 3, 5, np.nan, 7])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    7.0
dtype: float64

In [5]:
# Build a DataFrame (a 2-D labelled table) from a dict that maps each
# column name to a list of values; every column keeps its own dtype.
df2 = pd.DataFrame(
    {
        'A': [1],
        'B': [pd.Timestamp('20240101')],
        'C': ['xxx'],
    }
)
df2

Unnamed: 0,A,B,C
0,1,2024-01-01,xxx


In [6]:
# Build a DatetimeIndex of 6 consecutive days starting 2024-01-01;
# it is used as the row index of the DataFrame created in the next cell.
d = pd.date_range(start='20240101', periods=6, freq='D')
d

DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-06'],
              dtype='datetime64[ns]', freq='D')

In [7]:
# Build a 6-row x 4-column DataFrame of i.i.d. N(0, 1) draws, using the
# dates in `d` as the row index and 'A'..'D' as the column labels.
# A seeded Generator replaces the unseeded global np.random.randn so that
# Restart Kernel -> Run All reproduces the same table every time. The values
# differ from draws made by the previous unseeded call.
df = pd.DataFrame(
    np.random.default_rng(42).standard_normal((6, 4)),
    index=d,
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
2024-01-01,-0.481355,0.743276,-0.026856,0.951764
2024-01-02,-0.349192,-0.951295,-1.01681,-1.419096
2024-01-03,0.77691,0.085999,-0.268158,-0.048869
2024-01-04,-1.100291,-0.380297,-0.784402,-1.813289
2024-01-05,0.968556,-0.996243,1.031896,0.953733
2024-01-06,-0.139715,-0.811698,0.655481,-2.787622


## 2 查看数据

In [9]:
df2.dtypes  # dtype of each column

A             int64
B    datetime64[ns]
C            object
dtype: object

In [10]:
df.head(n=5)  # first 5 rows (5 is the default)

Unnamed: 0,A,B,C,D
2024-01-01,-0.481355,0.743276,-0.026856,0.951764
2024-01-02,-0.349192,-0.951295,-1.01681,-1.419096
2024-01-03,0.77691,0.085999,-0.268158,-0.048869
2024-01-04,-1.100291,-0.380297,-0.784402,-1.813289
2024-01-05,0.968556,-0.996243,1.031896,0.953733


In [11]:
df.tail(n=5)  # last 5 rows (5 is the default)

Unnamed: 0,A,B,C,D
2024-01-02,-0.349192,-0.951295,-1.01681,-1.419096
2024-01-03,0.77691,0.085999,-0.268158,-0.048869
2024-01-04,-1.100291,-0.380297,-0.784402,-1.813289
2024-01-05,0.968556,-0.996243,1.031896,0.953733
2024-01-06,-0.139715,-0.811698,0.655481,-2.787622


In [13]:
df.index # row index (shown as the leftmost column of the rendered table)

DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-06'],
              dtype='datetime64[ns]', freq='D')

In [14]:
df.columns  # column labels

Index(['A', 'B', 'C', 'D'], dtype='object')

In [15]:
df.describe() # descriptive statistics per column: count, mean, std, min, quartiles, max

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.054181,-0.385043,-0.068142,-0.693897
std,0.7884,0.688658,0.798552,1.549296
min,-1.100291,-0.996243,-1.01681,-2.787622
25%,-0.448314,-0.916396,-0.655341,-1.714741
50%,-0.244453,-0.595998,-0.147507,-0.733983
75%,0.547754,-0.030575,0.484897,0.701606
max,0.968556,0.743276,1.031896,0.953733


## 3 排序

In [18]:
# Sort along the column axis by column label, in descending order (D, C, B, A)
df.sort_index(axis='columns', ascending=False)

Unnamed: 0,D,C,B,A
2024-01-01,0.951764,-0.026856,0.743276,-0.481355
2024-01-02,-1.419096,-1.01681,-0.951295,-0.349192
2024-01-03,-0.048869,-0.268158,0.085999,0.77691
2024-01-04,-1.813289,-0.784402,-0.380297,-1.100291
2024-01-05,0.953733,1.031896,-0.996243,0.968556
2024-01-06,-2.787622,0.655481,-0.811698,-0.139715


In [19]:
# Sort the rows by the values in column 'B', smallest first
df.sort_values(by='B', ascending=True)

Unnamed: 0,A,B,C,D
2024-01-05,0.968556,-0.996243,1.031896,0.953733
2024-01-02,-0.349192,-0.951295,-1.01681,-1.419096
2024-01-06,-0.139715,-0.811698,0.655481,-2.787622
2024-01-04,-1.100291,-0.380297,-0.784402,-1.813289
2024-01-03,0.77691,0.085999,-0.268158,-0.048869
2024-01-01,-0.481355,0.743276,-0.026856,0.951764


## 4 选择数据

In [20]:
df['A']  # select a single column by label; the result is a Series named 'A'

2024-01-01   -0.481355
2024-01-02   -0.349192
2024-01-03    0.776910
2024-01-04   -1.100291
2024-01-05    0.968556
2024-01-06   -0.139715
Freq: D, Name: A, dtype: float64

In [21]:
df[1:3]  # integer slice selects rows by position; the end is excluded (rows 1 and 2)

Unnamed: 0,A,B,C,D
2024-01-02,-0.349192,-0.951295,-1.01681,-1.419096
2024-01-03,0.77691,0.085999,-0.268158,-0.048869


In [22]:
df['20240101':'20240104']  # date-string slice selects rows by index label; both endpoints are included

Unnamed: 0,A,B,C,D
2024-01-01,-0.481355,0.743276,-0.026856,0.951764
2024-01-02,-0.349192,-0.951295,-1.01681,-1.419096
2024-01-03,0.77691,0.085999,-0.268158,-0.048869
2024-01-04,-1.100291,-0.380297,-0.784402,-1.813289


### 4.1 按标签选择