## 基础处理部分

- 为什么要使用 Pandas？
    - 核心数据结构
        - DataFrame
        - Panel（已在 pandas 0.25 中移除，新版本请使用 MultiIndex 的 DataFrame 代替）
        - Series
    - 基本操作
    - 运算
    - 画图
    - 文件的存储与读取

In [2]:
import numpy as np

# Simulate the price changes of 10 stocks over 5 days by sampling a normal
# distribution with mean 0 and standard deviation 1.
# NOTE: no random seed is set, so the values differ on every run.
stock_change = np.random.normal(loc=0, scale=1, size=(10, 5))

In [3]:
# Show the raw NumPy array of simulated changes (10 rows x 5 columns).
stock_change

array([[-1.10133108, -0.45264045,  0.04896366, -1.45817809, -0.66585844],
       [-0.55865509,  0.94867983,  0.72347952,  0.95251442,  1.21712382],
       [ 0.18506051, -0.29673547, -0.84917406,  2.0491795 , -0.43411394],
       [ 0.1023863 , -0.50544421,  0.10518903, -0.33421259, -1.11901822],
       [-0.21151869, -0.34132537,  0.87202935, -0.77328705, -0.08528794],
       [ 0.19695036, -1.48562947,  0.10914836,  0.2070094 , -2.21089279],
       [-1.60466388, -0.43091273, -0.18268187,  0.28052723, -0.20018938],
       [-0.20398917,  0.76353885, -0.05621198,  0.23922946, -0.50990146],
       [-0.72279933, -1.53008923, -1.46388131,  0.03981203,  1.7355171 ],
       [ 1.06056249,  1.38464048, -0.92652397, -1.22024321, -0.01187094]])

In [4]:
import pandas as pd

# Wrap the array in a DataFrame. No labels are passed, so rows and columns
# get the default integer labels (0-9 and 0-4).
pd.DataFrame(stock_change)

Unnamed: 0,0,1,2,3,4
0,-1.101331,-0.45264,0.048964,-1.458178,-0.665858
1,-0.558655,0.94868,0.72348,0.952514,1.217124
2,0.185061,-0.296735,-0.849174,2.04918,-0.434114
3,0.102386,-0.505444,0.105189,-0.334213,-1.119018
4,-0.211519,-0.341325,0.872029,-0.773287,-0.085288
5,0.19695,-1.485629,0.109148,0.207009,-2.210893
6,-1.604664,-0.430913,-0.182682,0.280527,-0.200189
7,-0.203989,0.763539,-0.056212,0.239229,-0.509901
8,-0.722799,-1.530089,-1.463881,0.039812,1.735517
9,1.060562,1.38464,-0.926524,-1.220243,-0.011871


In [5]:
# Attach row labels: one name per stock, numbered 1 through 10.
stock_row = list(map("股票{}".format, range(1, 11)))
stock_change_df = pd.DataFrame(data=stock_change, index=stock_row)

In [6]:
# Show the frame: rows now carry the stock names, while the columns still
# use the default integer labels 0-4.
stock_change_df

Unnamed: 0,0,1,2,3,4
股票1,-1.101331,-0.45264,0.048964,-1.458178,-0.665858
股票2,-0.558655,0.94868,0.72348,0.952514,1.217124
股票3,0.185061,-0.296735,-0.849174,2.04918,-0.434114
股票4,0.102386,-0.505444,0.105189,-0.334213,-1.119018
股票5,-0.211519,-0.341325,0.872029,-0.773287,-0.085288
股票6,0.19695,-1.485629,0.109148,0.207009,-2.210893
股票7,-1.604664,-0.430913,-0.182682,0.280527,-0.200189
股票8,-0.203989,0.763539,-0.056212,0.239229,-0.509901
股票9,-0.722799,-1.530089,-1.463881,0.039812,1.735517
股票10,1.060562,1.38464,-0.926524,-1.220243,-0.011871


In [7]:
# Attach column labels: 5 consecutive business days (freq="B") starting
# on 2024-01-01, then rebuild the frame with both row and column labels.
date = pd.date_range(start="20240101", periods=5, freq="B")
data = pd.DataFrame(data=stock_change, index=stock_row, columns=date)
data

Unnamed: 0,2024-01-01,2024-01-02,2024-01-03,2024-01-04,2024-01-05
股票1,-1.101331,-0.45264,0.048964,-1.458178,-0.665858
股票2,-0.558655,0.94868,0.72348,0.952514,1.217124
股票3,0.185061,-0.296735,-0.849174,2.04918,-0.434114
股票4,0.102386,-0.505444,0.105189,-0.334213,-1.119018
股票5,-0.211519,-0.341325,0.872029,-0.773287,-0.085288
股票6,0.19695,-1.485629,0.109148,0.207009,-2.210893
股票7,-1.604664,-0.430913,-0.182682,0.280527,-0.200189
股票8,-0.203989,0.763539,-0.056212,0.239229,-0.509901
股票9,-0.722799,-1.530089,-1.463881,0.039812,1.735517
股票10,1.060562,1.38464,-0.926524,-1.220243,-0.011871


### DataFrame的属性
-   index
-   columns
-   values
-   shape
-   size
-   dtypes
-   ndim
-   T

In [8]:
# Dimensions of the frame as (number of rows, number of columns).
data.shape

(10, 5)

In [9]:
# Row labels of the frame.
data.index

Index(['股票1', '股票2', '股票3', '股票4', '股票5', '股票6', '股票7', '股票8', '股票9', '股票10'], dtype='object')

In [10]:
# Column labels of the frame (here a DatetimeIndex of business days).
data.columns

DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05'],
              dtype='datetime64[ns]', freq='B')

In [11]:
# The underlying data as a NumPy array, without any labels.
data.values

array([[-1.10133108, -0.45264045,  0.04896366, -1.45817809, -0.66585844],
       [-0.55865509,  0.94867983,  0.72347952,  0.95251442,  1.21712382],
       [ 0.18506051, -0.29673547, -0.84917406,  2.0491795 , -0.43411394],
       [ 0.1023863 , -0.50544421,  0.10518903, -0.33421259, -1.11901822],
       [-0.21151869, -0.34132537,  0.87202935, -0.77328705, -0.08528794],
       [ 0.19695036, -1.48562947,  0.10914836,  0.2070094 , -2.21089279],
       [-1.60466388, -0.43091273, -0.18268187,  0.28052723, -0.20018938],
       [-0.20398917,  0.76353885, -0.05621198,  0.23922946, -0.50990146],
       [-0.72279933, -1.53008923, -1.46388131,  0.03981203,  1.7355171 ],
       [ 1.06056249,  1.38464048, -0.92652397, -1.22024321, -0.01187094]])

In [12]:
# Transpose: rows become columns and columns become rows.
data.T

Unnamed: 0,股票1,股票2,股票3,股票4,股票5,股票6,股票7,股票8,股票9,股票10
2024-01-01,-1.101331,-0.558655,0.185061,0.102386,-0.211519,0.19695,-1.604664,-0.203989,-0.722799,1.060562
2024-01-02,-0.45264,0.94868,-0.296735,-0.505444,-0.341325,-1.485629,-0.430913,0.763539,-1.530089,1.38464
2024-01-03,0.048964,0.72348,-0.849174,0.105189,0.872029,0.109148,-0.182682,-0.056212,-1.463881,-0.926524
2024-01-04,-1.458178,0.952514,2.04918,-0.334213,-0.773287,0.207009,0.280527,0.239229,0.039812,-1.220243
2024-01-05,-0.665858,1.217124,-0.434114,-1.119018,-0.085288,-2.210893,-0.200189,-0.509901,1.735517,-0.011871


In [13]:
# First rows of the frame; with no argument head() returns 5 rows.
data.head()

Unnamed: 0,2024-01-01,2024-01-02,2024-01-03,2024-01-04,2024-01-05
股票1,-1.101331,-0.45264,0.048964,-1.458178,-0.665858
股票2,-0.558655,0.94868,0.72348,0.952514,1.217124
股票3,0.185061,-0.296735,-0.849174,2.04918,-0.434114
股票4,0.102386,-0.505444,0.105189,-0.334213,-1.119018
股票5,-0.211519,-0.341325,0.872029,-0.773287,-0.085288


In [14]:
# Last rows of the frame; with no argument tail() returns 5 rows.
data.tail()

Unnamed: 0,2024-01-01,2024-01-02,2024-01-03,2024-01-04,2024-01-05
股票6,0.19695,-1.485629,0.109148,0.207009,-2.210893
股票7,-1.604664,-0.430913,-0.182682,0.280527,-0.200189
股票8,-0.203989,0.763539,-0.056212,0.239229,-0.509901
股票9,-0.722799,-1.530089,-1.463881,0.039812,1.735517
股票10,1.060562,1.38464,-0.926524,-1.220243,-0.011871


In [15]:
# A single index label cannot be modified in place; an assignment such as
#     data.index[2] = "股票88"
# is rejected by pandas.
#
# To change the row labels, build a complete new list of labels and assign
# it to `data.index` as a whole.
stock_ = list(map("股票_{}".format, range(1, 11)))
data.index = stock_
data

Unnamed: 0,2024-01-01,2024-01-02,2024-01-03,2024-01-04,2024-01-05
股票_1,-1.101331,-0.45264,0.048964,-1.458178,-0.665858
股票_2,-0.558655,0.94868,0.72348,0.952514,1.217124
股票_3,0.185061,-0.296735,-0.849174,2.04918,-0.434114
股票_4,0.102386,-0.505444,0.105189,-0.334213,-1.119018
股票_5,-0.211519,-0.341325,0.872029,-0.773287,-0.085288
股票_6,0.19695,-1.485629,0.109148,0.207009,-2.210893
股票_7,-1.604664,-0.430913,-0.182682,0.280527,-0.200189
股票_8,-0.203989,0.763539,-0.056212,0.239229,-0.509901
股票_9,-0.722799,-1.530089,-1.463881,0.039812,1.735517
股票_10,1.060562,1.38464,-0.926524,-1.220243,-0.011871


In [16]:
# Reset the index to the default integer index (0, 1, 2, ...).
# drop=False keeps the previous index as a regular column named "index";
# drop=True would discard the previous index instead.
# The result is only displayed, not assigned back to `data`.
data.reset_index(drop=False)

Unnamed: 0,index,2024-01-01 00:00:00,2024-01-02 00:00:00,2024-01-03 00:00:00,2024-01-04 00:00:00,2024-01-05 00:00:00
0,股票_1,-1.101331,-0.45264,0.048964,-1.458178,-0.665858
1,股票_2,-0.558655,0.94868,0.72348,0.952514,1.217124
2,股票_3,0.185061,-0.296735,-0.849174,2.04918,-0.434114
3,股票_4,0.102386,-0.505444,0.105189,-0.334213,-1.119018
4,股票_5,-0.211519,-0.341325,0.872029,-0.773287,-0.085288
5,股票_6,0.19695,-1.485629,0.109148,0.207009,-2.210893
6,股票_7,-1.604664,-0.430913,-0.182682,0.280527,-0.200189
7,股票_8,-0.203989,0.763539,-0.056212,0.239229,-0.509901
8,股票_9,-0.722799,-1.530089,-1.463881,0.039812,1.735517
9,股票_10,1.060562,1.38464,-0.926524,-1.220243,-0.011871


In [17]:
# Build a small sales table (month, year, sale) that the following cells
# use to demonstrate setting a new index.
df = pd.DataFrame(
    {
        "month": [1, 4, 7, 10],
        "year": [2012, 2014, 2013, 2014],
        "sale": [55, 40, 84, 31],
    }
)
df

Unnamed: 0,month,year,sale
0,1,2012,55
1,4,2014,40
2,7,2013,84
3,10,2014,31


In [18]:
# Use the "month" column as the new index. drop=True removes "month" from
# the regular columns once it becomes the index.
# The result is only displayed, not assigned back to `df`.
df.set_index("month", drop=True)

Unnamed: 0_level_0,year,sale
month,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2012,55
4,2014,40
7,2013,84
10,2014,31


In [19]:
# Build a two-level index by passing several column names at once:
# "year" becomes the outer level and "month" the inner level.
new_df = df.set_index(keys=["year", "month"])
new_df

Unnamed: 0_level_0,Unnamed: 1_level_0,sale
year,month,Unnamed: 2_level_1
2012,1,55
2014,4,40
2013,7,84
2014,10,31


In [20]:
# The index is now a MultiIndex whose entries are (year, month) pairs.
new_df.index

MultiIndex([(2012,  1),
            (2014,  4),
            (2013,  7),
            (2014, 10)],
           names=['year', 'month'])

### MultiIndex
多级或分层索引对象
- index属性
    - names：各个 level 的名称
    - levels：每个 level 中去重后的标签值（按 level 分别存放）

In [21]:
# Names of the index levels.
new_df.index.names

FrozenList(['year', 'month'])

In [22]:
# Unique label values stored for each level, one list per level.
new_df.index.levels

FrozenList([[2012, 2013, 2014], [1, 4, 7, 10]])

## Series
- 带索引的一维数组
- 属性
    - index
    - values


> 总结：DataFrame 是 Series 的容器，Panel 是 DataFrame 的容器（Panel 已在 pandas 0.25 中被移除，新版本中不再可用）

In [23]:
# Create a Series from a NumPy array with explicit index labels.
# np.arange(3, 9, 2) yields the values 3, 5, 7.
pd.Series(np.arange(3, 9, 2), index=["a", "b", "c"])

a    3
b    5
c    7
dtype: int32

In [24]:
# Alternatively, create a Series from a dict: the keys become the index
# labels and the values become the data.
pd.Series({'red':100, 'blue':200, 'green':500, 'yellow':1000})

red        100
blue       200
green      500
yellow    1000
dtype: int64

In [25]:
# Select the second row of `data` by position; a single row of a DataFrame
# comes back as a Series.
sr = data.iloc[1]

In [26]:
# The Series index is made of the DataFrame's column labels.
sr.index

DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05'],
              dtype='datetime64[ns]', freq='B')

In [27]:
# The underlying values of the Series as a NumPy array.
sr.values

array([-0.55865509,  0.94867983,  0.72347952,  0.95251442,  1.21712382])