## 1 Pandas基础

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the sample data from a CSV file into a DataFrame
data = pd.read_csv('./test.csv')
data.head()    # head() shows the first 5 rows by default; pass n to show a different number

Unnamed: 0,date,temperature,load
0,12/08/19,25.0,1031
1,12/09/19,22.0,1081
2,12/10/19,27.0,1063
3,12/11/19,28.0,1090
4,12/12/19,21.0,1005


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe()

Unnamed: 0,temperature,load
count,10.0,11.0
mean,22.0,1051.272727
std,6.51494,27.463033
min,11.0,1005.0
25%,19.5,1026.0
50%,22.0,1063.0
75%,26.5,1066.0
max,32.0,1090.0


In [4]:
# Access the building blocks of the DataFrame
columns = data.columns    # column labels
index = data.index        # row labels
np_data = data.values     # the DataFrame's contents as a NumPy array

In [5]:
# Show the column labels (a pandas Index)
columns

Index(['date', 'temperature', 'load'], dtype='object')

In [6]:
# Show the row index
index

RangeIndex(start=0, stop=11, step=1)

In [7]:
# First row of the converted array, and the type of the converted object
np_data[0], type(np_data)

(array(['12/08/19', 25.0, 1031], dtype=object), numpy.ndarray)

In [8]:
# Inspect the dtype of each column in the DataFrame
data.dtypes

date            object
temperature    float64
load             int64
dtype: object

In [9]:
# Count how many columns have each dtype.
# DataFrame.get_dtype_counts() was deprecated in pandas 0.25 and removed in 1.0;
# dtypes.value_counts() is the documented replacement and works on old and new versions.
data.dtypes.value_counts()

float64    1
int64      1
object     1
dtype: int64

In [10]:
# Selecting a single column yields a Series (a labelled 1-D array, comparable to a NumPy array)
load = data['load']
print(type(load))
load.iloc[1:3]    # positional slice: the elements at positions 1 and 2

<class 'pandas.core.series.Series'>


1    1081
2    1063
Name: load, dtype: int64

In [11]:
# Convert the Series into a DataFrame
load_df = load.to_frame()      # a DataFrame represents a table
load_df.head()

Unnamed: 0,load
0,1031
1,1081
2,1063
3,1090
4,1005


In [12]:
load.value_counts()   # count how many times each distinct value occurs in the Series

1021    2
1066    2
1055    1
1005    1
1081    1
1031    1
1065    1
1063    1
1090    1
Name: load, dtype: int64

In [13]:
load.value_counts(normalize=True)   # normalize=True returns relative frequencies instead of counts

1021    0.181818
1066    0.181818
1055    0.090909
1005    0.090909
1081    0.090909
1031    0.090909
1065    0.090909
1063    0.090909
1090    0.090909
Name: load, dtype: float64

In [14]:
# Three ways to get the length of the Series: element count, shape tuple, and len()
load.size, load.shape, len(load)

(11, (11,), 11)

In [15]:
# Common aggregations on a Series, collected into a single tuple for display
(
    load.min(), load.max(), load.mean(),
    load.median(), load.std(), load.sum(),
    load.count(),  # number of non-null values
)

(1005, 1090, 1051.2727272727273, 1063.0, 27.46303300471712, 11564, 11)

In [16]:
load.isnull().head()    # element-wise check for missing values

0    False
1    False
2    False
3    False
4    False
Name: load, dtype: bool

In [17]:
# Missing values in the source data are represented as np.nan (NaN)
data

Unnamed: 0,date,temperature,load
0,12/08/19,25.0,1031
1,12/09/19,22.0,1081
2,12/10/19,27.0,1063
3,12/11/19,28.0,1090
4,12/12/19,21.0,1005
5,12/13/19,22.0,1066
6,12/14/19,13.0,1021
7,12/15/19,32.0,1021
8,12/16/19,19.0,1055
9,12/17/19,11.0,1066


In [18]:
# Replace missing values with 0; fillna returns a new DataFrame and leaves `data` unchanged
fill_data = data.fillna(0)
fill_data

Unnamed: 0,date,temperature,load
0,12/08/19,25.0,1031
1,12/09/19,22.0,1081
2,12/10/19,27.0,1063
3,12/11/19,28.0,1090
4,12/12/19,21.0,1005
5,12/13/19,22.0,1066
6,12/14/19,13.0,1021
7,12/15/19,32.0,1021
8,12/16/19,19.0,1055
9,12/17/19,11.0,1066


In [19]:
# Drop rows that contain a missing value; dropna returns a new DataFrame
drop_data = data.dropna()
drop_data

Unnamed: 0,date,temperature,load
0,12/08/19,25.0,1031
1,12/09/19,22.0,1081
2,12/10/19,27.0,1063
3,12/11/19,28.0,1090
4,12/12/19,21.0,1005
5,12/13/19,22.0,1066
6,12/14/19,13.0,1021
7,12/15/19,32.0,1021
8,12/16/19,19.0,1055
9,12/17/19,11.0,1066


In [20]:
data['temperature'].hasnans   # whether the Series contains any missing values

True

In [21]:
# Operations on a Series
tmp = data['load']
tmp.head()

0    1031
1    1081
2    1063
3    1090
4    1005
Name: load, dtype: int64

In [22]:
tmp + 1000    # element-wise arithmetic: + - * / //

0     2031
1     2081
2     2063
3     2090
4     2005
5     2066
6     2021
7     2021
8     2055
9     2066
10    2065
Name: load, dtype: int64

In [23]:
tmp > 3000    # element-wise comparison; yields a boolean Series

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
Name: load, dtype: bool

In [24]:
# Method chaining on a Series: each call returns a new Series that feeds the next call
(
    data['temperature']
    .fillna(0)
    .astype(float)
    .head()
)

0    25.0
1    22.0
2    27.0
3    28.0
4    21.0
Name: temperature, dtype: float64

In [25]:
# Use an existing column as the index
tmp = data.set_index('load')
tmp
# This is equivalent to passing index_col when reading the data:
# pd.read_csv('./test.csv', index_col='load')

Unnamed: 0_level_0,date,temperature
load,Unnamed: 1_level_1,Unnamed: 2_level_1
1031,12/08/19,25.0
1081,12/09/19,22.0
1063,12/10/19,27.0
1090,12/11/19,28.0
1005,12/12/19,21.0
1066,12/13/19,22.0
1021,12/14/19,13.0
1021,12/15/19,32.0
1055,12/16/19,19.0
1066,12/17/19,11.0


In [26]:
# Convert an Index (or a Series / the column labels) to a plain Python list
index = data.index
index.tolist()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [27]:
# Convert one column to a plain Python list (bracket access selects the column by name)
data['load'].tolist()

[1031, 1081, 1063, 1090, 1005, 1066, 1021, 1021, 1055, 1066, 1065]

In [28]:
# Dimensions of the DataFrame as (rows, columns)
data.shape

(11, 3)

In [29]:
# Create a new column; the scalar value is assigned to every row
data['123'] = 0
data.head()

Unnamed: 0,date,temperature,load,123
0,12/08/19,25.0,1031,0
1,12/09/19,22.0,1081,0
2,12/10/19,27.0,1063,0
3,12/11/19,28.0,1090,0
4,12/12/19,21.0,1005,0


In [30]:
data['123'] = 2   # assigning to an existing column overwrites its values
data.head()

Unnamed: 0,date,temperature,load,123
0,12/08/19,25.0,1031,2
1,12/09/19,22.0,1081,2
2,12/10/19,27.0,1063,2
3,12/11/19,28.0,1090,2
4,12/12/19,21.0,1005,2


In [31]:
# Drop a column by name; drop() returns a new DataFrame and leaves `data` unchanged.
# Equivalent to data.drop('123', axis=1) or data.drop('123', axis='columns').
data.drop(columns='123').head()

Unnamed: 0,date,temperature,load
0,12/08/19,25.0,1031
1,12/09/19,22.0,1081
2,12/10/19,27.0,1063
3,12/11/19,28.0,1090
4,12/12/19,21.0,1005


In [32]:
# Insert a new column at a specific position (immediately after 'load').
# DataFrame.insert mutates `data` in place and raises ValueError when the column
# already exists, so the call is guarded to keep this cell safe to re-run.
gab_index = data.columns.get_loc('load') + 1
if 'error' not in data.columns:
    data.insert(loc=gab_index, column='error', value=data.temperature*data.load)
data.head()

Unnamed: 0,date,temperature,load,error,123
0,12/08/19,25.0,1031,25775.0,2
1,12/09/19,22.0,1081,23782.0,2
2,12/10/19,27.0,1063,28701.0,2
3,12/11/19,28.0,1090,30520.0,2
4,12/12/19,21.0,1005,21105.0,2
