In [1]:
# -*- encoding: utf-8 -*-
import numpy as np
import pandas as pd

# Row labels (one per person) that become the DataFrame index.
index = pd.Index(["zhangsan", "lishi", "wangwu", "zhaoliu", "wanger"])

# Column-oriented source data; np.nan / None deliberately mark missing entries.
data = {
    "age": [22, 17, np.nan, 16, 25],
    "address": ["nj", None, "nj", "sh", "bj"],
    "salary": [2200, 1700, 1600, np.nan, 2500],
}

# Build the base frame from the dict with the explicit index, then append
# a gender column as the last column.
base_info = pd.DataFrame(data, index=index).assign(
    gender=["f", "m", "f", "m", "f"]
)

# Show the frame followed by a 60-character separator line.
print(base_info, "*" * 60, sep="\n")


           age address  salary gender
zhangsan  22.0      nj  2200.0      f
lishi     17.0    None  1700.0      m
wangwu     NaN      nj  1600.0      f
zhaoliu   16.0      sh     NaN      m
wanger    25.0      bj  2500.0      f
************************************************************


In [2]:
# Statistical functions (see a probability-and-statistics text for the formulas).
# Covariance between age and salary; pandas excludes missing values when
# computing it.
age_salary_cov = base_info["age"].cov(base_info["salary"])
print(age_salary_cov)


1633.3333333333333


In [3]:
# Ranking of salaries, printed twice: once with the default tie handling
# ("average") and once with ties broken by order of appearance ("first").
# A missing salary keeps a NaN rank.
for tie_rule in ("average", "first"):
    print(base_info["salary"].rank(method=tie_rule))


zhangsan    3.0
lishi       2.0
wangwu      1.0
zhaoliu     NaN
wanger      4.0
Name: salary, dtype: float64
zhangsan    3.0
lishi       2.0
wangwu      1.0
zhaoliu     NaN
wanger      4.0
Name: salary, dtype: float64


In [4]:
# Seven consecutive daily turnover figures starting 2018-07-01;
# the third entry is intentionally missing (np.nan).
data = {
    "turnover": [12000, 18000, np.nan, 12000, 9000, 16000, 18000],
    "date": pd.date_range(start="2018-07-01", periods=7),
}
df2 = pd.DataFrame(data)

# Show the frame followed by a 60-character separator line.
print(df2, "*" * 60, sep="\n")


   turnover       date
0   12000.0 2018-07-01
1   18000.0 2018-07-02
2       NaN 2018-07-03
3   12000.0 2018-07-04
4    9000.0 2018-07-05
5   16000.0 2018-07-06
6   18000.0 2018-07-07
************************************************************


In [5]:
# Handling missing values (drop or fill): min_periods sets the minimum number
# of non-missing observations a window needs before it produces a result.
# Here each window spans two rows, and one valid value is enough.
pairwise_window = df2.rolling(2, min_periods=1, on="date")
print(pairwise_window.sum())


   turnover       date
0   12000.0 2018-07-01
1   30000.0 2018-07-02
2   18000.0 2018-07-03
3   12000.0 2018-07-04
4   21000.0 2018-07-05
5   25000.0 2018-07-06
6   34000.0 2018-07-07


In [6]:
# Cumulative statistics over the whole period: a rolling window as long as
# the frame itself accumulates every row seen so far.
line = len(df2)
print(line)
full_span_window = df2.rolling(line, min_periods=1, on="date")
print(full_span_window.sum())

# The same running total expressed with expanding(), on the turnover column.
print(df2["turnover"].expanding(min_periods=1).sum())

# Aggregations supported by both rolling and expanding include:
# count, sum, mean, min, max, std, var, skew, median, ...
print("*" * 60)


7
   turnover       date
0   12000.0 2018-07-01
1   30000.0 2018-07-02
2   30000.0 2018-07-03
3   42000.0 2018-07-04
4   51000.0 2018-07-05
5   67000.0 2018-07-06
6   85000.0 2018-07-07
0    12000.0
1    30000.0
2    30000.0
3    42000.0
4    51000.0
5    67000.0
6    85000.0
Name: turnover, dtype: float64
************************************************************
