# pandas 详解代码

## 导入模块 

In [1]:
import pandas as pd
import numpy as np

## Series函数相关知识

In [2]:
#使用列表建立Series
pd.Series([1, 2, 3, 4],index=['a','b','c','d'])

a    1
b    2
c    3
d    4
dtype: int64

In [3]:
#使用标量建立Series
pd.Series(1,index=['a'])

a    1
dtype: int64

In [4]:
#自定义索引可以是字符串符号等形式
pd.Series(3,index=['a','↑','王'])

a    3
↑    3
王    3
dtype: int64

In [5]:
#使用字典建立Series
pd.Series({'a':1,'↑':2,'王':3})

a    1
↑    2
王    3
dtype: int64

In [6]:
#使用ndarray建立Series'
X=np.arange(6,9)
X1=pd.Series(X)
print(X1,type(X1),type(X))

0    6
1    7
2    8
dtype: int32 <class 'pandas.core.series.Series'> <class 'numpy.ndarray'>


## DateFrame函数练习

In [7]:
#ndarray创建DataFrame
pd.DataFrame(np.arange(12).reshape(3,4))

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [8]:
#一维ndarray对象字典创建
X={
    'one':np.arange(3),
    'two':np.arange(4,7)
    }
pd.DataFrame(X)

Unnamed: 0,one,two
0,0,4
1,1,5
2,2,6


In [9]:
# 一维Series对象字典创建'
X={
    'one':pd.Series([1,2,3],index=['a','b','c']),
    'two':pd.Series([4,5,6,7],index=['b','c','d','e'])
    }

pd.DataFrame(X)

Unnamed: 0,one,two
a,1.0,
b,2.0,4.0
c,3.0,5.0
d,,6.0
e,,7.0


In [10]:
# 列表类型的字典创建'
X={
    'one':[0,1,2,3],
    'two':[4,5,6,7]
    }
pd.DataFrame(X,index=['b','c','d','e'])

Unnamed: 0,one,two
b,0,4
c,1,5
d,2,6
e,3,7


## DataFrame查看功能

In [11]:
data={
    '城市':['北京','上海','广州','深圳'],
    '环比':[101.5,102.2,101.3,102.0],
    '同比':[120.7,127.3,119.4,140.9],
    '定基':[121.4,127.8,120.0,145.5]
}
df=pd.DataFrame(data)
df

Unnamed: 0,城市,环比,同比,定基
0,北京,101.5,120.7,121.4
1,上海,102.2,127.3,127.8
2,广州,101.3,119.4,120.0
3,深圳,102.0,140.9,145.5


In [12]:
# 返回ndarray类型的对象
df.values

array([['北京', 101.5, 120.7, 121.4],
       ['上海', 102.2, 127.3, 127.8],
       ['广州', 101.3, 119.4, 120.0],
       ['深圳', 102.0, 140.9, 145.5]], dtype=object)

In [13]:
# 获取行索引
df.index

RangeIndex(start=0, stop=4, step=1)

In [14]:
# 获取列索引
df.columns

Index(['城市', '环比', '同比', '定基'], dtype='object')

In [15]:
# 获取行及列索引
df.axes

[RangeIndex(start=0, stop=4, step=1),
 Index(['城市', '环比', '同比', '定基'], dtype='object')]

In [16]:
# 行与列对调
df.T

Unnamed: 0,0,1,2,3
城市,北京,上海,广州,深圳
环比,101.5,102.2,101.3,102.0
同比,120.7,127.3,119.4,140.9
定基,121.4,127.8,120.0,145.5


In [17]:
# 打印DataFrame对象的信息
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   城市      4 non-null      object 
 1   环比      4 non-null      float64
 2   同比      4 non-null      float64
 3   定基      4 non-null      float64
dtypes: float64(3), object(1)
memory usage: 256.0+ bytes


In [18]:
# 显示前 i 行数据
i=2
df.head(i)

Unnamed: 0,城市,环比,同比,定基
0,北京,101.5,120.7,121.4
1,上海,102.2,127.3,127.8


In [19]:
# 显示后 i 行数据
i=2
df.tail(i)

Unnamed: 0,城市,环比,同比,定基
2,广州,101.3,119.4,120.0
3,深圳,102.0,140.9,145.5


In [20]:
# 查看数据按列的统计信息
df.describe()

Unnamed: 0,环比,同比,定基
count,4.0,4.0,4.0
mean,101.75,127.075,128.675
std,0.420317,9.844245,11.719322
min,101.3,119.4,120.0
25%,101.45,120.375,121.05
50%,101.75,124.0,124.6
75%,102.05,130.7,132.225
max,102.2,140.9,145.5


## Series和DataFrame索引操作

In [21]:
data1={
    '城市':['北京','上海','广州','深圳'],
    '环比':[101.5,102.2,101.3,102.0],
    '同比':[120.7,127.3,119.4,140.9],
    '定基':[121.4,127.8,120.0,145.5]
}
df1=pd.DataFrame(data,index=list('abad'))

df2=pd.DataFrame(np.random.rand(4,4),
                   index=list('xyaw'),
                   columns=list('一二三四'))
df1

Unnamed: 0,城市,环比,同比,定基
a,北京,101.5,120.7,121.4
b,上海,102.2,127.3,127.8
a,广州,101.3,119.4,120.0
d,深圳,102.0,140.9,145.5


In [22]:
df2

Unnamed: 0,一,二,三,四
x,0.124587,0.819667,0.753738,0.958755
y,0.153189,0.4539,0.542732,0.325584
a,0.82069,0.821039,0.193243,0.546474
w,0.412408,0.868929,0.873998,0.315354


In [23]:
# 判断'a'索引是否存在与df1的索引中
'a' in df1.index

True

In [24]:
# 将df2的索引对象粘贴到df1的索引后，产生一个新索引
df1.index.append(df2.index)

Index(['a', 'b', 'a', 'd', 'x', 'y', 'a', 'w'], dtype='object')

In [25]:
# 计算两个索引的差集\并集\交集
print('差集',df1.index.difference(df2.index))
print('并集',df1.index.union(df2.index))
print('交集',df1.index.intersection(df2.index))

差集 Index(['b', 'd'], dtype='object')
并集 Index(['a', 'a', 'b', 'd', 'w', 'x', 'y'], dtype='object')
交集 Index(['a'], dtype='object')


In [26]:
# 计算表示每一个值是否在列表中的布尔数组
df1.index.isin(list('ab'))

array([ True,  True,  True, False])

In [27]:
# 将位置i的元素删除，并产生新的索引
df1.index.delete(0)

Index(['b', 'a', 'd'], dtype='object')

In [28]:
# 根据传入参数删除指定索引，并产生新的索引
df1.index.drop('a')

Index(['b', 'd'], dtype='object')

In [29]:
# 将位置i插入元素，并产生新的索引
df1.index.insert(0,'aa')

Index(['aa', 'a', 'b', 'a', 'd'], dtype='object')

In [30]:
# 计算索引的唯一值序列
df1.index.unique()

Index(['a', 'b', 'd'], dtype='object')

## Series和DataFrame基础操作（一）

In [31]:
data={
    '城市':['北京','上海','广州','深圳'],
    '环比':[101.5,102.2,101.3,102.0],
    '同比':[120.7,127.3,119.4,140.9],
    '定基':[121.4,127.8,120.0,145.5]
}
df=pd.DataFrame(data,index=list('abcd'))
df

Unnamed: 0,城市,环比,同比,定基
a,北京,101.5,120.7,121.4
b,上海,102.2,127.3,127.8
c,广州,101.3,119.4,120.0
d,深圳,102.0,140.9,145.5


In [32]:
#在位置（loc）为3的地方添加一列数据，名字叫（column）'排名',值（values）为1，2，3，4
loc=3
column='排名'
value=[1,2,3,4]
df.insert(loc, column, value, allow_duplicates=False)
df

Unnamed: 0,城市,环比,同比,排名,定基
a,北京,101.5,120.7,1,121.4
b,上海,102.2,127.3,2,127.8
c,广州,101.3,119.4,3,120.0
d,深圳,102.0,140.9,4,145.5


In [33]:
#增加一个索引构成新索引
newi=df.index.insert(3,'kkk')
print('新行索引',newi)
#增加一列索引
newc=df.columns
print('新列索引',newc)
#按照新的行列索引构成新的DataFrame，填充值为99
ndf=df.reindex(index=newi,columns=newc,fill_value=99)
print('添加一行后的数据')
ndf

新行索引 Index(['a', 'b', 'c', 'kkk', 'd'], dtype='object')
新列索引 Index(['城市', '环比', '同比', '排名', '定基'], dtype='object')
添加一行后的数据


Unnamed: 0,城市,环比,同比,排名,定基
a,北京,101.5,120.7,1,121.4
b,上海,102.2,127.3,2,127.8
c,广州,101.3,119.4,3,120.0
kkk,99,99.0,99.0,99,99.0
d,深圳,102.0,140.9,4,145.5


In [34]:
#按照新的行列索引构成新的DataFrame，填充方法为向前填充
ndf=df.reindex(index=newi,columns=newc,method='ffill')
ndf

Unnamed: 0,城市,环比,同比,排名,定基
a,北京,101.5,120.7,1,121.4
b,上海,102.2,127.3,2,127.8
c,广州,101.3,119.4,3,120.0
kkk,深圳,102.0,140.9,4,145.5
d,深圳,102.0,140.9,4,145.5


## Series和DataFrame基础操作（二）

In [35]:
X={
    'one':[0,0,2,3],
    'two':[0,0,6,6]
    }
X=pd.DataFrame(X,index=['b','c','c','e'])
X

Unnamed: 0,one,two
b,0,0
c,0,0
c,2,6
e,3,6


In [36]:
# 删除一行
X.drop('c',axis=0)

Unnamed: 0,one,two
b,0,0
e,3,6


In [37]:
# 删除一列
X.drop('one',axis=1)

Unnamed: 0,two
b,0
c,0
c,6
e,6


## Series和DataFrame基础操作（三）

In [38]:
df1=pd.DataFrame(np.random.random((2,3)),
                 index=list('ab'),
                 columns=list('abc'))
df1

Unnamed: 0,a,b,c
a,0.409016,0.284512,0.851492
b,0.900158,0.139782,0.266547


In [39]:
df2=pd.DataFrame(np.random.random((2,3)),
                 index=list('bc'),
                 columns=list('bcd'))
df2

Unnamed: 0,b,c,d
b,0.022743,0.05409,0.570343
c,0.281833,0.77405,0.462031


In [40]:
#索引相同的位置的数据相加
df1+df2

Unnamed: 0,a,b,c,d
a,,,,
b,,0.162524,0.320636,
c,,,,


In [41]:
#判断df1中大于0.5的数
df1>0.5

Unnamed: 0,a,b,c
a,False,False,True
b,True,False,False


In [42]:
#将df1中大于0.5的数重新赋值为6
df1[df1>0.5]=6
df1

Unnamed: 0,a,b,c
a,0.409016,0.284512,6.0
b,6.0,0.139782,0.266547


## Series和DataFrame基础操作（四）

In [43]:
Sr1=pd.DataFrame(np.random.random(5),
                 index=list('aghzb'))
Sr1

Unnamed: 0,0
a,0.020828
g,0.114392
h,0.718178
z,0.983494
b,0.984531


In [44]:
#对索引排序
Sr1.sort_index(axis=0, ascending=True,)

Unnamed: 0,0
a,0.020828
b,0.984531
g,0.114392
h,0.718178
z,0.983494


In [45]:
#对索引排序，并选择降序
Sr1.sort_index(axis=0, ascending=False)

Unnamed: 0,0
z,0.983494
h,0.718178
g,0.114392
b,0.984531
a,0.020828


In [46]:

df = pd.DataFrame({
    'b':[1,2,3,2],
    'a':[4,3,2,1],
    'c':[1,3,8,2]},index=[2,0,1,3])

# 按b列升序排序
df.sort_values(by='b')
#等同于df.sort_values(by='b',axis=0))

Unnamed: 0,b,a,c
2,1,4,1
0,2,3,3
3,2,1,2
1,3,2,8


In [47]:
# 先按b列降序，再按a列升序排序
df.sort_values(by=['b','a'],axis=0,ascending=[False,True])

Unnamed: 0,b,a,c
1,3,2,8
3,2,1,2
0,2,3,3
2,1,4,1


In [48]:
# 按行3升序排列
#必须指定axis=1
df.sort_values(by=3,axis=1)

Unnamed: 0,a,b,c
2,4,1,1
0,3,2,3
1,2,3,8
3,1,2,2


In [49]:
# 按行3升序，行0降排列
df.sort_values(by=[3,0],axis=1,ascending=[True,False])

Unnamed: 0,a,c,b
2,4,1,1
0,3,3,2
1,2,8,3
3,1,2,2


In [50]:
# 根据数据的大小返回排名
Sr1.rank(axis=0,method='average',na_option='keep',ascending=True)

Unnamed: 0,0
a,1.0
g,2.0
h,3.0
z,4.0
b,5.0


## Series和DataFrame切片

In [51]:
s = pd.Series(list("acbdfe"), index=[49, 48, 47, 0, 1, 2])
print('构建的Series:\n',s)
print('索引标签0处的值：',s.loc[0])
print('索引位置为0处的值：',s.iloc[0])
print('索引标签在0到1(包括1)之间的行：\n',s.loc[0:1])
print('索引位置在0到1之间的行(不包含1)：\n',s.iloc[0:1])

构建的Series:
 49    a
48    c
47    b
0     d
1     f
2     e
dtype: object
索引标签0处的值： d
索引位置为0处的值： a
索引标签在0到1(包括1)之间的行：
 0    d
1    f
dtype: object
索引位置在0到1之间的行(不包含1)：
 49    a
dtype: object


In [52]:
s2 = pd.Series(s.index, index=s.values)
print('以英文标签为索引\n',s2)
print('所有在“c”和“e”之间的行(包括e):\n',s2.loc['c':'e'] )

以英文标签为索引
 a    49
c    48
b    47
d     0
f     1
e     2
dtype: int64
所有在“c”和“e”之间的行(包括e):
 c    48
b    47
d     0
f     1
e     2
dtype: int64


In [53]:
df = pd.DataFrame(np.arange(25).reshape(5, 5),
                      index=list('abcde'),
                      columns=['x','y','z', 8, 9])
print(df)
print('行c及以上列z以下\n',df.loc['c': , :'z'])
print('所有行，但只有索引位置3的列\n',df.iloc[:, 3])

    x   y   z   8   9
a   0   1   2   3   4
b   5   6   7   8   9
c  10  11  12  13  14
d  15  16  17  18  19
e  20  21  22  23  24
行c及以上列z以下
     x   y   z
c  10  11  12
d  15  16  17
e  20  21  22
所有行，但只有索引位置3的列
 a     3
b     8
c    13
d    18
e    23
Name: 8, dtype: int32


In [54]:
df = pd.DataFrame(np.arange(25).reshape(5, 5),
                      index=list('abcde'),
                      columns=['x','y','z', 8, 9])
print('原始数据\n',df)
print('loc和ilic混用\n',df.iloc[:df.index.get_loc('c') + 1, :4])

原始数据
     x   y   z   8   9
a   0   1   2   3   4
b   5   6   7   8   9
c  10  11  12  13  14
d  15  16  17  18  19
e  20  21  22  23  24
loc和ilic混用
     x   y   z   8
a   0   1   2   3
b   5   6   7   8
c  10  11  12  13


## 数据读取

In [55]:
df_csv=pd.read_csv('CSV数据.csv')
df_csv.head()
#读cscv

Unnamed: 0.1,Unnamed: 0,a,b,c
0,0,1,1,1
1,1,1,1,2
2,2,1,2,1
3,3,2,2,2
4,4,2,3,1


In [56]:
df_txt=pd.read_table('txt数据.txt',sep=',')
df_txt.head()
#读txt

Unnamed: 0.1,Unnamed: 0,a,b,c
0,0,1,1,1
1,1,1,1,2
2,2,1,2,1
3,3,2,2,2
4,4,2,3,1


In [57]:
df_excel=pd.read_excel('Excel数据.xlsx',sheet_name='test')
df_excel.head()
#读excel

Unnamed: 0.1,Unnamed: 0,a,b,c,Unnamed: 4
0,0,0,1,1,1
1,1,1,1,1,2
2,2,2,1,2,1
3,3,3,2,2,2
4,4,4,2,3,1


In [58]:
data={
    '城市':['北京','上海','广州','深圳'],
    '环比':[101.5,102.2,101.3,102.0],
    '同比':[120.7,127.3,119.4,140.9],
    '定基':[121.4,127.8,120.0,145.5]
}
df=pd.DataFrame(data,index=list('abcd'))
df

Unnamed: 0,城市,环比,同比,定基
a,北京,101.5,120.7,121.4
b,上海,102.2,127.3,127.8
c,广州,101.3,119.4,120.0
d,深圳,102.0,140.9,145.5


In [59]:
df.to_csv('test.csv')
#写csv

In [60]:
df.to_csv('test.txt')
#写txt

In [61]:
df.to_excel('test1.xlsx')
#写excel文件

## pandas统计功能

In [62]:
df = pd.DataFrame({
    'b':[1,2,2,3],
    'a':[4,3,2,1],
    'c':[1,3,8,2]},index=[2,0,1,3])
np.max(df['a'])

4

In [63]:
df.min()

b    1
a    1
c    1
dtype: int64

In [64]:
df['a'].max()

4

## 处理缺失值

In [65]:
df = pd.DataFrame({
    'a':[1,np.nan,2,3],
    'b':[4,np.nan,2,1],
    'c':[np.nan,np.nan,np.nan,np.nan]},
    index=[2,0,1,3])
df

Unnamed: 0,a,b,c
2,1.0,4.0,
0,,,
1,2.0,2.0,
3,3.0,1.0,


In [66]:
# 删除空数据,选择删除都是空的数据
df.dropna(how='all')

Unnamed: 0,a,b,c
2,1.0,4.0,
1,2.0,2.0,
3,3.0,1.0,


In [67]:
# 按照列删除空数据,选择删除都是空的数据
df.dropna(how='all',axis=1)

Unnamed: 0,a,b
2,1.0,4.0
0,,
1,2.0,2.0
3,3.0,1.0


In [68]:
# 填充数据,补充成某一个值
df.fillna(value=3.0)

Unnamed: 0,a,b,c
2,1.0,4.0,3.0
0,3.0,3.0,3.0
1,2.0,2.0,3.0
3,3.0,1.0,3.0


In [69]:
# 填充数据,按照方法填充
df.fillna(method='ffill')

Unnamed: 0,a,b,c
2,1.0,4.0,
0,1.0,4.0,
1,2.0,2.0,
3,3.0,1.0,


## 删除重复数据

In [70]:
df=pd.DataFrame({'one':[0,0,2,3],
    'two':[0,0,6,6],
    'three':[0,0,6,6],
    'four':[0,1,2,3],},
    index=['b','c','c','e'])
print(df)

   one  two  three  four
b    0    0      0     0
c    0    0      0     1
c    2    6      6     2
e    3    6      6     3


In [71]:
# 删除行重复数据,保留重复数据的第一行，数据不在原始数据上改动
df1=df.drop_duplicates(subset=['one','two'],keep='first',inplace=False)
df1

Unnamed: 0,one,two,three,four
b,0,0,0,0
c,2,6,6,2
e,3,6,6,3


## 连续数据分箱处理

In [72]:
data=[80, 55, 78, 99, 60, 55, 82, 57,56,89,55,90]
bins=[50,60,70,80,90,100]

In [73]:
#自动分箱
pd.cut(data, 3)

[(69.667, 84.333], (54.956, 69.667], (69.667, 84.333], (84.333, 99.0], (54.956, 69.667], ..., (54.956, 69.667], (54.956, 69.667], (84.333, 99.0], (54.956, 69.667], (84.333, 99.0]]
Length: 12
Categories (3, interval[float64]): [(54.956, 69.667] < (69.667, 84.333] < (84.333, 99.0]]

In [74]:
#根据规定的区间分箱
pd.cut(data, bins)

[(70, 80], (50, 60], (70, 80], (90, 100], (50, 60], ..., (50, 60], (50, 60], (80, 90], (50, 60], (80, 90]]
Length: 12
Categories (5, interval[int64]): [(50, 60] < (60, 70] < (70, 80] < (80, 90] < (90, 100]]

In [75]:
#给分好的数据进行标记
ans=pd.cut(data, bins,labels=['a','b','c','d','e'])
ans

['c', 'a', 'c', 'e', 'a', ..., 'a', 'a', 'd', 'a', 'd']
Length: 12
Categories (5, object): ['a' < 'b' < 'c' < 'd' < 'e']

In [76]:
#查看分类结果
ans.codes

array([2, 0, 2, 4, 0, 0, 3, 0, 0, 3, 0, 3], dtype=int8)

In [77]:
#查看分类区间
ans.categories

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

## 数据哑变量处理

In [78]:
df1=pd.DataFrame({'职业':['工人','学生','司机','教师','导游']})
df1

Unnamed: 0,职业
0,工人
1,学生
2,司机
3,教师
4,导游


In [79]:
#使用 get_dummies()函数来进行哑变量转换
pd.get_dummies(df1)

Unnamed: 0,职业_司机,职业_学生,职业_导游,职业_工人,职业_教师
0,0,0,0,1,0
1,0,1,0,0,0
2,1,0,0,0,0
3,0,0,0,0,1
4,0,0,1,0,0


In [80]:
#设置前后缀
pd.get_dummies(df1,prefix='o(*￣▽￣*)ブ')

Unnamed: 0,o(*￣▽￣*)ブ_司机,o(*￣▽￣*)ブ_学生,o(*￣▽￣*)ブ_导游,o(*￣▽￣*)ブ_工人,o(*￣▽￣*)ブ_教师
0,0,0,0,1,0
1,0,1,0,0,0
2,1,0,0,0,0
3,0,0,0,0,1
4,0,0,1,0,0


In [81]:
#增加一列空值
pd.get_dummies(df1,prefix='o(*￣▽￣*)ブ',dummy_na=True)

Unnamed: 0,o(*￣▽￣*)ブ_司机,o(*￣▽￣*)ブ_学生,o(*￣▽￣*)ブ_导游,o(*￣▽￣*)ブ_工人,o(*￣▽￣*)ブ_教师,o(*￣▽￣*)ブ_nan
0,0,0,0,1,0,0
1,0,1,0,0,0,0
2,1,0,0,0,0,0
3,0,0,0,0,1,0
4,0,0,1,0,0,0


## Pandas按键合并数据

In [82]:
df1=pd.DataFrame({'first':[2,3,8],
                'second':[4,5,6]},
               index=['aa','b','c'])

df2=pd.DataFrame({'second':[6,5,8],
                'third':[17,18,19]},
               index=['b','c','aa'])

In [83]:
df1

Unnamed: 0,first,second
aa,2,4
b,3,5
c,8,6


In [84]:
df2

Unnamed: 0,second,third
b,6,17
c,5,18
aa,8,19


In [85]:
#默认参数下的聚合效果
pd.concat([df1,df2])

Unnamed: 0,first,second,third
aa,2.0,4,
b,3.0,5,
c,8.0,6,
b,,6,17.0
c,,5,18.0
aa,,8,19.0


In [86]:
#inner参数下的聚合效果
pd.concat([df1,df2],join='inner')

Unnamed: 0,second
aa,4
b,5
c,6
b,6
c,5
aa,8


In [87]:
# 横向聚合效果
pd.concat([df1,df2],axis=1)

Unnamed: 0,first,second,second.1,third
aa,2,4,8,19
b,3,5,6,17
c,8,6,5,18


In [88]:
df1.append(df2)

Unnamed: 0,first,second,third
aa,2.0,4,
b,3.0,5,
c,8.0,6,
b,,6,17.0
c,,5,18.0
aa,,8,19.0


In [89]:
df1.append(df2,ignore_index=True)

Unnamed: 0,first,second,third
0,2.0,4,
1,3.0,5,
2,8.0,6,
3,,6,17.0
4,,5,18.0
5,,8,19.0


## Pandas值合并函数：merge

In [90]:
df1=pd.DataFrame({'first':[2,3,8],
                'second':[4,5,6]},
               index=['aa','aa','c'])

df2=pd.DataFrame({'second':[6,5,8],
                'third':[17,8,19]},
               index=['b','c','aa'])

In [91]:
df1

Unnamed: 0,first,second
aa,2,4
aa,3,5
c,8,6


In [92]:
df2

Unnamed: 0,second,third
b,6,17
c,5,8
aa,8,19


In [93]:
# 默认参数下的merge聚合效果
pd.merge(df1,df2)

Unnamed: 0,first,second,third
0,3,5,8
1,8,6,17


In [94]:
# 修改how参数示例
pd.merge(df1,df2,how='outer')

Unnamed: 0,first,second,third
0,2.0,4,
1,3.0,5,8.0
2,8.0,6,17.0
3,,8,19.0


In [95]:
# 修改left_on和right_on参数示例
pd.merge(df1,df2,left_on='first',right_on='second')

Unnamed: 0,first,second_x,second_y,third
0,8,6,8,19


In [96]:
# 上述结果下修改suffix示例
pd.merge(df1,df2, left_on='first', right_on='second',suffixes=('(*￣▽￣*)','(●`◡`●)'))

Unnamed: 0,first,second(*￣▽￣*),second(●`◡`●),third
0,8,6,8,19


In [97]:
# 默认参数下的join聚合效果
df1.join(df2,lsuffix='_left', rsuffix='_right')

Unnamed: 0,first,second_left,second_right,third
aa,2,4,8,19
aa,3,5,8,19
c,8,6,5,8


In [98]:
#join函数的VLOOKUP功能
# a.join(b.set_index(‘key_b’),on=‘key_a’)
df1.join(df2.set_index('second'),on='first')

Unnamed: 0,first,second,third
aa,2,4,
aa,3,5,
c,8,6,19.0


In [99]:
# Pandas合并重叠数据
df1 = pd.DataFrame({'a': [np.nan,2., 4., np.nan],
                    'b': [1.,np.nan, 3., np.nan],
                    'c': range(2, 18, 4)})
df2 = pd.DataFrame({'a': [3., np.nan,5., 7., 9.],
                    'b': [np.nan, 2., 4., 6., 8.]})

In [100]:
df1

Unnamed: 0,a,b,c
0,,1.0,2
1,2.0,,6
2,4.0,3.0,10
3,,,14


In [101]:
df2

Unnamed: 0,a,b
0,3.0,
1,,2.0
2,5.0,4.0
3,7.0,6.0
4,9.0,8.0


In [102]:
df1.combine_first(df2)

Unnamed: 0,a,b,c
0,3.0,1.0,2.0
1,2.0,2.0,6.0
2,4.0,3.0,10.0
3,7.0,6.0,14.0
4,9.0,8.0,


## groupby技术

In [103]:
df1=pd.DataFrame({'sex':list('FFMFMMF'),
                  'smoker':list('YNYYNYY'),
                  'age':[21,30,17,37,40,18,26],
                  'weight':[120,100,132,140,94,89,123]})
grouped=df1.groupby(['sex','smoker'])

## groupby常见分组示例

In [104]:
# 单键分组
df1.groupby('sex').mean()

Unnamed: 0_level_0,age,weight
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
F,28.5,120.75
M,25.0,105.0


In [105]:
# 多键分组
df1.groupby(['sex','smoker']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,age,weight
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
F,N,30.0,100.0
F,Y,28.0,127.666667
M,N,40.0,94.0
M,Y,17.5,110.5


In [106]:
# 分组后对指定列的值进行计算
df1.groupby(['sex','smoker'])['age'].mean()

sex  smoker
F    N         30.0
     Y         28.0
M    N         40.0
     Y         17.5
Name: age, dtype: float64

## groupby逐列多函数应用

In [107]:
df=pd.DataFrame({'sex':list('FFMFMMF'),
                  'smoker':list('YNYYNYY'),
                  'age':[21,30,17,37,40,18,26],
                  'weight':[120,100,132,140,94,89,123]})

In [108]:
# 数据分组
grouped=df.groupby('sex')['weight']

In [109]:
# 对分组后数据应用多个函数
grouped.agg(['mean','sum','count'])

Unnamed: 0_level_0,mean,sum,count
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
F,120.75,483,4
M,105.0,315,3


In [110]:
def func(arr):
    return np.mean(arr)/sum(arr)

In [111]:
# 函数可以是自定义函数
grouped.agg(['mean','sum',func])

Unnamed: 0_level_0,mean,sum,func
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
F,120.75,483,0.25
M,105.0,315,0.333333


In [112]:
# 分组后数据应用多个函数
df.groupby('smoker')['weight','age'].agg({'weight':['mean','sum'],'age':['count','mean']})

  df.groupby('smoker')['weight','age'].agg({'weight':['mean','sum'],'age':['count','mean']})


Unnamed: 0_level_0,weight,weight,age,age
Unnamed: 0_level_1,mean,sum,count,mean
smoker,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
N,97.0,194,2,35.0
Y,120.8,604,5,23.8


## apply函数

In [3]:
import pandas as pd
df=pd.DataFrame({'sex':list('FFMFMMF'),
                  'smoker':list('YNYYNYY'),
                  'age':[21,30,17,37,40,18,26],
                  'weight':[120,100,132,140,94,89,123]})
df

Unnamed: 0,sex,smoker,age,weight
0,F,Y,21,120
1,F,N,30,100
2,M,Y,17,132
3,F,Y,37,140
4,M,N,40,94
5,M,Y,18,89
6,F,Y,26,123


In [4]:
df['公斤']=df['weight'].apply(lambda x : x/2)
df

Unnamed: 0,sex,smoker,age,weight,公斤
0,F,Y,21,120,60.0
1,F,N,30,100,50.0
2,M,Y,17,132,66.0
3,F,Y,37,140,70.0
4,M,N,40,94,47.0
5,M,Y,18,89,44.5
6,F,Y,26,123,61.5


In [7]:
# apply函数对整个分组下的整体数据应用，可以返回多个值
def func(x,N,asc):
    return x.sort_values('公斤',ascending=asc)[:N]
df.groupby("sex").apply(func,N=2,asc=False)
# df.groupby("sex").agg(func,N=2,asc=False) #agg不可以，会报错

Unnamed: 0_level_0,Unnamed: 1_level_0,sex,smoker,age,weight,公斤
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
F,3,F,Y,37,140,70.0
F,6,F,Y,26,123,61.5
M,2,M,Y,17,132,66.0
M,4,M,N,40,94,47.0


In [14]:
# apply函数一次只能应用一个方法
df.groupby("sex").apply(sum)
# df.groupby("sex").apply(sum,np.mean)#多个函数不可以同时使用，会报错

Unnamed: 0_level_0,sex,smoker,age,weight,公斤
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
F,FFFF,YNYY,114,483,241.5
M,MMM,YNY,75,315,157.5


In [20]:
# apply函数可以选择方向
def func2(arr):
    if arr[0]=='F':
        a=1
    else:
        a=0
    if arr[1]=='Y':
        b=1
    else:
        b=0
    return a+b+sum(arr[2:4])

#横向结算结果
df.apply(func2,axis=1)

0    143
1    131
2    150
3    179
4    134
5    108
6    151
dtype: int64

## 数据透视表

In [115]:
df=pd.read_excel('Excel数据.xlsx',sheet_name=0)
df.head()

Unnamed: 0,订单号,销售日期,销售人员,地区,城市,家电品牌,单价,数量（台）,销售额
0,10240,2009-01-02,张三,华北,北京,奥克斯,1200,4,4800
1,10241,2009-01-03,李四,华北,北京,格力,1300,5,6500
2,10242,2009-01-13,钱五,华北,北京,美的,1250,6,7500
3,10243,2009-01-14,赵六,华北,北京,春兰,1500,3,4500
4,10244,2009-01-25,刘琦,华北,石家庄,海尔,1500,5,7500


In [116]:
# 透视表举例一：
pd.pivot_table(df,values='销售额',index='地区',columns='家电品牌',aggfunc='sum')

家电品牌,奥克斯,志高,春兰,松下,格力,海尔,美的
地区,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
东北,19200.0,17600.0,11100.0,13500.0,13500.0,9000.0,16205.0
华东,10000.0,2000.0,15300.0,29400.0,34800.0,4500.0,17550.0
华中,12000.0,,,23100.0,7200.0,,10800.0
华北,14400.0,,4500.0,21000.0,17300.0,19500.0,28500.0
华南,9600.0,3000.0,6000.0,31800.0,27600.0,,48200.0
西北,6000.0,8000.0,13500.0,12900.0,30000.0,25100.0,22800.0
西南,41207.0,22600.0,9600.0,18600.0,22007.0,,10800.0


In [117]:
# 透视表举例二
pd.pivot_table(df,values='销售额',index='地区',columns='家电品牌',aggfunc=['sum','count'])

Unnamed: 0_level_0,sum,sum,sum,sum,sum,sum,sum,count,count,count,count,count,count,count
家电品牌,奥克斯,志高,春兰,松下,格力,海尔,美的,奥克斯,志高,春兰,松下,格力,海尔,美的
地区,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
东北,19200.0,17600.0,11100.0,13500.0,13500.0,9000.0,16205.0,3.0,2.0,2.0,1.0,2.0,1.0,3.0
华东,10000.0,2000.0,15300.0,29400.0,34800.0,4500.0,17550.0,2.0,1.0,3.0,3.0,3.0,1.0,3.0
华中,12000.0,,,23100.0,7200.0,,10800.0,2.0,,,1.0,2.0,,2.0
华北,14400.0,,4500.0,21000.0,17300.0,19500.0,28500.0,2.0,,1.0,1.0,2.0,2.0,4.0
华南,9600.0,3000.0,6000.0,31800.0,27600.0,,48200.0,2.0,1.0,1.0,3.0,5.0,,6.0
西北,6000.0,8000.0,13500.0,12900.0,30000.0,25100.0,22800.0,1.0,2.0,1.0,3.0,4.0,3.0,2.0
西南,41207.0,22600.0,9600.0,18600.0,22007.0,,10800.0,5.0,2.0,2.0,2.0,4.0,,1.0


## 交叉表

In [118]:
df = pd.DataFrame({'A': [1, 0, 1, 1, 0],
                   'B': [1, 1, 1, 0, 0],
                   'C': [1, 1, np.nan, 1, 1]})
#对A和B进行交叉
pd.crosstab(df['A'], df['B'])

B,0,1
A,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,1
1,1,2


In [119]:
#可以对交叉后的结果进行标准化
pd.crosstab(df['A'], df['B'], normalize=True)

B,0,1
A,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.2,0.2
1,0.2,0.4


In [120]:
#对每列进行标准化：
pd.crosstab(df['A'], df['B'], normalize='columns')

B,0,1
A,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.5,0.333333
1,0.5,0.666667


In [121]:
#指定列做为值，并将这些值按一定算法进行聚合：
pd.crosstab(df['A'], df['B'], values=df['C'], aggfunc=np.sum)

B,0,1
A,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1.0,1.0
1,1.0,1.0


In [122]:
#在最右边增加一个汇总列：
pd.crosstab(df['A'], df['B'],
            values=df['C'],
            aggfunc=np.sum,
            normalize=True,
            margins=True)

B,0,1,All
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.25,0.25,0.5
1,0.25,0.25,0.5
All,0.5,0.5,1.0


## 时间戳

In [123]:
# 时间戳函数用法一
pd.Timestamp(5555,unit='s')

Timestamp('1970-01-01 01:32:35')

In [124]:
# 时间戳函数用法二
pd.Timestamp(5555,unit='D')

Timestamp('1985-03-18 00:00:00')

In [125]:
# 整列数据转换时间
data=['2017/8/1','2018/8/1','2018/8/3','2018/8/4/','2018/8/7']
data

['2017/8/1', '2018/8/1', '2018/8/3', '2018/8/4/', '2018/8/7']

In [126]:
timedata=pd.to_datetime(data)
timedata

DatetimeIndex(['2017-08-01', '2018-08-01', '2018-08-03', '2018-08-04',
               '2018-08-07'],
              dtype='datetime64[ns]', freq=None)

In [127]:
timedata=pd.to_datetime(data)
timedata

DatetimeIndex(['2017-08-01', '2018-08-01', '2018-08-03', '2018-08-04',
               '2018-08-07'],
              dtype='datetime64[ns]', freq=None)

## 时间序列

In [128]:
# 生成一个时间序列，从12：00开始，到23：59结束，时间间隔30分钟
pd.date_range("12:00", "23:59", freq="30min")

DatetimeIndex(['2021-08-27 12:00:00', '2021-08-27 12:30:00',
               '2021-08-27 13:00:00', '2021-08-27 13:30:00',
               '2021-08-27 14:00:00', '2021-08-27 14:30:00',
               '2021-08-27 15:00:00', '2021-08-27 15:30:00',
               '2021-08-27 16:00:00', '2021-08-27 16:30:00',
               '2021-08-27 17:00:00', '2021-08-27 17:30:00',
               '2021-08-27 18:00:00', '2021-08-27 18:30:00',
               '2021-08-27 19:00:00', '2021-08-27 19:30:00',
               '2021-08-27 20:00:00', '2021-08-27 20:30:00',
               '2021-08-27 21:00:00', '2021-08-27 21:30:00',
               '2021-08-27 22:00:00', '2021-08-27 22:30:00',
               '2021-08-27 23:00:00', '2021-08-27 23:30:00'],
              dtype='datetime64[ns]', freq='30T')

In [129]:
# 生成一个时间序列，从2021-06-27开始，到2021-09-27结束，时间间隔3天
pd.date_range("2021-06-27", "2021-09-27",freq="3D")

DatetimeIndex(['2021-06-27', '2021-06-30', '2021-07-03', '2021-07-06',
               '2021-07-09', '2021-07-12', '2021-07-15', '2021-07-18',
               '2021-07-21', '2021-07-24', '2021-07-27', '2021-07-30',
               '2021-08-02', '2021-08-05', '2021-08-08', '2021-08-11',
               '2021-08-14', '2021-08-17', '2021-08-20', '2021-08-23',
               '2021-08-26', '2021-08-29', '2021-09-01', '2021-09-04',
               '2021-09-07', '2021-09-10', '2021-09-13', '2021-09-16',
               '2021-09-19', '2021-09-22', '2021-09-25'],
              dtype='datetime64[ns]', freq='3D')

## 时间索引

In [130]:
str_index=['2017/8/1','2018/8/1','2018/8/3','2018/8/4/','2018/8/7']
df1=pd.DataFrame([1,2,3,4,5],index=str_index)
#生成一个同样索引为字符串的Series
df1

Unnamed: 0,0
2017/8/1,1
2018/8/1,2
2018/8/3,3
2018/8/4/,4
2018/8/7,5


In [131]:
time_index=pd.DatetimeIndex(['2017/8/1','2018/8/1','2018/8/3','2018/8/4/','2018/8/7'])
df2=pd.DataFrame([1,2,3,4,5],index=time_index)
#生成同样内容的Series索引格式是DatatimeIndex
df2

Unnamed: 0,0
2017-08-01,1
2018-08-01,2
2018-08-03,3
2018-08-04,4
2018-08-07,5


In [132]:
#对两个数据同时使用下列索引进行取值
df1['2017-08-2':'2018-08-5']
#非时间格式无法匹配字符串，无法根据时间区间取值

Unnamed: 0,0
2017/8/1,1


In [133]:
df2['2017-08-2':'2018-08-5']
#时间格式可以取到值

Unnamed: 0,0
2018-08-01,2
2018-08-03,3
2018-08-04,4


## 时间索引操作

In [134]:
time_index=pd.DatetimeIndex(['2017/8/20','2017/8/3','2018/7/1','2018/7/3','2018/8/4/','2018/8/7'])
df=pd.DataFrame([1,2,3,4,5,6],index=time_index)
df

Unnamed: 0,0
2017-08-20,1
2017-08-03,2
2018-07-01,3
2018-07-03,4
2018-08-04,5
2018-08-07,6


In [135]:
# 取指定年、月、日的数据
print(df[df.index.year==2018])
print('*'*30)
print(df[df.index.month==7])
print('*'*30)
print(df[df.index.day==3])

            0
2018-07-01  3
2018-07-03  4
2018-08-04  5
2018-08-07  6
******************************
            0
2018-07-01  3
2018-07-03  4
******************************
            0
2017-08-03  2
2018-07-03  4


In [136]:
# 取指定时间段的数据
df[(df.index.day < 5) & (df.index.day >=1)]

Unnamed: 0,0
2017-08-03,2
2018-07-01,3
2018-07-03,4
2018-08-04,5


In [137]:
#时间抽样
df=pd.read_excel('Excel数据.xlsx',sheet_name=0)
df.index=df['销售日期']

In [138]:
#降低采集频率为每月一次
df.resample('M',).count()

Unnamed: 0_level_0,订单号,销售日期,销售人员,地区,城市,家电品牌,单价,数量（台）,销售额
销售日期,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2009-01-31,10,10,10,10,10,10,10,10,10
2009-02-28,6,6,6,6,6,6,6,6,6
2009-03-31,8,8,8,8,8,8,8,8,8
2009-04-30,8,8,8,8,8,8,8,8,8
2009-05-31,11,11,11,11,11,11,11,11,11
2009-06-30,11,11,11,11,11,11,11,11,11
2009-07-31,9,9,9,9,9,9,9,9,9
2009-08-31,9,9,9,9,9,9,9,9,9
2009-09-30,10,10,10,10,10,10,10,10,10
2009-10-31,9,9,9,9,9,9,9,9,9
