In [1]:
import numpy as np
import pandas as pd

# 排序

## sort_index 对行或列索引进行排序

In [3]:
# Series with values 0..3 whose index labels are deliberately not in alphabetical order
ser1 = pd.Series(np.arange(0,4),index = ['b','c','d','a'])
ser1

b    0
c    1
d    2
a    3
dtype: int32

In [4]:
ser1.sort_index()  # sort by index label (ascending by default); returns a new Series

a    3
b    0
c    1
d    2
dtype: int32

In [6]:
ser1.sort_index(ascending=False) # descending order of index label (largest label first)

d    2
c    1
b    0
a    3
dtype: int32

In [7]:
# 4x4 frame of random integers in [1, 9) with unsorted row and column labels
df1 = pd.DataFrame(np.random.randint(1,9,(4,4)),
                   columns= ['b','c','a','d'],
                   index = ['B','D','A','C'])
df1

Unnamed: 0,b,c,a,d
B,8,8,8,8
D,7,6,6,3
A,7,1,1,8
C,1,7,6,4


In [8]:
df1.sort_index()  # sort the rows by row-index label

Unnamed: 0,b,c,a,d
A,7,1,1,8
B,8,8,8,8
C,1,7,6,4
D,7,6,6,3


In [9]:
df1.sort_index(axis = 1)  # sort the columns by column label

Unnamed: 0,a,b,c,d
B,8,8,8,8
D,6,7,6,3
A,1,7,1,8
C,6,1,7,4


In [10]:
# Sort the columns by column label, in descending order
df1.sort_index(axis = 1,ascending=False)  

Unnamed: 0,d,c,b,a
B,8,8,8,8
D,3,6,7,6
A,8,1,7,1
C,4,7,1,6


## sort_values 对Series按值进行排序, 排序时，任何缺失值默认都会被放到Series的末尾。

In [11]:
ser1.sort_values()  # sort by value (ascending by default)

b    0
c    1
d    2
a    3
dtype: int32

In [12]:
df1.sort_values(by = ['d','b'])  # sort rows by column 'd', breaking ties with column 'b'

Unnamed: 0,b,c,a,d
D,7,6,6,3
C,1,7,6,4
A,7,1,1,8
B,8,8,8,8


In [22]:
# Sort rows by column 'd' descending, breaking ties by column 'b' ascending.
# sort_values returns a new frame and leaves df1 untouched, so the sorted
# result itself must be the cell's last expression for it to be displayed.
df1.sort_values(by=['d','b'],ascending=[False,True])

Unnamed: 0,b,c,a,d
A,7,1,1,8
B,8,8,8,8
C,1,7,6,4
D,7,6,6,3


## 排名(rank)

In [None]:
# Series containing a three-way tie (the value 3) to illustrate the ranking methods
ser=pd.Series([3,4,1,3,3],index=list('abcde'))
ser

In [None]:
ser.rank() # default method='average': tied values share the mean of the ranks they span

In [None]:
ser.rank(method='min')  # tied values all receive the lowest rank of their group

In [None]:
ser.rank(method='max')  # tied values all receive the highest rank of their group

In [None]:
ser.rank(method='first')  # ties are broken by order of appearance in the Series

In [None]:
# 4x4 frame of random integers in [0, 5); the narrow range makes ties likely
df=pd.DataFrame(np.random.randint(0,5,[4,4]))
df

In [None]:
# Default (axis=0): values are ranked within each column
df.rank() 

In [None]:
# axis=1: values are ranked within each row
df.rank(axis=1) 

# 分组+聚合+数据统计

## 数据统计（value_counts）

In [None]:
# Count how many times each distinct value occurs in the Series
obj=pd.Series(['c','a','d','a','a','b','b','c','c'])
obj.value_counts()

## 分组（groupby）

In [6]:
# Demo frame for groupby: two categorical key columns and two numeric columns
# filled with standard-normal random values
data = pd.DataFrame({
    'key1':list('aabba'),  
    'key2': ['one','two','one','two','one'],  
    'data1': np.random.randn(5), 
#     'data1':['1','2','3','4','5'],   # alternative: string-valued data1
    'data2': np.random.randn(5)
})  

data

Unnamed: 0,data1,data2,key1,key2
0,1.188121,0.150637,a,one
1,-0.449035,0.777845,a,two
2,-2.009795,0.156113,b,one
3,0.850294,1.488159,b,two
4,0.80283,0.44234,a,one


In [7]:
# Group the 'data1' values by the labels in 'key1'.
# Displaying `grouped` shows a SeriesGroupBy object, not an aggregated result.
grouped = data['data1'].groupby(data['key1'])
grouped

<pandas.core.groupby.SeriesGroupBy object at 0x000002D1D49A4588>

## 聚合

In [8]:
grouped.mean()  # mean of data1 within each key1 group

key1
a    0.513972
b   -0.579751
Name: data1, dtype: float64

In [12]:
# Group by two keys at once; the aggregated result is a Series indexed by (key1, key2)
means = data['data1'].groupby([data['key1'], data['key2']]).mean()
print(type(means))
means

<class 'pandas.core.series.Series'>


key1  key2
a     one     0.995476
      two    -0.449035
b     one    -2.009795
      two     0.850294
Name: data1, dtype: float64

In [10]:
# Grouping by two keys produced a Series with a hierarchical index;
# unstack() moves the inner level (key2) into the columns
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.995476,-0.449035
b,-2.009795,0.850294


In [17]:
# The group keys do not have to be columns of the frame: any arrays with one
# entry per row can be passed (here two NumPy arrays of length 5)
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
data['data1'].groupby([states, years]).mean()

California  2005   -0.449035
            2006   -2.009795
Ohio        2005    1.019207
            2006    0.802830
Name: data1, dtype: float64

## 自定义聚合操作（apply）

In [None]:
# Helper used by the apply() examples
def f(x):
    """Return ``x`` decreased by one (elementwise when ``x`` is a pandas object)."""
    decremented = x - 1
    return decremented

# Demo frame: 4x4 random integers in [1, 10), columns labelled a-d
data = np.random.randint(low=1, high=10, size=(4, 4))
df1 = pd.DataFrame(data, columns=list('abcd'))
df1

In [None]:
# Apply f to a column selection: only columns 'a' and 'b' are passed to f
print(df1[['a','b']].apply(f))

In [None]:
# Apply f to a row selection: rows labelled 0 through 2 (.loc slices include the end label)
print(df1.loc[0:2].apply(f))

In [None]:
# Sum
def f2(x):
    """Return the sum of the values in ``x``."""
    total = x.sum()
    return total

# Column sums for 'a' and 'b'
print(df1[['a','b']].apply(f2))
# axis = 1 hands each row to f2, giving one sum per row
print(df1.apply(f2, axis=1))

In [None]:
# Mean
def f3(x):
    """Return the arithmetic mean of the values in ``x``."""
    average = x.mean()
    return average

# One mean per column, then one mean per row
print(df1.apply(f3))
print(df1.apply(f3, axis=1))

In [None]:
# Custom operation: string concatenation
data = np.random.randint(low=1, high=10, size=(4, 4))
df1 = pd.DataFrame(data, columns=list('abcd'))

def f4(x,str_text):
    """Return the string form of ``x`` followed by ``str_text``.

    ``x`` is the element handed over by ``apply``; ``str_text`` is the extra
    argument supplied through ``args``.
    """
    element_text = str(x)
    return element_text + str_text

# Extra positional arguments for f4 are passed as a tuple via args
df1['a'].apply(f4, args=('haha',))

In [26]:
# Custom operation: extract year / month / day from a datetime column
import time

# Build the 6-row demo frame in this cell. Relying on a `df` left over from an
# earlier cell fails on a fresh kernel, because that frame has only 4 rows and
# cannot take a 6-element date column.
df=pd.DataFrame(np.random.randn(6,4),columns=list('ABCD'))
df['F']=pd.date_range('20180115',periods=6)
print(df)

def get_day(date_time):
    """Split one timestamp into its year, month and day.

    The value is converted with ``str()`` and parsed using the format
    '%Y-%m-%d %H:%M:%S', which is how the Timestamps in column 'F' print.
    (If the column held 'YYYYMMDD' strings instead, the matching format
    would be '%Y%m%d'.)

    Returns a three-element Series ``[year, month, day]``. Because a Series
    is returned for every element, ``apply`` assembles the results into a
    DataFrame with three columns.
    """
    time_tuple=time.strptime(str(date_time),'%Y-%m-%d %H:%M:%S')
    return pd.Series(time_tuple[0:3])

A = df['F'].apply(get_day)
print(type(A))

          A         B         C         D          F
0  0.400928 -0.081043  0.270293 -0.657813 2018-01-15
1  0.594349  1.117013 -0.690679 -1.077358 2018-01-16
2  0.405129  0.278468 -0.501280  0.184518 2018-01-17
3  1.353222 -0.602726  0.227812 -0.059354 2018-01-18
4  0.003436  0.302138 -0.903067  0.258783 2018-01-19
5  1.366106 -1.397591 -0.053881 -0.970166 2018-01-20
<class 'pandas.core.frame.DataFrame'>


### 练习1

In [None]:
# Exercise: combine groupby with apply to answer the following
# 1. Group by address:          average age and average salary
# 2. Group by age:              average salary
# 3. Group by sex:              average salary
# 4. Group by address then sex: average salary per sex within each address
df6 = pd.DataFrame({
    'name':['joe', 'susan', 'anne', 'black', 'monika','ronaldo','leonarldo','tom','yilianna','bulanni'],
    'age':[19,19,18,20,20,18,19,20,18,19],
    'sex':['man','women','women','man','women','man','man','man','women','women'],
    'address':['上海','北京','上海','北京','北京','上海','北京','上海','北京','上海'],
    'money':[8000,8500,7000,9000,10000,7500,8800,9300,12000,11000]
})
df6

In [None]:
def f1(x):
    """Return the mean of a group; the group must contain numeric data only."""
    return x.mean()

def f2(x):
    """Return the mean of the 'money' column of a group DataFrame."""
    return x['money'].mean()

# Select the numeric columns before applying: the full groups also carry the
# string columns 'name' and 'sex', and taking the mean of those raises
# TypeError on current pandas versions.

# Group by address: average age and salary
print(df6.groupby('address')[['age','money']].apply(f1))
print('-'*20)
# Group by age: average salary
print(df6.groupby('age')['money'].apply(f1))
print('-'*20)
# Group by sex: average salary
print(df6.groupby('sex')[['money']].apply(f2))
print('-'*20)
# Group by address, then by sex: average salary per sex within each address
print(df6.groupby(['address','sex'])[['money']].apply(f2))

### 练习2

In [None]:
# Exercise 2 data: gender ('性别'), grade ('成绩') and age ('年龄') for eight people
df = pd.DataFrame({'性别' : ['男', '女', '男', '女',
                              '男', '女', '男', '男'],
                       '成绩' : ['优秀', '优秀', '及格', '差',
                              '及格', '及格', '优秀', '差'],
                       '年龄' : [15,14,15,12,13,14,15,16]})
df

In [None]:
# (stray input removed: the bare name `e` is not defined anywhere in the notebook and raised NameError on Run All)

In [None]:
# Gender ratio
# Overall head count first, then the head count per gender
total_sex_count = df['性别'].count()
sex_count = df.groupby('性别')['性别'].count()

def _as_percent(group_size):
    """Render one group's share of the overall count as a percentage string."""
    share = group_size / total_sex_count * 100
    return str(share) + '%'

sex_count.apply(_as_percent)

In [None]:
    # Grade distribution: how many people fall into each grade band
# A dedicated name is used here so that the `sex_count` Series built in the
# gender-ratio cell is not rebound to a GroupBy object; the overall total is
# not needed for plain counts, so it is no longer computed.
grade_groups=df.groupby('成绩')['成绩']
grade_groups.apply(lambda x:str(x.count())+'人')

# 制图

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False
%matplotlib inline
data.plot()
data.plot(kind='bar')
data.plot(kind='barh')
data.plot(kind='kde')

## Series 制图

In [None]:
# Single line: cumulative sum of 10 standard-normal draws, plotted against the index 0, 10, ..., 90
s = pd.Series(np.random.randn(10).cumsum(), index=np.arange(0, 100, 10))
s.plot()

In [None]:
# Several lines on the same axes
y1 = np.array([600,500,800,400,300,500])  # values for the first line (plotted with label '鼓浪屿')
y2 = np.array([500,900,800,650,700,600]) # values for the second line (plotted with label '张家界')

# Both Series share the same month labels as their index
ser1 = pd.Series(y1 ,index = ['1月','2月','3月','4月','5月','6月'])
ser2 = pd.Series(y2,index = ['1月','2月','3月','4月','5月','6月'])


# 'g-o' is a matplotlib format string: green, solid line, circle markers
ser1.plot(style = 'g-o',label = '鼓浪屿')
ser2.plot(label = '张家界')

plt.legend() # draw the legend from the labels given above

In [None]:
# Bar chart with the two Series side by side: both use align='edge', one with a
# negative width and one with a positive width, so the two bars of each month
# extend in opposite directions from the same x position.
# NOTE(review): xticks=np.arange(1,7) overrides the tick positions on the second
# call; confirm the tick labels still line up with the bars.
ser1.plot(kind= 'bar',label = '鼓浪屿',color = 'r',alpha = 0.5,width = -0.2,align = 'edge' )
ser2.plot(kind= 'bar',label = '张家界',alpha = 0.5,width =0.2,align = 'edge',xticks= np.arange(1,7))
plt.legend()

## DataFrame制图

In [None]:
# Row 0 holds the monthly values for '鼓浪屿', row 1 those for '张家界'
y = np.array([[600,500,800,400,300,500],[500,900,800,650,700,600]])

# After transposing, the row index supplies the x axis and each column becomes
# one labelled line
df1 = pd.DataFrame(y.T,index = ['1月','2月','3月','4月','5月','6月'],columns=['鼓浪屿','张家界'])
# One format string per column: red and blue solid lines with circle markers.
# (A stray trailing `c` after this call was a syntax error and has been removed.)
df1.plot(style = ['r-o','b-o'])

In [None]:
# Grouped bar chart of the same frame, followed by legend, title and axis labels.
# NOTE(review): `style` takes line-format strings; confirm it has any effect on a bar chart.
df1.plot(kind = 'bar',style = ['r-o','g-o'])
plt.legend()
plt.title('旅游人次')
plt.xlabel('月份')
plt.ylabel('人次/百万')