pandas中，选取特定行、列有两种方式：一种是操作符“[]”访问方式，称为Indexing Operator；另一种是通过.loc和.iloc方式，即Indexer（索引器）来选择特定行、列。

# 4.1 使用.loc和.iloc筛选行和列数据
    series和Dataframe具有极大的灵活性。.loc函数支持通过索引标签（Index Label）的方式访问数据，而.iloc函数支持通过整数索引的方式访问数据。

In [None]:
import pandas as pd

In [22]:
# 设置最大显示行列数
pd.set_option('display.max_rows',10)
pd.set_option('display.max_columns',10)


In [23]:
# 数据原目录
data_source = r"Y:\BaiduNetdiskWorkspace\data_analysis\Python数据分析\data"

In [24]:
# 导入数据
data = pd.read_csv(data_source+"\\Online_Retail_Fake.csv")

In [25]:
data['UnitPrice'].fillna(data['UnitPrice'].mean(),inplace=True)

In [26]:
data['UnitPrice'].isnull().sum()

0

In [27]:
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010/12/1 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010/12/1 8:26,4.611117,17850.0,United Kingdom
2,536365,84406B,,8,2010/12/1 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010/12/1 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010/12/1 8:26,3.39,17850.0,United Kingdom


In [28]:
data['Total_Price'] = data['Quantity']*data['UnitPrice']

In [29]:
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Total_Price
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010/12/1 8:26,2.55,17850.0,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010/12/1 8:26,4.611117,17850.0,United Kingdom,27.666699
2,536365,84406B,,8,2010/12/1 8:26,2.75,17850.0,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010/12/1 8:26,3.39,17850.0,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010/12/1 8:26,3.39,17850.0,United Kingdom,20.34


## 4.1.1
    选择Series和DataFrame中的行

In [30]:
Country = data['Country']

In [31]:
Country.head()

0    United Kingdom
1    United Kingdom
2    United Kingdom
3    United Kingdom
4    United Kingdom
Name: Country, dtype: object

In [32]:
# 上面是用[]操作符对data求Series列，如果要获取特定的几行，可以用.iloc[]
print(Country.iloc[2])
print(Country.iloc[4])
print(Country.iloc[541908])

United Kingdom
United Kingdom
France


In [33]:
# 如果想访问多行数据，可以给.iloc传入一个list列表
print(Country.iloc[[1,2,3,541908]])

1         United Kingdom
2         United Kingdom
3         United Kingdom
541908            France
Name: Country, dtype: object


In [34]:
# 除了列表外，.iloc函数还支持切片操作
print(Country.iloc[2:549900:299])

2         United Kingdom
301       United Kingdom
600       United Kingdom
899       United Kingdom
1198      United Kingdom
               ...      
540594    United Kingdom
540893    United Kingdom
541192           Belgium
541491    United Kingdom
541790           Germany
Name: Country, Length: 1813, dtype: object


In [35]:
# .iloc适用于index是数字型的索引，如果索引是其他类型的，比如下面这种情况的时候，我们可以用.loc来索引。
college = pd.read_csv(data_source+"\\College.csv")

In [36]:
college.head()

Unnamed: 0.1,Unnamed: 0,Private,Apps,Accept,Enroll,...,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,Abilene Christian University,Yes,1660,1232,721,...,78,18.1,12,7041,60
1,Adelphi University,Yes,2186,1924,512,...,30,12.2,16,10527,56
2,Adrian College,Yes,1428,1097,336,...,66,12.9,30,8735,54
3,Agnes Scott College,Yes,417,349,137,...,97,7.7,37,19016,59
4,Alaska Pacific University,Yes,193,146,55,...,72,11.9,2,10922,15


In [37]:
college.set_index('Unnamed: 0',inplace=True)

In [38]:
col_name = {'Unnamed: 0':'college_name'}

In [39]:
college.rename(columns=col_name,inplace=True)

In [40]:
college.set_index('college_name',inplace=True)

KeyError: "None of ['college_name'] are in the columns"

In [None]:
college

In [None]:
college.head(20)

In [41]:
# 利用.loc进行索引，切片。
college.loc['Abilene Christian University':'Angelo State University':2]

Unnamed: 0_level_0,Private,Apps,Accept,Enroll,Top10perc,...,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Abilene Christian University,Yes,1660,1232,721,23,...,78,18.1,12,7041,60
Adrian College,Yes,1428,1097,336,22,...,66,12.9,30,8735,54
Alaska Pacific University,Yes,193,146,55,16,...,72,11.9,2,10922,15
Albertus Magnus College,Yes,353,340,103,17,...,93,11.5,26,8861,63
Albright College,Yes,1038,839,227,30,...,84,11.3,23,11644,80
Alfred University,Yes,1732,1425,472,37,...,88,11.3,31,10932,73
Allentown Coll. of St. Francis de Sales,Yes,1179,780,290,38,...,84,13.3,21,7940,74
Alverno College,Yes,494,313,157,23,...,69,11.1,26,8127,55
Amherst College,Yes,4302,992,418,83,...,98,8.4,63,21424,100
Andrews University,Yes,1130,704,322,14,...,66,11.5,18,10908,46


# 4.1.2 同时选择行与列
    利用类似下面的代码的方式，可以同时选择行和列
          df.iloc[rows,columns]
          df.loc[rows,columns]
    其中，rows代表了行的选择，既可以是列表，也可以是数据切片的方式输入，colunms代表了列的选择。

In [42]:
college

Unnamed: 0_level_0,Private,Apps,Accept,Enroll,Top10perc,...,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Abilene Christian University,Yes,1660,1232,721,23,...,78,18.1,12,7041,60
Adelphi University,Yes,2186,1924,512,16,...,30,12.2,16,10527,56
Adrian College,Yes,1428,1097,336,22,...,66,12.9,30,8735,54
Agnes Scott College,Yes,417,349,137,60,...,97,7.7,37,19016,59
Alaska Pacific University,Yes,193,146,55,16,...,72,11.9,2,10922,15
...,...,...,...,...,...,...,...,...,...,...,...
Worcester State College,No,2197,1515,543,4,...,60,21.0,14,4469,40
Xavier University,Yes,1959,1805,695,24,...,75,13.3,31,9189,83
Xavier University of Louisiana,Yes,2097,1915,695,34,...,75,14.4,20,8323,49
Yale University,Yes,10705,2453,1317,95,...,96,5.8,49,40386,99


In [43]:
# 利用.loc选择行与列
college.loc['Adelphi University':'Alaska Pacific University',              ##行
            :'Enroll']                                                     ##列

Unnamed: 0_level_0,Private,Apps,Accept,Enroll
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Adelphi University,Yes,2186,1924,512
Adrian College,Yes,1428,1097,336
Agnes Scott College,Yes,417,349,137
Alaska Pacific University,Yes,193,146,55


In [44]:
# 当然也可以用.iloc选择行与列
college.iloc[1:4,:4]                                               # 与.loc形式完全相同


Unnamed: 0_level_0,Private,Apps,Accept,Enroll
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Adelphi University,Yes,2186,1924,512
Adrian College,Yes,1428,1097,336
Agnes Scott College,Yes,417,349,137


## 4.2 布尔选择
    上一节中已经学到了如何选择行列来筛选数据。但是一般还是更多的选择布尔选择方式来筛选数据。
    基本思路：
        通过对Pandas中的Series和DataFrame进行逻辑运算后得到一个新的Series或者DataFrame，其中的数据就是布尔类型，
        利用布尔类型的数据来进行数据分析。

In [45]:
# 引入数据
cwur = pd.read_csv(data_source+"\\cwurData.csv")
cwur.head(10)

Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,...,citations,broad_impact,patents,score,year
0,1,Harvard University,USA,1,7,...,1,,5,100.0,2012
1,2,Massachusetts Institute of Technology,USA,2,9,...,4,,1,91.67,2012
2,3,Stanford University,USA,3,17,...,2,,15,89.5,2012
3,4,University of Cambridge,United Kingdom,1,10,...,11,,50,86.17,2012
4,5,California Institute of Technology,USA,4,2,...,22,,18,85.21,2012
5,6,Princeton University,USA,5,8,...,26,,101,82.5,2012
6,7,University of Oxford,United Kingdom,2,13,...,19,,26,82.34,2012
7,8,Yale University,USA,6,14,...,15,,66,79.14,2012
8,9,Columbia University,USA,7,23,...,14,,5,78.86,2012
9,10,"University of California, Berkeley",USA,8,16,...,3,,16,78.55,2012


## 4.2.1 计算布尔值

In [46]:
# 选择那些score>85的学校，先创造出一列新的Series，内容是布尔型

loc = cwur.columns.get_loc('score')  # 先用columns.get_loc获取列的位置

cwur.insert(loc= loc+1,column='score_bool',value=cwur['score']>=85)   # 再用insert()函数插入数据

In [47]:
cwur

Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,...,broad_impact,patents,score,score_bool,year
0,1,Harvard University,USA,1,7,...,,5,100.00,True,2012
1,2,Massachusetts Institute of Technology,USA,2,9,...,,1,91.67,True,2012
2,3,Stanford University,USA,3,17,...,,15,89.50,True,2012
3,4,University of Cambridge,United Kingdom,1,10,...,,50,86.17,True,2012
4,5,California Institute of Technology,USA,4,2,...,,18,85.21,True,2012
...,...,...,...,...,...,...,...,...,...,...,...
2195,996,University of the Algarve,Portugal,7,367,...,969.0,816,44.03,False,2015
2196,997,Alexandria University,Egypt,4,236,...,981.0,871,44.03,False,2015
2197,998,Federal University of Ceará,Brazil,18,367,...,975.0,824,44.03,False,2015
2198,999,University of A Coruña,Spain,40,367,...,975.0,651,44.02,False,2015


In [48]:
# 求score>=85的学校的数目以及比例
cwur['score_bool'].sum()
# 利用mean还可以求出比例
cwur['score_bool'].mean()            # True=1,False=0,所以mean()可以求出比例

0.015909090909090907

In [49]:
# 除了与一个值进行比较，还可以几列之间比较得出bool值
cwur['bool_2'] = cwur[ 'quality_of_education']>cwur['alumni_employment']

In [50]:
cwur[['quality_of_education','alumni_employment']]

Unnamed: 0,quality_of_education,alumni_employment
0,7,9
1,9,17
2,17,11
3,10,24
4,2,29
...,...,...
2195,367,567
2196,236,566
2197,367,549
2198,367,567


In [51]:
cwur

Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,...,patents,score,score_bool,year,bool_2
0,1,Harvard University,USA,1,7,...,5,100.00,True,2012,False
1,2,Massachusetts Institute of Technology,USA,2,9,...,1,91.67,True,2012,False
2,3,Stanford University,USA,3,17,...,15,89.50,True,2012,True
3,4,University of Cambridge,United Kingdom,1,10,...,50,86.17,True,2012,False
4,5,California Institute of Technology,USA,4,2,...,18,85.21,True,2012,False
...,...,...,...,...,...,...,...,...,...,...,...
2195,996,University of the Algarve,Portugal,7,367,...,816,44.03,False,2015,False
2196,997,Alexandria University,Egypt,4,236,...,871,44.03,False,2015,False
2197,998,Federal University of Ceará,Brazil,18,367,...,824,44.03,False,2015,False
2198,999,University of A Coruña,Spain,40,367,...,651,44.02,False,2015,False


In [52]:
# 对score>=85和cwur[ 'quality_of_education']>cwur['alumni_employment']同时满足  
(cwur['score']>=85) & \
(cwur[ 'quality_of_education']>cwur['alumni_employment'])            # 要用()包起来否则会出错

0       False
1       False
2        True
3       False
4       False
        ...  
2195    False
2196    False
2197    False
2198    False
2199    False
Length: 2200, dtype: bool

##  4.2.2 多条件筛选数据

In [53]:
## 利用多个条件做出bool_Series
##　比如求出中国排名前100的大学
crit_1 = cwur['country']=='China'
crit_2 = cwur['world_rank']<=100

cwur['China world\'s rank before 100'] = crit_1 & crit_2
cwur['China world\'s rank before 100'].sum()

cwur[cwur['China world\'s rank before 100']]         # 利用bool值对DataFrame进行筛选

Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,...,score,score_bool,year,bool_2,China world's rank before 100
254,55,Peking University,China,1,355,...,55.3,False,2014,True,True
286,87,Tsinghua University,China,2,294,...,52.6,False,2014,True,True
1255,56,Peking University,China,1,182,...,54.26,False,2015,True,True
1277,78,Tsinghua University,China,2,309,...,52.21,False,2015,True,True


![按照index排序](数据按照index排序.jpg)

In [54]:
sh = pd.read_csv(data_source+"\\sh.csv")

In [55]:
sh.head()

Unnamed: 0,date,open,high,close,low,volume,amount
0,2019-08-30,2907.383,2914.577,2886.237,2874.103,19395995100,224751169931
1,2019-08-29,2895.999,2898.605,2890.919,2878.588,17861308200,196332770521
2,2019-08-28,2901.627,2905.435,2893.756,2887.012,18309790300,201805050637
3,2019-08-27,2879.515,2919.644,2902.193,2879.406,20814179400,230999692857
4,2019-08-26,2851.016,2870.494,2863.567,2849.238,16989536300,191036667851


In [56]:
sh.set_index('date',inplace=True)

In [57]:
sh.head()

Unnamed: 0_level_0,open,high,close,low,volume,amount
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-08-30,2907.383,2914.577,2886.237,2874.103,19395995100,224751169931
2019-08-29,2895.999,2898.605,2890.919,2878.588,17861308200,196332770521
2019-08-28,2901.627,2905.435,2893.756,2887.012,18309790300,201805050637
2019-08-27,2879.515,2919.644,2902.193,2879.406,20814179400,230999692857
2019-08-26,2851.016,2870.494,2863.567,2849.238,16989536300,191036667851


In [58]:
sh.sort_index(inplace=True)

In [59]:
sh.head()

Unnamed: 0_level_0,open,high,close,low,volume,amount
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2009-01-05,1849.02,1880.716,1880.716,1844.094,6713671200,46100959232
2009-01-06,1878.827,1938.69,1937.145,1871.971,9906675200,69012570112
2009-01-07,1938.974,1948.233,1924.012,1920.515,9236008800,63931166720
2009-01-08,1890.242,1894.171,1878.181,1862.263,8037400000,55076814848
2009-01-09,1875.164,1909.349,1904.861,1875.164,7122477600,50131263488


In [78]:
# 获得了每日的close涨幅。
sh['perc'] = sh['close'].pct_change(periods=1)
sh.head() 

Unnamed: 0_level_0,open,high,close,low,volume,amount,perc
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2009-01-05,1849.02,1880.716,1880.716,1844.094,6713671200,46100959232,
2009-01-06,1878.827,1938.69,1937.145,1871.971,9906675200,69012570112,0.030004
2009-01-07,1938.974,1948.233,1924.012,1920.515,9236008800,63931166720,-0.00678
2009-01-08,1890.242,1894.171,1878.181,1862.263,8037400000,55076814848,-0.023821
2009-01-09,1875.164,1909.349,1904.861,1875.164,7122477600,50131263488,0.014205


In [87]:
##下面获取极端数据的数目
extreme_condition = (sh['perc']>=0.025)  |   (sh['perc']<=-0.025)
extreme = sh[extreme_condition]
extreme_num = extreme.shape[0]
# 求比例
nomal_num = sh['perc'].shape[0]
rd = extreme_num/nomal_num

In [89]:
print(extreme_num)
print(rd)

199
0.07797805642633229
