<div style="text-align: right">Excel to Python: Chapter5 & 6 Extract and Filter</div>
<div style="text-align: right">Zixiao With Material from 从Excel到Python</div>
<div style="text-align: right">December, 21, 2019</div>

In [1]:
import numpy as np
import pandas as pd
pd.options.display.max_columns = None # Display all column of dataframe

In [2]:
df = pd.DataFrame(pd.read_csv('./data/PRSA_Data_20130301-20170228/PRSA_Data_Aotizhongxin_20130301-20170228.csv'))
# df=pd.DataFrame(pd.read_Excel('name.xlsx'))

In [3]:
df = pd.DataFrame({"id":[1001,1002,1003,1004,1005,1006], 
                   "date":pd.date_range('20130102', periods=6),
                   "city":['Beijing ', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai', 'BEIJING '],
                   "age":[23,44,54,32,34,32], 
                   "category":['100-A','100-B','110-A','110-C','210-A','130-F'],
                   "price":[1200,np.nan,2133,5433,np.nan,4432]}, 
                  columns =['id','date','city','category','age','price'])

# Chapter 5 Data Extract

## Extract by Index

In [4]:
df.loc[2]

id                         1003
date        2013-01-04 00:00:00
city                 guangzhou 
category                  110-A
age                          54
price                      2133
Name: 2, dtype: object

In [5]:
df.loc[:2]

Unnamed: 0,id,date,city,category,age,price
0,1001,2013-01-02,Beijing,100-A,23,1200.0
1,1002,2013-01-03,SH,100-B,44,
2,1003,2013-01-04,guangzhou,110-A,54,2133.0


## Reset the index

In [6]:
df.reset_index()

Unnamed: 0,index,id,date,city,category,age,price
0,0,1001,2013-01-02,Beijing,100-A,23,1200.0
1,1,1002,2013-01-03,SH,100-B,44,
2,2,1003,2013-01-04,guangzhou,110-A,54,2133.0
3,3,1004,2013-01-05,Shenzhen,110-C,32,5433.0
4,4,1005,2013-01-06,shanghai,210-A,34,
5,5,1006,2013-01-07,BEIJING,130-F,32,4432.0


In [14]:
df = df.set_index('date')

In [26]:
# Extract data before 2013-01-05
df.loc[:'2013-01-05']

Unnamed: 0_level_0,id,city,category,age,price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-01-02,1001,Beijing,100-A,23,1200.0
2013-01-03,1002,SH,100-B,44,
2013-01-04,1003,guangzhou,110-A,54,2133.0
2013-01-05,1004,Shenzhen,110-C,32,5433.0


## Extract by Location

In [27]:
df.iloc[2:3]

Unnamed: 0_level_0,id,city,category,age,price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-01-04,1003,guangzhou,110-A,54,2133.0


In [28]:
# At the same time if we
df.loc[2:3]

TypeError: cannot do slice indexing on <class 'pandas.core.indexes.datetimes.DatetimeIndex'> with these indexers [2] of <class 'int'>

## Extract by ```ix```

In [29]:
df.ix[:'2013-01-04']

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,id,city,category,age,price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-01-02,1001,Beijing,100-A,23,1200.0
2013-01-03,1002,SH,100-B,44,
2013-01-04,1003,guangzhou,110-A,54,2133.0


In [31]:
df.ix[:3]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,id,city,category,age,price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-01-02,1001,Beijing,100-A,23,1200.0
2013-01-03,1002,SH,100-B,44,
2013-01-04,1003,guangzhou,110-A,54,2133.0


* ```loc``` will extract by index, includes start and end
* ```iloc``` will extract by location, includes start without end.
* ```is``` will extract by index's label. If it can not extract it will turn to extract by index's location

## Extract by Condition

In [40]:
df['city'].map(str.strip).str.lower().isin(['beijing'])

date
2013-01-02     True
2013-01-03    False
2013-01-04    False
2013-01-05    False
2013-01-06    False
2013-01-07     True
Name: city, dtype: bool

In [41]:
# Combain isin with loc
df.loc[df['city'].map(str.strip).str.lower().isin(['beijing','shanghai'])]

Unnamed: 0_level_0,id,city,category,age,price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-01-02,1001,Beijing,100-A,23,1200.0
2013-01-06,1005,shanghai,210-A,34,
2013-01-07,1006,BEIJING,130-F,32,4432.0


In [44]:
# Extract and split the column values
pd.DataFrame((x.split('-') for x in df['category']),index = df.index,columns=['size','category'])

Unnamed: 0_level_0,size,category
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-01-02,100,A
2013-01-03,100,B
2013-01-04,110,A
2013-01-05,110,C
2013-01-06,210,A
2013-01-07,130,F


## Chapter 6 Data Filter

## Filter by Condition (and or not)

In [53]:
# Filter age > 25 and city is beijign
df['city'] = df['city'].map(str.strip).str.lower()
df.head(10)


Unnamed: 0_level_0,id,city,category,age,price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-01-02,1001,beijing,100-A,23,1200.0
2013-01-03,1002,sh,100-B,44,
2013-01-04,1003,guangzhou,110-A,54,2133.0
2013-01-05,1004,shenzhen,110-C,32,5433.0
2013-01-06,1005,shanghai,210-A,34,
2013-01-07,1006,beijing,130-F,32,4432.0


In [60]:
df.loc[(df['age'] >20)&(df['city']=='beijing'),['price','age']]

Unnamed: 0_level_0,price,age
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-01-02,1200.0,23
2013-01-07,4432.0,32


In [66]:
# Filter and Sort
df.loc[(df['city']=='beijing')].sort_values(by = ['age'])

Unnamed: 0_level_0,id,city,category,age,price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-01-02,1001,beijing,100-A,23,1200.0
2013-01-07,1006,beijing,130-F,32,4432.0


In [67]:
# Filter and Sum
df.loc[(df['city']=='beijing')].sort_values(by = ['age']).price.sum()

5632.0

In [70]:
# Filter and Count
df.loc[(df['city']=='beijing')].sort_values(by = ['age']).city.count()

2

## Filter by query

In [73]:
df.query("city == ['shanghai','beijing']").price.sum()

5632.0