In [1]:
import pandas as pd

# Data Indexing and Selection

# Data Selection in Series

In [2]:
# Four float values keyed by explicit string labels.
data = pd.Series(
    data=[0.25, 0.5, 0.75, 1.0],
    index=list('abcd'),
)

In [3]:
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [4]:
data['b']

0.5

In [5]:
#We can also use dictionary-like Python expressions and methods to examine the
#keys/indices and values

In [6]:
'a' in data

True

In [7]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [8]:
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [9]:
# dictionary-style assignment to a label that does not exist yet adds a new entry
data['e']=120

In [10]:
data

a      0.25
b      0.50
c      0.75
d      1.00
e    120.00
dtype: float64

In [11]:
# slicing by explicit index: label-based, and the end label 'd' is included
data['a':'d']

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [12]:
# slicing by implicit integer index: positional, and the end position 2 is excluded
data[0:2]

a    0.25
b    0.50
dtype: float64

In [13]:
# masking

In [15]:
# boolean mask: keep only the entries strictly between 0.3 and 0.8
data[(data>0.3) & (data<0.8)]

b    0.50
c    0.75
dtype: float64

In [16]:
# fancy indexing: a list of labels selects those entries, in the order given
data[['a','e','c']]

a      0.25
e    120.00
c      0.75
dtype: float64

In [17]:
#Note: slicing may be the source of the most confusion. Notice that when you
#are slicing with an explicit index (e.g., data['a':'c']), the final index is included in
#the slice, while when you're slicing with an implicit index (e.g., data[0:2]), the final
#index is excluded from the slice.

# Indexers: loc, iloc, and ix

In [18]:
#These slicing and indexing conventions can be a source of confusion. For example, if
#your Series has an explicit integer index, an indexing operation such as data[1] will
#use the explicit indices, while a slicing operation like data[1:3] will use the implicit
#Python-style index.

In [19]:
# Series whose explicit index consists of non-contiguous integers.
data = pd.Series(
    data=['a', 'b', 'c'],
    index=[1, 3, 5],
)
data

1    a
3    b
5    c
dtype: object

In [20]:
# an integer key is looked up in the explicit index: this is the entry labelled 1
data[1]

'a'

In [21]:
# an integer slice is positional: the entries at positions 1 and 2 (labels 3 and 5)
data[1:3]

3    b
5    c
dtype: object

In [22]:
#Because of this potential confusion in the case of integer indexes, Pandas provides
#some special indexer attributes that explicitly expose certain indexing schemes. These
#are not functional methods, but attributes that expose a particular slicing interface to
#the data in the Series .

# First, the loc attribute allows indexing and slicing that always references the explicit index

In [25]:
data

1    a
3    b
5    c
dtype: object

In [26]:
# loc: lookup by explicit label, i.e. the entry labelled 1
data.loc[1]

'a'

In [27]:
# loc slice is label-based and includes the end label 3
data.loc[1:3]

1    a
3    b
dtype: object

# The iloc attribute allows indexing and slicing that always references the implicit Python-style index

In [29]:
data

1    a
3    b
5    c
dtype: object

In [30]:
# iloc: lookup by position, i.e. the entry at position 1 (label 3)
data.iloc[1]

'b'

In [31]:
# iloc slice is positional and excludes the end position 3
data.iloc[1:3]

3    b
5    c
dtype: object

In [32]:
#A third indexing attribute, ix, was a hybrid of the two, and for Series objects was
#equivalent to standard []-based indexing. Its purpose was most apparent in the context
#of DataFrame objects. Note that ix was deprecated in pandas 0.20 and removed in
#pandas 1.0, so it is not demonstrated here; use loc or iloc instead.

# Data Selection in DataFrame

# DataFrame as a dictionary

In [34]:
# Per-state area figures, keyed by state name.
area = pd.Series({
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
})
# Per-state population figures, keyed by the same state names.
pop = pd.Series({
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
})
# Each Series becomes one column; the shared state labels become the row index.
data = pd.DataFrame({'area': area, 'pop': pop})
data

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [35]:
#The individual Series that make up the columns of the DataFrame can be accessed
#via dictionary-style indexing of the column name

In [36]:
data['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [37]:
data['pop']

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
Name: pop, dtype: int64

In [38]:
#Equivalently, we can use attribute-style access with column names that are strings

In [39]:
data.area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [40]:
#This attribute-style column access actually accesses the exact same object as the
#dictionary-style access:

In [41]:
# NOTE(review): the True result relies on pandas handing back the same cached
# column object for both access styles; with copy-on-write enabled each access
# may return a new object and this would be False - confirm on the target version
data.area is data['area']

True

In [42]:
#Though this is a useful shorthand, keep in mind that it does not work for all cases!
#For example, if the column names are not strings, or if the column names conflict
#with methods of the DataFrame, this attribute-style access is not possible. For example,
#the DataFrame has a pop() method, so data.pop will point to this rather than the
#"pop" column:

In [43]:
# data.pop resolves to the DataFrame's pop() method, so it is not the 'pop' column
data.pop is data['pop']

False

In [44]:
# Add a derived column: population divided element-wise by area.
data['density'] = data['pop'].div(data['area'])

In [45]:
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


# DataFrame as two-dimensional array

In [46]:
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [47]:
# View the DataFrame as a plain two-dimensional NumPy array. to_numpy() is the
# accessor the pandas documentation recommends over the older .values attribute;
# the resulting array is the same.
data.to_numpy()

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02],
       [1.49995000e+05, 1.28821350e+07, 8.58837628e+01]])

In [48]:
#we can transpose the full DataFrame to swap rows and
#columns

In [49]:
data.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
area,423967.0,695662.0,141297.0,170312.0,149995.0
pop,38332520.0,26448190.0,19651130.0,19552860.0,12882140.0
density,90.41393,38.01874,139.0767,114.8061,85.88376


In [50]:
#When it comes to indexing of DataFrame objects, however, it is clear that the
#dictionary-style indexing of columns precludes our ability to simply treat it as a
#NumPy array. In particular, passing a single index to an array accesses a row

In [53]:
# A single index into the underlying NumPy array selects a row (here the first).
# to_numpy() is the accessor pandas recommends over the older .values attribute.
data.to_numpy()[0]  # row

array([4.23967000e+05, 3.83325210e+07, 9.04139261e+01])

In [54]:
data['area'] #column

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [55]:
#For array-style indexing of a DataFrame, Pandas again uses the loc and iloc indexers
#mentioned earlier (the ix indexer was removed in pandas 1.0). Using the iloc indexer,
#we can index the underlying array as if it is a simple NumPy array (using the implicit
#Python-style index), but the DataFrame index and column labels are maintained in
#the result.

In [56]:
# iloc: the first three rows and first two columns, selected by position
data.iloc[:3,:2]

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127


In [57]:
# loc: label slices, inclusive of the 'Florida' row and the 'pop' column
data.loc[:'Florida',:'pop']

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860


In [61]:
#In the loc indexer we can combine masking and fancy indexing

In [62]:
# Row mask (density above 100) combined with a list of column labels.
data.loc[data['density'] > 100, ['pop', 'density']]

Unnamed: 0,pop,density
New York,19651127,139.076746
Florida,19552860,114.806121


In [63]:
#Any of these indexing conventions may also be used to set or modify values; this is
#done in the standard way that you might be accustomed to from working with
#NumPy

In [66]:
# positional assignment: row 0 (California), column 2 (density)
data.iloc[0,2]=10

In [67]:
data

Unnamed: 0,area,pop,density
California,423967,38332521,10.0
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


# Additional indexing conventions

In [68]:
#There are a couple of extra indexing conventions that might seem at odds with the
#preceding discussion, but nevertheless can be very useful in practice. First, while
#indexing refers to columns, slicing refers to rows:

In [69]:
# a label slice with plain [] selects rows, and the end label is included
data['New York':'Illinois']

Unnamed: 0,area,pop,density
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [70]:
#Such slices can also refer to rows by number rather than by index

In [71]:
# an integer slice with plain [] selects rows by position (rows 1 and 2)
data[1:3]

Unnamed: 0,area,pop,density
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746


In [72]:
#Similarly, direct masking operations are also interpreted row-wise rather than
#column-wise

In [74]:
# A boolean mask passed to plain [] filters rows: keep those with density above 100.
data[data['density'] > 100]

Unnamed: 0,area,pop,density
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121


# Thank You