In [1]:
import pandas as pd
import numpy as np

In [2]:
from pandas import Series, DataFrame

In [3]:
kakao = Series([100, 200])
print(kakao)

0    100
1    200
dtype: int64


In [4]:
# Column-oriented input: every key becomes a column label and every list
# supplies that column's values, top to bottom.
raw_data = {
    'col0': [1, 2, 3, 4],
    'col1': [10, 20, 30, 40],
    'col2': [100, 200, 300, 400],
}
# No index is given, so the rows get the default 0..3 integer labels.
data = DataFrame(data=raw_data)
print(data)

   col0  col1  col2
0     1    10   100
1     2    20   200
2     3    30   300
3     4    40   400


In [5]:
date = ['16.02.29', '16.02.26', '16.02.23', '16.02.27']
data1 = DataFrame(raw_data, index=date)
print(data1)

          col0  col1  col2
16.02.29     1    10   100
16.02.26     2    20   200
16.02.23     3    30   300
16.02.27     4    40   400


In [6]:
day_data1 = data1.loc['16.02.29']
print(day_data1)

col0      1
col1     10
col2    100
Name: 16.02.29, dtype: int64


In [7]:
col1 = data1['col1']
print(col1)

16.02.29    10
16.02.26    20
16.02.23    30
16.02.27    40
Name: col1, dtype: int64


In [8]:
print(data1.columns)
print(data1.index)

Index(['col0', 'col1', 'col2'], dtype='object')
Index(['16.02.29', '16.02.26', '16.02.23', '16.02.27'], dtype='object')


In [9]:
# Passing an existing DataFrame together with `columns=` *selects* those labels
# from it; it does not rename anything. None of the three labels exist in
# data1, so the result keeps data1's index but every value is NaN.
# NOTE(review): 'Samung' looks like a typo for 'Samsung', and if the goal was
# to relabel col0/col1/col2 a rename is needed instead -- confirm intent.
data2 = DataFrame(data1, columns = ['Samung', 'Lg', 'Lotte'])
print(data2)

          Samung  Lg  Lotte
16.02.29     NaN NaN    NaN
16.02.26     NaN NaN    NaN
16.02.23     NaN NaN    NaN
16.02.27     NaN NaN    NaN


## Indexing and selecting data
[User guide](https://pandas.pydata.org/docs/user_guide/indexing.html)

Getting values from an object with multi-axes selection uses the following notation (using `.loc` as an example, but the following applies to `.iloc` as well). Any of the axes accessors may be the null slice `:`. Axes left out of the specification are assumed to be `:`, e.g. `p.loc['a']` is equivalent to `p.loc['a', :, :]`.

| Object Type | Indexers                             |
| :---------- | :----------------------------------- |
| Series      | `s.loc[indexer]`                     |
| DataFrame   | `df.loc[row_indexer,column_indexer]` |



As mentioned when introducing the data structures in the [last section](https://pandas.pydata.org/docs/user_guide/basics.html#basics), the primary function of indexing with `[]` (a.k.a. `__getitem__` for those familiar with implementing class behavior in Python) is selecting out lower-dimensional slices. The following table shows return type values when indexing pandas objects with `[]`:

| Object Type | Selection        | Return Value Type                 |
| :---------- | :--------------- | :-------------------------------- |
| Series      | `series[label]`  | scalar value                      |
| DataFrame   | `frame[colname]` | `Series` corresponding to colname |

In [10]:
dates = pd.date_range('1/1/2000', periods=8)
df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
2000-01-01,1.385499,0.179743,0.657237,0.385064
2000-01-02,1.490875,-1.062634,-0.793167,-0.519884
2000-01-03,-0.600425,-0.494038,-0.185138,1.084396
2000-01-04,0.630187,0.12875,-0.462781,1.644034
2000-01-05,-0.285623,-0.767879,0.81075,-0.239719
2000-01-06,-1.004565,0.647528,-0.525881,2.021433
2000-01-07,-0.23343,0.888422,0.279891,-0.348496
2000-01-08,-1.24654,0.05162,-1.340524,-1.471125


In [11]:
s = df['A']
s[dates[5]]

-1.0045647832806628

In [12]:
# Assigning a two-column selection back under the reversed labels swaps the
# contents of columns A and B (compare with the frame displayed earlier).
# Note: the cell mutates df in place, so running it a second time swaps back.
df[['B', 'A']] = df[['A', 'B']]
df

Unnamed: 0,A,B,C,D
2000-01-01,0.179743,1.385499,0.657237,0.385064
2000-01-02,-1.062634,1.490875,-0.793167,-0.519884
2000-01-03,-0.494038,-0.600425,-0.185138,1.084396
2000-01-04,0.12875,0.630187,-0.462781,1.644034
2000-01-05,-0.767879,-0.285623,0.81075,-0.239719
2000-01-06,0.647528,-1.004565,-0.525881,2.021433
2000-01-07,0.888422,-0.23343,0.279891,-0.348496
2000-01-08,0.05162,-1.24654,-1.340524,-1.471125


In [13]:
df[['A', 'B']]

Unnamed: 0,A,B
2000-01-01,0.179743,1.385499
2000-01-02,-1.062634,1.490875
2000-01-03,-0.494038,-0.600425
2000-01-04,0.12875,0.630187
2000-01-05,-0.767879,-0.285623
2000-01-06,0.647528,-1.004565
2000-01-07,0.888422,-0.23343
2000-01-08,0.05162,-1.24654


In [14]:
# Unlike plain [] assignment, .loc aligns the right-hand side on column labels
# before writing: A is written into A and B into B, so nothing changes (the
# output below is identical to the previous cell's). To really swap through
# .loc, assign raw values instead, e.g. df[['A', 'B']].to_numpy().
df.loc[:, ['B', 'A']] = df[['A', 'B']]
df[['A', 'B']]

Unnamed: 0,A,B
2000-01-01,0.179743,1.385499
2000-01-02,-1.062634,1.490875
2000-01-03,-0.494038,-0.600425
2000-01-04,0.12875,0.630187
2000-01-05,-0.767879,-0.285623
2000-01-06,0.647528,-1.004565
2000-01-07,0.888422,-0.23343
2000-01-08,0.05162,-1.24654


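pandas aligns all axes when setting `Series` and `DataFrame` values through `.loc` and `.iloc`.

The `.loc` assignment above therefore does **not** modify `df`: the columns of the right-hand side are aligned by label before the values are assigned, so `A` is written back into `A` and `B` into `B`. To swap the columns through `.loc`, assign the raw values instead, e.g. `df.loc[:, ['B', 'A']] = df[['A', 'B']].to_numpy()`.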

### Attribute access
You may access an index on a `Series` or column on a `DataFrame` directly as an attribute:

In [15]:
sa = pd.Series([1, 2, 3], index=list('abc'))
dfa = df.copy()

In [16]:
sa.b

2

In [17]:
dfa.A

2000-01-01    0.179743
2000-01-02   -1.062634
2000-01-03   -0.494038
2000-01-04    0.128750
2000-01-05   -0.767879
2000-01-06    0.647528
2000-01-07    0.888422
2000-01-08    0.051620
Freq: D, Name: A, dtype: float64

In [18]:
sa.a = 5
sa

a    5
b    2
c    3
dtype: int64

In [19]:
dfa.A = list(range(len(dfa.index)))   # ok if A already exists
dfa

Unnamed: 0,A,B,C,D
2000-01-01,0,1.385499,0.657237,0.385064
2000-01-02,1,1.490875,-0.793167,-0.519884
2000-01-03,2,-0.600425,-0.185138,1.084396
2000-01-04,3,0.630187,-0.462781,1.644034
2000-01-05,4,-0.285623,0.81075,-0.239719
2000-01-06,5,-1.004565,-0.525881,2.021433
2000-01-07,6,-0.23343,0.279891,-0.348496
2000-01-08,7,-1.24654,-1.340524,-1.471125


In [20]:
dfa['F'] = list(range(len(dfa.index)))  # use this form to create a new column
dfa

Unnamed: 0,A,B,C,D,F
2000-01-01,0,1.385499,0.657237,0.385064,0
2000-01-02,1,1.490875,-0.793167,-0.519884,1
2000-01-03,2,-0.600425,-0.185138,1.084396,2
2000-01-04,3,0.630187,-0.462781,1.644034,3
2000-01-05,4,-0.285623,0.81075,-0.239719,4
2000-01-06,5,-1.004565,-0.525881,2.021433,5
2000-01-07,6,-0.23343,0.279891,-0.348496,6
2000-01-08,7,-1.24654,-1.340524,-1.471125,7


You can also assign a `dict` to a row of a `DataFrame`:

In [21]:
x = pd.DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]})
x.iloc[1] = {'x': 9, 'y': 99}
x

Unnamed: 0,x,y
0,1,3
1,9,99
2,3,5


You can use attribute access to modify an existing element of a `Series` or column of a `DataFrame`, but be careful; if you try to use attribute access to create a new column, it creates a new attribute rather than a new column. In 0.21.0 and later, this will raise a UserWarning:

In [22]:
dfsimple = pd.DataFrame({'one': [1., 2., 3.]})
# 'two' is not an existing column, so this only sets a plain Python attribute
# on the object (pandas emits a UserWarning); the frame still has the single
# column 'one', as the next cell shows.
dfsimple.two = [4, 5, 6]

  dfsimple.two = [4, 5, 6]


In [23]:
dfsimple

Unnamed: 0,one
0,1.0
1,2.0
2,3.0


In [24]:
dfsimple.loc[:, 'two'] = [4, 5, 6]
dfsimple

Unnamed: 0,one,two
0,1.0,4
1,2.0,5
2,3.0,6


### Slicing ranges
The most robust and consistent way of slicing ranges along arbitrary axes is described in the Selection by Position section detailing the `.iloc` method. For now, we explain the semantics of slicing using the `[]` operator.

With Series, the syntax works exactly as with an `ndarray`, returning a slice of the values and the corresponding labels:

In [25]:
s[:5]

2000-01-01    0.179743
2000-01-02   -1.062634
2000-01-03   -0.494038
2000-01-04    0.128750
2000-01-05   -0.767879
Freq: D, Name: A, dtype: float64

In [26]:
s[::2]

2000-01-01    0.179743
2000-01-03   -0.494038
2000-01-05   -0.767879
2000-01-07    0.888422
Freq: 2D, Name: A, dtype: float64

In [27]:
s[::-1]

2000-01-08    0.051620
2000-01-07    0.888422
2000-01-06    0.647528
2000-01-05   -0.767879
2000-01-04    0.128750
2000-01-03   -0.494038
2000-01-02   -1.062634
2000-01-01    0.179743
Freq: -1D, Name: A, dtype: float64

In [28]:
s2 = s.copy()
s2[:5] = 0
s2

2000-01-01    0.000000
2000-01-02    0.000000
2000-01-03    0.000000
2000-01-04    0.000000
2000-01-05    0.000000
2000-01-06    0.647528
2000-01-07    0.888422
2000-01-08    0.051620
Freq: D, Name: A, dtype: float64

In [29]:
s

2000-01-01    0.179743
2000-01-02   -1.062634
2000-01-03   -0.494038
2000-01-04    0.128750
2000-01-05   -0.767879
2000-01-06    0.647528
2000-01-07    0.888422
2000-01-08    0.051620
Freq: D, Name: A, dtype: float64

In [30]:
# An integer slice inside [] selects rows by position, i.e. it is equivalent to
# df.iloc[1:3]. df.loc[1:3] would raise a TypeError instead, because integers
# are not valid labels for this frame's DatetimeIndex.
df[1:3]

Unnamed: 0,A,B,C,D
2000-01-02,-1.062634,1.490875,-0.793167,-0.519884
2000-01-03,-0.494038,-0.600425,-0.185138,1.084396


In [31]:
# Label slice on the column axis: all rows, columns 'A' through 'B' inclusive.
# Plain [] does not take a (rows, columns) pair, so df[:, 'A':'B'] is an error.
df.loc[:,'A':'B']

Unnamed: 0,A,B
2000-01-01,0.179743,1.385499
2000-01-02,-1.062634,1.490875
2000-01-03,-0.494038,-0.600425
2000-01-04,0.12875,0.630187
2000-01-05,-0.767879,-0.285623
2000-01-06,0.647528,-1.004565
2000-01-07,0.888422,-0.23343
2000-01-08,0.05162,-1.24654


### Selection by label

Whether a copy or a reference is returned for a setting operation, may depend on the context. This is sometimes called `chained assignment` and should be avoided. See [Returning a View versus Copy](https://pandas.pydata.org/docs/user_guide/indexing.html#indexing-view-versus-copy).

`.loc` is strict when you present slicers that are not compatible (or convertible) with the index type. For example using integers in a `DatetimeIndex`. These will raise a `TypeError`.

In [32]:
df1 = pd.DataFrame(np.random.randn(5, 4), columns=list('ABCD'), index=pd.date_range('20130101', periods=5))
df1

Unnamed: 0,A,B,C,D
2013-01-01,-0.808132,0.096742,-0.869192,0.222385
2013-01-02,0.786489,-0.730838,0.570493,0.198743
2013-01-03,-0.046694,1.357539,1.932572,0.709
2013-01-04,-1.088369,-0.655002,0.77419,-1.932454
2013-01-05,-0.440615,-0.015546,-0.539434,0.39329


In [33]:
# Left commented out on purpose: df1 has a DatetimeIndex, and integer slice
# bounds are not convertible to dates, so this line raises a TypeError.
# df1.loc[2:3]

In [34]:
df1.loc['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,0.786489,-0.730838,0.570493,0.198743
2013-01-03,-0.046694,1.357539,1.932572,0.709
2013-01-04,-1.088369,-0.655002,0.77419,-1.932454


String-like slice bounds that can be converted to the type of the index (here, dates) lead to natural slicing.

pandas provides a suite of methods in order to have **purely label based indexing**. This is a strict inclusion based protocol. Every label asked for must be in the index, or a `KeyError` will be raised. When slicing, both the start bound **AND** the stop bound are *included*, if present in the index. Integers are valid labels, but they refer to the label **and not the position**.

The `.loc` attribute is the primary access method. The following are valid inputs:

- A single label, e.g. `5` or `'a'` (Note that `5` is interpreted as a *label* of the index. This use is **not** an integer position along the index.).
- A list or array of labels `['a', 'b', 'c']`.
- A slice object with labels `'a':'f'` (Note that contrary to usual Python slices, **both** the start and the stop are included, when present in the index! See [Slicing with labels](https://pandas.pydata.org/docs/user_guide/indexing.html#indexing-slicing-with-labels)).
- A boolean array.
- A `callable`, see [Selection By Callable](https://pandas.pydata.org/docs/user_guide/indexing.html#indexing-callable).

#### Series

In [35]:
s1 = pd.Series(np.random.randn(6), index=list('abcdef'))
s1

a    0.028645
b   -0.418779
c    1.676830
d   -0.661134
e    0.264905
f    0.324051
dtype: float64

In [36]:
s1.loc['c':]

c    1.676830
d   -0.661134
e    0.264905
f    0.324051
dtype: float64

In [37]:
s1.loc['b']

-0.4187785164967525

In [38]:
s1.loc['c':] = 0 # setting works as well
s1

a    0.028645
b   -0.418779
c    0.000000
d    0.000000
e    0.000000
f    0.000000
dtype: float64

#### DataFrame

In [39]:
df1 = pd.DataFrame(np.random.randn(6, 4), index=list('abcdef'), columns=list('ABCD'))
df1

Unnamed: 0,A,B,C,D
a,-0.975321,-0.041186,0.828947,-1.62022
b,1.736523,-0.40238,0.359824,0.46128
c,0.495238,-0.355306,-0.229614,-1.253922
d,1.639327,-0.098865,0.640208,-1.833469
e,1.740794,0.946087,-0.550159,0.662434
f,-0.715163,0.085444,-1.47415,-0.862083


In [40]:
df1.loc[['a', 'b', 'd'], :]

Unnamed: 0,A,B,C,D
a,-0.975321,-0.041186,0.828947,-1.62022
b,1.736523,-0.40238,0.359824,0.46128
d,1.639327,-0.098865,0.640208,-1.833469


#### Accessing via label slices

In [41]:
df1.loc['d':, 'A':'C']

Unnamed: 0,A,B,C
d,1.639327,-0.098865,0.640208
e,1.740794,0.946087,-0.550159
f,-0.715163,0.085444,-1.47415


##### For getting a cross section using a label (equivalent to `df.xs('a')`):

In [42]:
df1.loc['a']

A   -0.975321
B   -0.041186
C    0.828947
D   -1.620220
Name: a, dtype: float64

##### For getting values with a boolean array:

In [43]:
df1.loc['a'] > 0

A    False
B    False
C     True
D    False
Name: a, dtype: bool

In [44]:
df1.loc[:, df1.loc['a'] > 0]

Unnamed: 0,C
a,0.828947
b,0.359824
c,-0.229614
d,0.640208
e,-0.550159
f,-1.47415


#### Slicing with labels

In [45]:
# The index is deliberately unsorted: labels appear in the order 0, 3, 2, 5, 4.
s = pd.Series(list('abcde'), index=[0, 3, 2, 5, 4])
# A label slice runs from where label 3 sits to where label 5 sits, in index
# order and with both ends included -- so label 2, which lies between them
# positionally, is returned even though 2 is not "between" 3 and 5 numerically.
s.loc[3:5]

3    b
2    c
5    d
dtype: object

If at least one of the two is absent, but the index is sorted, and can be compared against start and stop labels, then slicing will still work as expected, by selecting labels which rank between the two:

In [46]:
s.sort_index()

0    a
2    c
3    b
4    e
5    d
dtype: object

In [47]:
s.sort_index().loc[1:6]

2    c
3    b
4    e
5    d
dtype: object

However, if at least one of the two is absent *and* the index is not sorted, an error will be raised (since doing otherwise would be computationally expensive, as well as potentially ambiguous for mixed type indexes). For instance, in the above example, `s.loc[1:6]` would raise `KeyError`.

### Selection by position
Pandas provides a suite of methods in order to get **purely integer based indexing**. The semantics follow closely Python and NumPy slicing. These are `0-based` indexing. When slicing, the start bound is *included*, while the upper bound is *excluded*. Trying to use a non-integer, even a **valid** label will raise an `IndexError`.

The `.iloc` attribute is the primary access method. The following are valid inputs:

- An integer e.g. `5`.
- A list or array of integers `[4, 3, 0]`.
- A slice object with ints `1:7`.
- A boolean array.
- A `callable`, see [Selection By Callable](https://pandas.pydata.org/docs/user_guide/indexing.html#indexing-callable).

In [48]:
s1 = pd.Series(np.random.randn(5), index=list(range(0, 10, 2)))
s1

0   -0.905174
2   -1.132629
4    0.851132
6   -1.010566
8    1.499911
dtype: float64

In [49]:
s1.iloc[:3]

0   -0.905174
2   -1.132629
4    0.851132
dtype: float64

In [50]:
s1.iloc[3]

-1.010566481651136

In [51]:
df1 = pd.DataFrame(np.random.randn(6, 4), index=list(range(0, 12, 2)), columns=list(range(0, 8, 2)))
df1

Unnamed: 0,0,2,4,6
0,-1.546894,0.825711,0.367957,0.743249
2,-0.511483,-0.549663,-1.228336,-0.340541
4,0.345026,-0.811555,-0.714994,0.761643
6,-1.066365,0.809053,-0.121392,1.428791
8,-1.179249,-0.699363,0.256289,-0.188549
10,-0.60177,0.200383,0.753605,0.73023


In [52]:
df1.iloc[:3] # select via integer slicing

Unnamed: 0,0,2,4,6
0,-1.546894,0.825711,0.367957,0.743249
2,-0.511483,-0.549663,-1.228336,-0.340541
4,0.345026,-0.811555,-0.714994,0.761643


In [53]:
df1.iloc[1:5, 2:4] # select via integer slicing

Unnamed: 0,4,6
2,-1.228336,-0.340541
4,-0.714994,0.761643
6,-0.121392,1.428791
8,0.256289,-0.188549


In [54]:
df1.iloc[[1, 3, 5], [1, 3]] # select via integer list

Unnamed: 0,2,6
2,-0.549663,-0.340541
6,0.809053,1.428791
10,0.200383,0.73023


In [55]:
df1.iloc[1, 1] # this is also equivalent to df1.iat[1,1]

-0.5496627689702008

In [56]:
# Second row by position. Its label in this frame is 2 (the index is
# 0, 2, 4, ...), so this matches df1.xs(2); df1.xs(1) would look up the
# label 1, which does not exist here.
df1.iloc[1]

0   -0.511483
2   -0.549663
4   -1.228336
6   -0.340541
Name: 2, dtype: float64

Out of range slice indexes are handled gracefully just as in Python/Numpy.
...

## Selection by callable
`.loc`, `.iloc`, and also `[]` indexing can accept a `callable` as indexer. The `callable` must be a function with one argument (the calling Series or DataFrame) that returns valid output for indexing.

In [57]:
df1 = pd.DataFrame(np.random.randn(6, 4), index=list('abcdef'), columns=list('ABCD'))
df1

Unnamed: 0,A,B,C,D
a,0.692422,0.103638,-0.37167,-0.060064
b,-0.444296,0.051137,-0.804927,-0.319028
c,-0.221704,-0.068872,-2.112985,0.962354
d,0.448432,0.37951,-0.278311,-0.44026
e,-0.668485,-1.585028,0.813981,-2.151019
f,-0.841005,0.447735,0.558121,1.156918


In [58]:
df1.loc[lambda df: df['A'] > 0, :]

Unnamed: 0,A,B,C,D
a,0.692422,0.103638,-0.37167,-0.060064
d,0.448432,0.37951,-0.278311,-0.44026


In [59]:
df1.loc[:, lambda df: ['A', 'B']]

Unnamed: 0,A,B
a,0.692422,0.103638
b,-0.444296,0.051137
c,-0.221704,-0.068872
d,0.448432,0.37951
e,-0.668485,-1.585028
f,-0.841005,0.447735


In [60]:
df1.iloc[:, lambda df: [0, 1]]

Unnamed: 0,A,B
a,0.692422,0.103638
b,-0.444296,0.051137
c,-0.221704,-0.068872
d,0.448432,0.37951
e,-0.668485,-1.585028
f,-0.841005,0.447735


In [61]:
df1[lambda df: df.columns[0]]

a    0.692422
b   -0.444296
c   -0.221704
d    0.448432
e   -0.668485
f   -0.841005
Name: A, dtype: float64

## Indexing with list with missing labels is deprecated

### Reindexing
The idiomatic way to achieve selecting potentially not-found elements is via `.reindex()`.

## Selecting random samples

In [62]:
s1.sample() # return 1 row

0   -0.905174
dtype: float64

In [63]:
s1.sample(n=3) # number of rows

2   -1.132629
4    0.851132
8    1.499911
dtype: float64

In [64]:
s1.sample(frac=0.5) # fraction of the rows to return

8    1.499911
4    0.851132
dtype: float64

In [65]:
# Sampling with replacement: the same row may be drawn more than once, which
# is what makes n=6 possible on a 5-row Series.
s1.sample(n=6, replace=True)

8    1.499911
6   -1.010566
2   -1.132629
0   -0.905174
0   -0.905174
8    1.499911
dtype: float64

In [66]:
s = pd.Series([0, 1, 2, 3, 4, 5])
# One weight per row, in row order. Rows 0 and 1 carry weight 0 and so can
# never be drawn; row 5 has twice the weight of rows 2-4.
example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4]
s.sample(n=3, weights=example_weights) # weighted draw of 3 rows

3    3
2    2
5    5
dtype: int64

In [67]:
df2 = pd.DataFrame({'col1': [9, 8, 7, 6], 'weight_column': [0.5, 0.4, 0.1, 0]})
df2.sample(n=3, weights='weight_column')

Unnamed: 0,col1,weight_column
1,8,0.4
0,9,0.5
2,7,0.1


In [68]:
df3 = pd.DataFrame({'col1': [1, 2, 3], 'col2': [2, 3, 4]})
df3.sample(n=1, axis=1) # sample columns instead of rows

Unnamed: 0,col2
0,2
1,3
2,4


In [69]:
df3.sample(n=2, random_state=2) # With a given seed, the sample will always draw the same rows.

Unnamed: 0,col1,col2
2,3,4
1,2,3


In [70]:
df3.sample(n=2, random_state=2)

Unnamed: 0,col1,col2
2,3,4
1,2,3


## Setting with enlargement
The `.loc/[]` operations can perform enlargement when setting a non-existent key for that axis.

In [71]:
se = pd.Series([1, 2, 3])
se

0    1
1    2
2    3
dtype: int64

In [72]:
se[5] = 5
se

0    1
1    2
2    3
5    5
dtype: int64

In [73]:
dfi = pd.DataFrame(np.arange(6).reshape(3, 2), columns=['A', 'B'])

In [74]:
dfi

Unnamed: 0,A,B
0,0,1
1,2,3
2,4,5


##### A DataFrame can be enlarged on either axis via `.loc`.

In [75]:
dfi.loc[:, 'C'] = dfi.loc[:, 'A']
dfi

Unnamed: 0,A,B,C
0,0,1,0
1,2,3,2
2,4,5,4


## Fast scalar value getting and setting
Since indexing with `[]` must handle a lot of cases (single-label access, slicing, boolean indexing, etc.), it has a bit of overhead in order to figure out what you’re asking for. If you only want to access a scalar value, the fastest way is to use the `at` and `iat` methods, which are implemented on all of the data structures.

Similarly to `loc`, `at` provides **label** based scalar lookups, while `iat` provides **integer** based lookups analogously to `iloc`.

In [76]:
s

0    0
1    1
2    2
3    3
4    4
5    5
dtype: int64

In [77]:
s.iat[5]

5

In [78]:
dates = pd.date_range('1/1/2000', periods=8)
df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
2000-01-01,-0.106123,-0.242851,-0.83755,-0.830941
2000-01-02,-1.223073,0.07031,-0.111569,-1.842575
2000-01-03,0.935919,2.101385,0.167071,-0.76422
2000-01-04,0.185461,0.094697,-0.44936,-0.650443
2000-01-05,0.06279,-0.765106,-1.163104,0.730365
2000-01-06,0.515753,0.234497,-0.6621,-0.513891
2000-01-07,-1.317184,1.383281,-0.437798,0.066571
2000-01-08,0.689968,-2.497472,0.594161,0.697898


In [79]:
df.at[dates[5], 'A']

0.5157530295822756

In [80]:
# 'E' is not an existing column: .at creates it, leaving NaN in every other row.
df.at[dates[5], 'E'] = 7
# Purely positional write: row position 3, column position 0
# (i.e. 2000-01-04, column 'A').
df.iat[3, 0] = 7

`at` may enlarge the object in-place as above if the indexer is missing.

In [81]:
# Neither the row label (one day past the last date) nor the column label
# (the integer 0) exists yet, so this single assignment adds both a new row
# and a new column; every other cell in them is NaN.
df.at[dates[-1] + pd.Timedelta('1 day'), 0] = 7
df

Unnamed: 0,A,B,C,D,E,0
2000-01-01,-0.106123,-0.242851,-0.83755,-0.830941,,
2000-01-02,-1.223073,0.07031,-0.111569,-1.842575,,
2000-01-03,0.935919,2.101385,0.167071,-0.76422,,
2000-01-04,7.0,0.094697,-0.44936,-0.650443,,
2000-01-05,0.06279,-0.765106,-1.163104,0.730365,,
2000-01-06,0.515753,0.234497,-0.6621,-0.513891,7.0,
2000-01-07,-1.317184,1.383281,-0.437798,0.066571,,
2000-01-08,0.689968,-2.497472,0.594161,0.697898,,
2000-01-09,,,,,,7.0


## Boolean indexing
Another common operation is the use of boolean vectors to filter the data. The operators are: `|` for `or`, `&` for `and`, and `~` for `not`. These **must** be grouped by using parentheses, since by default Python will evaluate an expression such as `df['A'] > 2 & df['B'] < 3` as `df['A'] > (2 & df['B']) < 3`, while the desired evaluation order is `(df['A'] > 2) & (df['B'] < 3)`.

In [82]:
s = pd.Series(range(-3, 4))
s

0   -3
1   -2
2   -1
3    0
4    1
5    2
6    3
dtype: int64

In [83]:
s[s > 0]

4    1
5    2
6    3
dtype: int64

In [84]:
s[(s < -1) | (s > 0.5)]

0   -3
1   -2
4    1
5    2
6    3
dtype: int64

In [85]:
s[~(s < 0)]

3    0
4    1
5    2
6    3
dtype: int64

In [86]:
df[df['A'] > 0]

Unnamed: 0,A,B,C,D,E,0
2000-01-03,0.935919,2.101385,0.167071,-0.76422,,
2000-01-04,7.0,0.094697,-0.44936,-0.650443,,
2000-01-05,0.06279,-0.765106,-1.163104,0.730365,,
2000-01-06,0.515753,0.234497,-0.6621,-0.513891,7.0,
2000-01-08,0.689968,-2.497472,0.594161,0.697898,,


In [87]:
# Demo frame for the boolean-filter examples: two string columns ('a', 'b')
# and a float column 'c' holding seven standard-normal draws.
df2 = pd.DataFrame(
    data={
        'a': ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
        'b': ['x', 'y', 'y', 'x', 'y', 'x', 'x'],
        'c': np.random.randn(7),
    }
)
df2

Unnamed: 0,a,b,c
0,one,x,0.571531
1,one,y,-0.200323
2,two,y,1.010745
3,three,x,-0.369077
4,two,y,0.750285
5,one,x,-0.951196
6,six,x,1.563665


##### `map`

In [88]:
criterion = df2['a'].map(lambda x: x.startswith('t'))
df2[criterion]

Unnamed: 0,a,b,c
2,two,y,1.010745
3,three,x,-0.369077
4,two,y,0.750285


In [89]:
df2[[x.startswith('t') for x in df2['a']]] # equivalent but slower

Unnamed: 0,a,b,c
2,two,y,1.010745
3,three,x,-0.369077
4,two,y,0.750285


In [90]:
df2[criterion & (df2['b'] == 'x')]

Unnamed: 0,a,b,c
3,three,x,-0.369077


In [91]:
df2.loc[criterion & (df2['b'] == 'x'), 'b':'c']

Unnamed: 0,b,c
3,x,-0.369077


## Indexing with isin

### Series
Consider the [`isin()`](https://pandas.pydata.org/docs/reference/api/pandas.Series.isin.html#pandas.Series.isin) method of `Series`, which returns a boolean vector that is true wherever the `Series` elements exist in the passed list. This allows you to select rows where one or more columns have values you want.

In [92]:
s = pd.Series(np.arange(5), index=np.arange(5)[::-1], dtype='int64')
s

4    0
3    1
2    2
1    3
0    4
dtype: int64

In [93]:
s.isin([2, 4, 6])

4    False
3    False
2     True
1    False
0     True
dtype: bool

In [94]:
s[s.isin([2, 4, 6])]

2    2
0    4
dtype: int64

The same method is available for `Index` objects and is useful for the cases when you don’t know which of the sought labels are in fact present:

In [95]:
s[s.index.isin([2, 4, 6])]

4    0
2    2
dtype: int64

In [96]:
s.reindex([2, 4, 6]) # compare

2    2.0
4    0.0
6    NaN
dtype: float64

In addition to that, `MultiIndex` allows selecting a separate level to use in the membership check:

In [97]:
s_mi = pd.Series(np.arange(6), index=pd.MultiIndex.from_product([[0, 1], ['a', 'b', 'c']]))
s_mi

0  a    0
   b    1
   c    2
1  a    3
   b    4
   c    5
dtype: int32

In [98]:
s_mi.iloc[s_mi.index.isin([(1, 'a'), (2, 'b'), (0, 'c')])]

0  c    2
1  a    3
dtype: int32

In [99]:
s_mi.iloc[s_mi.index.isin(['a', 'c', 'e'], level=1)]

0  a    0
   c    2
1  a    3
   c    5
dtype: int32

### DataFrame
DataFrame also has an [`isin()`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.isin.html#pandas.DataFrame.isin) method. When calling `isin`, pass a set of values as either an **array** or **dict**. If values is an array, `isin` returns a DataFrame of booleans that is the same shape as the original DataFrame, with True wherever the element is in the sequence of values.

In [100]:
# Lookup values for the isin() call in the next cell: a mix of strings and ints.
values = ['a', 'b', 1, 3]
# Demo frame: one integer column and two string columns.
df = pd.DataFrame(
    data={
        'vals': [1, 2, 3, 4],
        'ids': ['a', 'b', 'f', 'n'],
        'ids2': ['a', 'n', 'c', 'n'],
    }
)
df

Unnamed: 0,vals,ids,ids2
0,1,a,a
1,2,b,n
2,3,f,c
3,4,n,n


In [101]:
df.isin(values)

Unnamed: 0,vals,ids,ids2
0,True,True,True
1,False,True,False
2,True,False,False
3,False,False,False


In [102]:
values = {'ids': ['a', 'b'], 'vals': [1, 3]}
df.isin(values)

Unnamed: 0,vals,ids,ids2
0,True,True,False
1,False,True,False
2,True,False,False
3,False,False,False


Combine DataFrame’s `isin` with the `any()` and `all()` methods to quickly select subsets of your data that meet given criteria. To select a row where each column meets its own criterion:

In [103]:
values = {'ids': ['a', 'b'], 'ids2': ['a', 'c'], 'vals': [1, 3]}
pre_row_mask = df.isin(values)
df[pre_row_mask]

Unnamed: 0,vals,ids,ids2
0,1.0,a,a
1,,b,
2,3.0,,c
3,,,


In [104]:
# Collapse each row of the boolean frame to a single flag. axis=1 reduces
# *across the columns*, giving one value per row: True only when every column
# in that row matched. The axis is passed by keyword so the call is explicit
# and does not rely on positional-argument handling.
row_mask = pre_row_mask.all(axis=1)
row_mask

0     True
1    False
2    False
3    False
dtype: bool

In [105]:
df[row_mask]

Unnamed: 0,vals,ids,ids2
0,1,a,a


## The `where()` Method and Masking

Selecting values from a Series with a boolean vector generally returns a subset of the data. To guarantee that selection output has the same shape as the original data, you can use the `where` method in `Series` and `DataFrame`.

To return only the selected rows:

In [106]:
s[s > 0]

3    1
2    2
1    3
0    4
dtype: int64

In [107]:
s.where(s > 0)

4    NaN
3    1.0
2    2.0
1    3.0
0    4.0
dtype: float64

In [108]:
dates = pd.date_range('1/1/2000', periods=8)
df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
2000-01-01,-0.505924,0.933897,1.221537,-1.018204
2000-01-02,-1.269714,-0.718217,1.133062,0.610536
2000-01-03,-0.426731,-0.91961,0.142612,-1.664802
2000-01-04,-1.063585,-0.902131,-0.699961,0.068319
2000-01-05,1.154505,-0.378079,-0.388031,1.4052
2000-01-06,0.489226,0.611077,-0.664266,1.122127
2000-01-07,-1.488729,0.443886,1.277311,0.72642
2000-01-08,-0.370684,-0.373328,1.202547,-0.18


In [109]:
df[df < 0]

Unnamed: 0,A,B,C,D
2000-01-01,-0.505924,,,-1.018204
2000-01-02,-1.269714,-0.718217,,
2000-01-03,-0.426731,-0.91961,,-1.664802
2000-01-04,-1.063585,-0.902131,-0.699961,
2000-01-05,,-0.378079,-0.388031,
2000-01-06,,,-0.664266,
2000-01-07,-1.488729,,,
2000-01-08,-0.370684,-0.373328,,-0.18


In addition, `where` takes an optional `other` argument for replacement of values where the condition is False, in the returned copy.

In [110]:
df.where(df < 0, -df)

Unnamed: 0,A,B,C,D
2000-01-01,-0.505924,-0.933897,-1.221537,-1.018204
2000-01-02,-1.269714,-0.718217,-1.133062,-0.610536
2000-01-03,-0.426731,-0.91961,-0.142612,-1.664802
2000-01-04,-1.063585,-0.902131,-0.699961,-0.068319
2000-01-05,-1.154505,-0.378079,-0.388031,-1.4052
2000-01-06,-0.489226,-0.611077,-0.664266,-1.122127
2000-01-07,-1.488729,-0.443886,-1.277311,-0.72642
2000-01-08,-0.370684,-0.373328,-1.202547,-0.18


You may wish to set values based on some boolean criteria. This can be done intuitively like so:

In [111]:
s2 = s.copy()
s2[s2 < 0] = 0
s2

4    0
3    1
2    2
1    3
0    4
dtype: int64

In [112]:
df2 = df.copy()
df2[df2 < 0] = 0
df2

Unnamed: 0,A,B,C,D
2000-01-01,0.0,0.933897,1.221537,0.0
2000-01-02,0.0,0.0,1.133062,0.610536
2000-01-03,0.0,0.0,0.142612,0.0
2000-01-04,0.0,0.0,0.0,0.068319
2000-01-05,1.154505,0.0,0.0,1.4052
2000-01-06,0.489226,0.611077,0.0,1.122127
2000-01-07,0.0,0.443886,1.277311,0.72642
2000-01-08,0.0,0.0,1.202547,0.0


By default, `where` returns a modified copy of the data. There is an optional parameter `inplace` so that the original data can be modified without creating a copy:

In [113]:
df_orig = df.copy()
df_orig.where(df > 0, -df, inplace=True)
df_orig

Unnamed: 0,A,B,C,D
2000-01-01,0.505924,0.933897,1.221537,1.018204
2000-01-02,1.269714,0.718217,1.133062,0.610536
2000-01-03,0.426731,0.91961,0.142612,1.664802
2000-01-04,1.063585,0.902131,0.699961,0.068319
2000-01-05,1.154505,0.378079,0.388031,1.4052
2000-01-06,0.489226,0.611077,0.664266,1.122127
2000-01-07,1.488729,0.443886,1.277311,0.72642
2000-01-08,0.370684,0.373328,1.202547,0.18


The signature for [`DataFrame.where()`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.where.html#pandas.DataFrame.where) differs from [`numpy.where()`](https://numpy.org/doc/stable/reference/generated/numpy.where.html#numpy.where). Roughly `df1.where(m, df2)` is equivalent to `np.where(m, df1, df2)`.

In [114]:
df.where(df < 0, -df) == np.where(df < 0, df, -df)

Unnamed: 0,A,B,C,D
2000-01-01,True,True,True,True
2000-01-02,True,True,True,True
2000-01-03,True,True,True,True
2000-01-04,True,True,True,True
2000-01-05,True,True,True,True
2000-01-06,True,True,True,True
2000-01-07,True,True,True,True
2000-01-08,True,True,True,True
