In [187]:
import pandas as pd
import numpy as np

In [188]:
from pandas import Series, DataFrame

In [189]:
# Two-element Series built from a plain list; printed in its text form.
kakao = Series(data=[100, 200])
print(kakao)

0    100
1    200
dtype: int64


In [190]:
# Column name -> list of values; each key becomes one DataFrame column.
raw_data = {
    'col0': [1, 2, 3, 4],
    'col1': [10, 20, 30, 40],
    'col2': [100, 200, 300, 400],
}
data = DataFrame(data=raw_data)
print(data)

   col0  col1  col2
0     1    10   100
1     2    20   200
2     3    30   300
3     4    40   400


In [191]:
date = ['16.02.29', '16.02.26', '16.02.23', '16.02.27']
data1 = DataFrame(raw_data, index=date)
print(data1)

          col0  col1  col2
16.02.29     1    10   100
16.02.26     2    20   200
16.02.23     3    30   300
16.02.27     4    40   400


In [192]:
day_data1 = data1.loc['16.02.29']
print(day_data1)

col0      1
col1     10
col2    100
Name: 16.02.29, dtype: int64


In [193]:
col1 = data1['col1']
print(col1)

16.02.29    10
16.02.26    20
16.02.23    30
16.02.27    40
Name: col1, dtype: int64


In [194]:
print(data1.columns)
print(data1.index)

Index(['col0', 'col1', 'col2'], dtype='object')
Index(['16.02.29', '16.02.26', '16.02.23', '16.02.27'], dtype='object')


In [195]:
# Passing `columns=` together with an existing DataFrame selects columns by
# name; none of these names exist in data1 (its columns are col0..col2), so
# every requested column comes back filled with NaN. This does not rename.
# NOTE(review): 'Samung' looks like a typo for 'Samsung' — confirm intent.
data2 = DataFrame(data1, columns = ['Samung', 'Lg', 'Lotte'])
print(data2)

          Samung  Lg  Lotte
16.02.29     NaN NaN    NaN
16.02.26     NaN NaN    NaN
16.02.23     NaN NaN    NaN
16.02.27     NaN NaN    NaN


## Indexing and selecting data
[User guide](https://pandas.pydata.org/docs/user_guide/indexing.html)

Getting values from an object with multi-axes selection uses the following notation (using `.loc` as an example, but the following applies to `.iloc` as well). Any of the axes accessors may be the null slice `:`. Axes left out of the specification are assumed to be `:`, e.g. `p.loc['a']` is equivalent to `p.loc['a', :]`.

| Object Type | Indexers                             |
| :---------- | :----------------------------------- |
| Series      | `s.loc[indexer]`                     |
| DataFrame   | `df.loc[row_indexer,column_indexer]` |



As mentioned when introducing the data structures in the [last section](https://pandas.pydata.org/docs/user_guide/basics.html#basics), the primary function of indexing with `[]` (a.k.a. `__getitem__` for those familiar with implementing class behavior in Python) is selecting out lower-dimensional slices. The following table shows return type values when indexing pandas objects with `[]`:

| Object Type | Selection        | Return Value Type                 |
| :---------- | :--------------- | :-------------------------------- |
| Series      | `series[label]`  | scalar value                      |
| DataFrame   | `frame[colname]` | `Series` corresponding to colname |

In [196]:
dates = pd.date_range('1/1/2000', periods=8)
df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
2000-01-01,1.503471,-1.622848,-0.410774,1.561514
2000-01-02,-1.406371,0.163238,-0.437943,-0.021024
2000-01-03,-0.811605,-1.240301,0.986616,1.262835
2000-01-04,-0.736748,1.358963,1.584203,-2.55584
2000-01-05,3.37023,-1.53717,2.938057,-0.135548
2000-01-06,1.695878,0.942871,0.565014,-0.874478
2000-01-07,0.217197,0.44756,-1.909422,0.419388
2000-01-08,-0.818211,-0.339447,-0.132514,0.519776


In [197]:
s = df['A']
s[dates[5]]

1.6958775832143007

In [198]:
df[['B', 'A']] = df[['A', 'B']]
df

Unnamed: 0,A,B,C,D
2000-01-01,-1.622848,1.503471,-0.410774,1.561514
2000-01-02,0.163238,-1.406371,-0.437943,-0.021024
2000-01-03,-1.240301,-0.811605,0.986616,1.262835
2000-01-04,1.358963,-0.736748,1.584203,-2.55584
2000-01-05,-1.53717,3.37023,2.938057,-0.135548
2000-01-06,0.942871,1.695878,0.565014,-0.874478
2000-01-07,0.44756,0.217197,-1.909422,0.419388
2000-01-08,-0.339447,-0.818211,-0.132514,0.519776


In [199]:
df[['A', 'B']]

Unnamed: 0,A,B
2000-01-01,-1.622848,1.503471
2000-01-02,0.163238,-1.406371
2000-01-03,-1.240301,-0.811605
2000-01-04,1.358963,-0.736748
2000-01-05,-1.53717,3.37023
2000-01-06,0.942871,1.695878
2000-01-07,0.44756,0.217197
2000-01-08,-0.339447,-0.818211


In [200]:
df.loc[:, ['B', 'A']] = df[['A', 'B']]
df[['A', 'B']]

Unnamed: 0,A,B
2000-01-01,-1.622848,1.503471
2000-01-02,0.163238,-1.406371
2000-01-03,-1.240301,-0.811605
2000-01-04,1.358963,-0.736748
2000-01-05,-1.53717,3.37023
2000-01-06,0.942871,1.695878
2000-01-07,0.44756,0.217197
2000-01-08,-0.339447,-0.818211


pandas aligns all axes when setting `Series` and `DataFrame` values through `.loc` and `.iloc`.

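The `.loc` assignment above does not modify `df`, because the columns are aligned by label before the values are assigned. To actually swap the column values through `.loc`, assign the raw values instead: `df.loc[:, ['B', 'A']] = df[['A', 'B']].to_numpy()`.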

### Attribute access
You may access an index on a `Series` or column on a `DataFrame` directly as an attribute:

In [201]:
sa = pd.Series([1, 2, 3], index=list('abc'))
dfa = df.copy()

In [202]:
sa.b

2

In [203]:
dfa.A

2000-01-01   -1.622848
2000-01-02    0.163238
2000-01-03   -1.240301
2000-01-04    1.358963
2000-01-05   -1.537170
2000-01-06    0.942871
2000-01-07    0.447560
2000-01-08   -0.339447
Freq: D, Name: A, dtype: float64

In [204]:
sa.a = 5
sa

a    5
b    2
c    3
dtype: int64

In [205]:
dfa.A = list(range(len(dfa.index)))   # ok if A already exists
dfa

Unnamed: 0,A,B,C,D
2000-01-01,0,1.503471,-0.410774,1.561514
2000-01-02,1,-1.406371,-0.437943,-0.021024
2000-01-03,2,-0.811605,0.986616,1.262835
2000-01-04,3,-0.736748,1.584203,-2.55584
2000-01-05,4,3.37023,2.938057,-0.135548
2000-01-06,5,1.695878,0.565014,-0.874478
2000-01-07,6,0.217197,-1.909422,0.419388
2000-01-08,7,-0.818211,-0.132514,0.519776


In [206]:
dfa['F'] = list(range(len(dfa.index)))  # use this form to create a new column
dfa

Unnamed: 0,A,B,C,D,F
2000-01-01,0,1.503471,-0.410774,1.561514,0
2000-01-02,1,-1.406371,-0.437943,-0.021024,1
2000-01-03,2,-0.811605,0.986616,1.262835,2
2000-01-04,3,-0.736748,1.584203,-2.55584,3
2000-01-05,4,3.37023,2.938057,-0.135548,4
2000-01-06,5,1.695878,0.565014,-0.874478,5
2000-01-07,6,0.217197,-1.909422,0.419388,6
2000-01-08,7,-0.818211,-0.132514,0.519776,7


You can also assign a `dict` to a row of a `DataFrame`:

In [207]:
x = pd.DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]})
x.iloc[1] = {'x': 9, 'y': 99}
x

Unnamed: 0,x,y
0,1,3
1,9,99
2,3,5


You can use attribute access to modify an existing element of a `Series` or column of a `DataFrame`, but be careful; if you try to use attribute access to create a new column, it creates a new attribute rather than a new column. In 0.21.0 and later, this will raise a UserWarning:

In [208]:
dfsimple = pd.DataFrame({'one': [1., 2., 3.]})
dfsimple.two = [4, 5, 6]

  dfsimple.two = [4, 5, 6]


In [209]:
dfsimple

Unnamed: 0,one
0,1.0
1,2.0
2,3.0


In [210]:
dfsimple.loc[:, 'two'] = [4, 5, 6]
dfsimple

Unnamed: 0,one,two
0,1.0,4
1,2.0,5
2,3.0,6


### Slicing ranges
The most robust and consistent way of slicing ranges along arbitrary axes is described in the Selection by Position section detailing the `.iloc` method. For now, we explain the semantics of slicing using the `[]` operator.

With Series, the syntax works exactly as with an `ndarray`, returning a slice of the values and the corresponding labels:

In [211]:
s[:5]

2000-01-01   -1.622848
2000-01-02    0.163238
2000-01-03   -1.240301
2000-01-04    1.358963
2000-01-05   -1.537170
Freq: D, Name: A, dtype: float64

In [212]:
s[::2]

2000-01-01   -1.622848
2000-01-03   -1.240301
2000-01-05   -1.537170
2000-01-07    0.447560
Freq: 2D, Name: A, dtype: float64

In [213]:
s[::-1]

2000-01-08   -0.339447
2000-01-07    0.447560
2000-01-06    0.942871
2000-01-05   -1.537170
2000-01-04    1.358963
2000-01-03   -1.240301
2000-01-02    0.163238
2000-01-01   -1.622848
Freq: -1D, Name: A, dtype: float64

In [214]:
s2 = s.copy()
s2[:5] = 0
s2

2000-01-01    0.000000
2000-01-02    0.000000
2000-01-03    0.000000
2000-01-04    0.000000
2000-01-05    0.000000
2000-01-06    0.942871
2000-01-07    0.447560
2000-01-08   -0.339447
Freq: D, Name: A, dtype: float64

In [215]:
s

2000-01-01   -1.622848
2000-01-02    0.163238
2000-01-03   -1.240301
2000-01-04    1.358963
2000-01-05   -1.537170
2000-01-06    0.942871
2000-01-07    0.447560
2000-01-08   -0.339447
Freq: D, Name: A, dtype: float64

### Selection by label

Whether a copy or a reference is returned for a setting operation may depend on the context. This is sometimes called `chained assignment` and should be avoided. See [Returning a View versus Copy](https://pandas.pydata.org/docs/user_guide/indexing.html#indexing-view-versus-copy).

`.loc` is strict when you present slicers that are not compatible (or convertible) with the index type. For example using integers in a `DatetimeIndex`. These will raise a `TypeError`.

In [216]:
df1 = pd.DataFrame(np.random.randn(5, 4), columns=list('ABCD'), index=pd.date_range('20130101', periods=5))
df1

Unnamed: 0,A,B,C,D
2013-01-01,0.298727,-0.518083,2.158221,-0.530204
2013-01-02,1.69365,-0.440852,0.114095,-1.636239
2013-01-03,0.371212,-0.046117,2.060849,2.114522
2013-01-04,-1.463915,-1.409276,0.695796,0.002127
2013-01-05,2.785696,0.035994,0.258815,0.822538


In [217]:
# df1.loc[2:3]  # would raise TypeError: integer slice bounds are not compatible with df1's DatetimeIndex

In [218]:
df1.loc['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,1.69365,-0.440852,0.114095,-1.636239
2013-01-03,0.371212,-0.046117,2.060849,2.114522
2013-01-04,-1.463915,-1.409276,0.695796,0.002127


String-like slice bounds can be converted to the type of the index, which leads to natural slicing.

pandas provides a suite of methods in order to have **purely label based indexing**. This is a strict inclusion based protocol. Every label asked for must be in the index, or a `KeyError` will be raised. When slicing, both the start bound **AND** the stop bound are *included*, if present in the index. Integers are valid labels, but they refer to the label **and not the position**.

The `.loc` attribute is the primary access method. The following are valid inputs:

- A single label, e.g. `5` or `'a'` (Note that `5` is interpreted as a *label* of the index. This use is **not** an integer position along the index.).
- A list or array of labels `['a', 'b', 'c']`.
- A slice object with labels `'a':'f'` (Note that contrary to usual Python slices, **both** the start and the stop are included, when present in the index! See [Slicing with labels](https://pandas.pydata.org/docs/user_guide/indexing.html#indexing-slicing-with-labels).)
- A boolean array.
- A `callable`, see [Selection By Callable](https://pandas.pydata.org/docs/user_guide/indexing.html#indexing-callable).

#### Series

In [219]:
s1 = pd.Series(np.random.randn(6), index=list('abcdef'))
s1

a    0.788124
b    0.305227
c    1.434415
d    1.394942
e   -0.591324
f   -0.954788
dtype: float64

In [220]:
s1.loc['c':]

c    1.434415
d    1.394942
e   -0.591324
f   -0.954788
dtype: float64

In [221]:
s1.loc['b']

0.30522680342917474

In [222]:
s1.loc['c':] = 0 # setting works as well
s1

a    0.788124
b    0.305227
c    0.000000
d    0.000000
e    0.000000
f    0.000000
dtype: float64

#### DataFrame

In [223]:
df1 = pd.DataFrame(np.random.randn(6, 4), index=list('abcdef'), columns=list('ABCD'))
df1

Unnamed: 0,A,B,C,D
a,-0.151665,-0.351518,0.483486,-0.084406
b,0.674094,1.015822,0.396085,0.16648
c,0.06236,0.252784,0.018614,-0.043452
d,0.686979,-0.252599,0.289425,0.235767
e,0.589179,-0.486538,-0.78431,-0.162661
f,0.320083,-1.657933,1.872621,0.685124


In [224]:
df1.loc[['a', 'b', 'd'], :]

Unnamed: 0,A,B,C,D
a,-0.151665,-0.351518,0.483486,-0.084406
b,0.674094,1.015822,0.396085,0.16648
d,0.686979,-0.252599,0.289425,0.235767


#### Accessing via label slices

In [225]:
df1.loc['d':, 'A':'C']

Unnamed: 0,A,B,C
d,0.686979,-0.252599,0.289425
e,0.589179,-0.486538,-0.78431
f,0.320083,-1.657933,1.872621


##### For getting a cross section using a label (equivalent to `df.xs('a')`):

In [226]:
df1.loc['a']

A   -0.151665
B   -0.351518
C    0.483486
D   -0.084406
Name: a, dtype: float64

##### For getting values with a boolean array:

In [227]:
df1.loc['a'] > 0

A    False
B    False
C     True
D    False
Name: a, dtype: bool

In [228]:
df1.loc[:, df1.loc['a'] > 0]

Unnamed: 0,C
a,0.483486
b,0.396085
c,0.018614
d,0.289425
e,-0.78431
f,1.872621


#### Slicing with labels

In [229]:
s = pd.Series(list('abcde'), index=[0, 3, 2, 5, 4])
s.loc[3:5] # elements located between the two (including them)

3    b
2    c
5    d
dtype: object

If at least one of the two is absent, but the index is sorted, and can be compared against start and stop labels, then slicing will still work as expected, by selecting labels which rank between the two:

In [230]:
s.sort_index()

0    a
2    c
3    b
4    e
5    d
dtype: object

In [231]:
s.sort_index().loc[1:6]

2    c
3    b
4    e
5    d
dtype: object

However, if at least one of the two is absent *and* the index is not sorted, an error will be raised (since doing otherwise would be computationally expensive, as well as potentially ambiguous for mixed type indexes). For instance, in the above example, `s.loc[1:6]` would raise `KeyError`.

### Selection by position
pandas provides a suite of methods in order to get **purely integer based indexing**. The semantics closely follow Python and NumPy slicing. Indexing is `0-based`. When slicing, the start bound is *included*, while the upper bound is *excluded*. Trying to use a non-integer, even a **valid** label, will raise an `IndexError`.

The `.iloc` attribute is the primary access method. The following are valid inputs:

- An integer e.g. `5`.
- A list or array of integers `[4, 3, 0]`.
- A slice object with ints `1:7`.
- A boolean array.
- A `callable`, see [Selection By Callable](https://pandas.pydata.org/docs/user_guide/indexing.html#indexing-callable).

In [232]:
s1 = pd.Series(np.random.randn(5), index=list(range(0, 10, 2)))
s1

0   -1.128049
2    1.345869
4   -0.364978
6   -0.860497
8    1.303500
dtype: float64

In [233]:
s1.iloc[:3]

0   -1.128049
2    1.345869
4   -0.364978
dtype: float64

In [234]:
s1.iloc[3]

-0.8604969122000619

In [235]:
df1 = pd.DataFrame(np.random.randn(6, 4), index=list(range(0, 12, 2)), columns=list(range(0, 8, 2)))
df1

Unnamed: 0,0,2,4,6
0,-0.13311,-0.925956,1.283699,1.266148
2,-0.828162,0.170544,-0.017931,0.348646
4,1.266169,0.599522,-0.517913,0.26353
6,0.695776,0.384759,-1.53522,0.975332
8,0.114575,1.74598,-1.749651,-0.942255
10,0.636716,-0.238759,0.752362,1.687683


In [236]:
df1.iloc[:3] # select via integer slicing

Unnamed: 0,0,2,4,6
0,-0.13311,-0.925956,1.283699,1.266148
2,-0.828162,0.170544,-0.017931,0.348646
4,1.266169,0.599522,-0.517913,0.26353


In [237]:
df1.iloc[1:5, 2:4] # select via integer slicing

Unnamed: 0,4,6
2,-0.017931,0.348646
4,-0.517913,0.26353
6,-1.53522,0.975332
8,-1.749651,-0.942255


In [238]:
df1.iloc[[1, 3, 5], [1, 3]] # select via integer list

Unnamed: 0,2,6
2,0.170544,0.348646
6,0.384759,0.975332
10,-0.238759,1.687683


In [239]:
df1.iloc[1, 1] # this is also equivalent to df1.iat[1,1]

0.1705435874787405

In [240]:
df1.iloc[1] # row at integer position 1; its label here is 2, so this matches df1.xs(2), not df1.xs(1)

0   -0.828162
2    0.170544
4   -0.017931
6    0.348646
Name: 2, dtype: float64

Out of range slice indexes are handled gracefully just as in Python/Numpy.
...

## Selection by callable
`.loc`, `.iloc`, and also `[]` indexing can accept a `callable` as indexer. The `callable` must be a function with one argument (the calling Series or DataFrame) that returns valid output for indexing.

In [241]:
df1 = pd.DataFrame(np.random.randn(6, 4), index=list('abcdef'), columns=list('ABCD'))
df1

Unnamed: 0,A,B,C,D
a,1.915339,-0.254338,0.338191,-1.383895
b,-0.802961,-0.964969,0.747671,0.461542
c,0.445075,-1.592439,-0.719715,0.565653
d,0.375511,1.06427,-0.796887,-1.914806
e,0.905219,0.008901,0.64532,-1.183916
f,-0.99138,-1.474367,-0.640465,-0.64344


In [242]:
df1.loc[lambda df: df['A'] > 0, :]

Unnamed: 0,A,B,C,D
a,1.915339,-0.254338,0.338191,-1.383895
c,0.445075,-1.592439,-0.719715,0.565653
d,0.375511,1.06427,-0.796887,-1.914806
e,0.905219,0.008901,0.64532,-1.183916


In [243]:
df1.loc[:, lambda df: ['A', 'B']]

Unnamed: 0,A,B
a,1.915339,-0.254338
b,-0.802961,-0.964969
c,0.445075,-1.592439
d,0.375511,1.06427
e,0.905219,0.008901
f,-0.99138,-1.474367


In [244]:
df1.iloc[:, lambda df: [0, 1]]

Unnamed: 0,A,B
a,1.915339,-0.254338
b,-0.802961,-0.964969
c,0.445075,-1.592439
d,0.375511,1.06427
e,0.905219,0.008901
f,-0.99138,-1.474367


In [245]:
df1[lambda df: df.columns[0]]

a    1.915339
b   -0.802961
c    0.445075
d    0.375511
e    0.905219
f   -0.991380
Name: A, dtype: float64

## Indexing with list with missing labels is deprecated

### Reindexing
The idiomatic way to select elements that may not be present in the index is `.reindex()`.

## Selecting random samples

In [246]:
s1.sample() # return 1 row

6   -0.860497
dtype: float64

In [247]:
s1.sample(n=3) # number of rows

8    1.303500
0   -1.128049
2    1.345869
dtype: float64

In [248]:
s1.sample(frac=0.5) # fraction of the rows

0   -1.128049
8    1.303500
dtype: float64

In [249]:
s1.sample(n=6, replace=True) # sample with replacement: a row may be drawn more than once

6   -0.860497
8    1.303500
2    1.345869
2    1.345869
6   -0.860497
8    1.303500
dtype: float64

In [250]:
# One sampling weight per row of `s`, in the same order as its values.
s = pd.Series([0, 1, 2, 3, 4, 5])
example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4]
s.sample(n=3, weights=example_weights) # draw 3 rows using the per-row sampling weights

5    5
4    4
2    2
dtype: int64

In [251]:
df2 = pd.DataFrame({'col1': [9, 8, 7, 6], 'weight_column': [0.5, 0.4, 0.1, 0]})
df2.sample(n=3, weights='weight_column')

Unnamed: 0,col1,weight_column
0,9,0.5
2,7,0.1
1,8,0.4


In [252]:
df3 = pd.DataFrame({'col1': [1, 2, 3], 'col2': [2, 3, 4]})
df3.sample(n=1, axis=1) # sample columns instead of rows

Unnamed: 0,col1
0,1
1,2
2,3


In [253]:
df3.sample(n=2, random_state=2) # With a given seed, the sample will always draw the same rows.

Unnamed: 0,col1,col2
2,3,4
1,2,3


In [254]:
df3.sample(n=2, random_state=2)

Unnamed: 0,col1,col2
2,3,4
1,2,3


## Setting with enlargement
The `.loc/[]` operations can perform enlargement when setting a non-existent key for that axis.

In [255]:
se = pd.Series([1, 2, 3])
se

0    1
1    2
2    3
dtype: int64

In [256]:
se[5] = 5
se

0    1
1    2
2    3
5    5
dtype: int64

In [257]:
# 3x2 integer frame filled row by row with 0..5, columns labelled A and B.
dfi = pd.DataFrame(data=np.arange(6).reshape(3, 2), columns=list('AB'))

In [258]:
dfi

Unnamed: 0,A,B
0,0,1
1,2,3
2,4,5


##### A DataFrame can be enlarged on either axis via `.loc`.

In [259]:
dfi.loc[:, 'C'] = dfi.loc[:, 'A']
dfi

Unnamed: 0,A,B,C
0,0,1,0
1,2,3,2
2,4,5,4


## Fast scalar value getting and setting
Since indexing with `[]` must handle a lot of cases (single-label access, slicing, boolean indexing, etc.), it has a bit of overhead in order to figure out what you’re asking for. If you only want to access a scalar value, the fastest way is to use the `at` and `iat` methods, which are implemented on all of the data structures.

Similarly to `loc`, `at` provides **label** based scalar lookups, while `iat` provides **integer** based lookups analogously to `iloc`.

In [260]:
s

0    0
1    1
2    2
3    3
4    4
5    5
dtype: int64

In [261]:
s.iat[5]

5

In [262]:
dates = pd.date_range('1/1/2000', periods=8)
df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
2000-01-01,0.380978,-1.575405,-0.913938,0.372412
2000-01-02,-0.827063,0.763635,-0.757578,-0.541201
2000-01-03,0.491724,1.296537,-0.308799,-0.858085
2000-01-04,0.064442,-0.041221,0.976649,-0.096689
2000-01-05,1.634363,2.220815,1.647491,0.00739
2000-01-06,-0.614664,0.119258,-1.375533,-1.45133
2000-01-07,-1.720498,-1.049084,-1.430366,0.202595
2000-01-08,1.858087,1.253253,0.349557,-1.841323


In [263]:
df.at[dates[5], 'A']

-0.6146640229593302

In [264]:
df.at[dates[5], 'E'] = 7
df.iat[3, 0] = 7

`at` may enlarge the object in-place as above if the indexer is missing.

In [265]:
df.at[dates[-1] + pd.Timedelta('1 day'), 0] = 7
df

Unnamed: 0,A,B,C,D,E,0
2000-01-01,0.380978,-1.575405,-0.913938,0.372412,,
2000-01-02,-0.827063,0.763635,-0.757578,-0.541201,,
2000-01-03,0.491724,1.296537,-0.308799,-0.858085,,
2000-01-04,7.0,-0.041221,0.976649,-0.096689,,
2000-01-05,1.634363,2.220815,1.647491,0.00739,,
2000-01-06,-0.614664,0.119258,-1.375533,-1.45133,7.0,
2000-01-07,-1.720498,-1.049084,-1.430366,0.202595,,
2000-01-08,1.858087,1.253253,0.349557,-1.841323,,
2000-01-09,,,,,,7.0


## Boolean indexing
Another common operation is the use of boolean vectors to filter the data. The operators are: `|` for `or`, `&` for `and`, and `~` for `not`. These **must** be grouped by using parentheses, since by default Python will evaluate an expression such as `df['A'] > 2 & df['B'] < 3` as `df['A'] > (2 & df['B']) < 3`, while the desired evaluation order is `(df['A'] > 2) & (df['B'] < 3)`.

In [266]:
s = pd.Series(range(-3, 4))
s

0   -3
1   -2
2   -1
3    0
4    1
5    2
6    3
dtype: int64

In [267]:
s[s > 0]

4    1
5    2
6    3
dtype: int64

In [268]:
s[(s < -1) | (s > 0.5)]

0   -3
1   -2
4    1
5    2
6    3
dtype: int64

In [269]:
s[~(s < 0)]

3    0
4    1
5    2
6    3
dtype: int64

In [270]:
df[df['A'] > 0]

Unnamed: 0,A,B,C,D,E,0
2000-01-01,0.380978,-1.575405,-0.913938,0.372412,,
2000-01-03,0.491724,1.296537,-0.308799,-0.858085,,
2000-01-04,7.0,-0.041221,0.976649,-0.096689,,
2000-01-05,1.634363,2.220815,1.647491,0.00739,,
2000-01-08,1.858087,1.253253,0.349557,-1.841323,,


In [271]:
df2 = pd.DataFrame({
    'a': ['one', 'one', 'two', 'three', 'two', 'one', 'six'],
    'b': ['x', 'y', 'y', 'x', 'y', 'x', 'x'], 
    'c': np.random.randn(7)
    })
df2

Unnamed: 0,a,b,c
0,one,x,0.699751
1,one,y,0.266897
2,two,y,0.137782
3,three,x,-1.367913
4,two,y,0.497038
5,one,x,-0.210373
6,six,x,-1.116244


##### `map`

In [272]:
criterion = df2['a'].map(lambda x: x.startswith('t'))
df2[criterion]

Unnamed: 0,a,b,c
2,two,y,0.137782
3,three,x,-1.367913
4,two,y,0.497038


In [273]:
df2[[x.startswith('t') for x in df2['a']]] # equivalent but slower

Unnamed: 0,a,b,c
2,two,y,0.137782
3,three,x,-1.367913
4,two,y,0.497038


In [274]:
df2[criterion & (df2['b'] == 'x')]

Unnamed: 0,a,b,c
3,three,x,-1.367913


In [275]:
df2.loc[criterion & (df2['b'] == 'x'), 'b':'c']

Unnamed: 0,b,c
3,x,-1.367913


## Indexing with isin

### Series
Consider the [`isin()`](https://pandas.pydata.org/docs/reference/api/pandas.Series.isin.html#pandas.Series.isin) method of `Series`, which returns a boolean vector that is true wherever the `Series` elements exist in the passed list. This allows you to select rows where one or more columns have values you want.

In [276]:
s = pd.Series(np.arange(5), index=np.arange(5)[::-1], dtype='int64')
s

4    0
3    1
2    2
1    3
0    4
dtype: int64

In [277]:
s.isin([2, 4, 6])

4    False
3    False
2     True
1    False
0     True
dtype: bool

In [278]:
s[s.isin([2, 4, 6])]

2    2
0    4
dtype: int64

The same method is available for `Index` objects and is useful for the cases when you don’t know which of the sought labels are in fact present:

In [279]:
s[s.index.isin([2, 4, 6])]

4    0
2    2
dtype: int64

In [280]:
s.reindex([2, 4, 6]) # compare

2    2.0
4    0.0
6    NaN
dtype: float64

In addition to that, `MultiIndex` allows selecting a separate level to use in the membership check:

In [281]:
s_mi = pd.Series(np.arange(6), index=pd.MultiIndex.from_product([[0, 1], ['a', 'b', 'c']]))
s_mi

0  a    0
   b    1
   c    2
1  a    3
   b    4
   c    5
dtype: int32

In [282]:
s_mi.iloc[s_mi.index.isin([(1, 'a'), (2, 'b'), (0, 'c')])]

0  c    2
1  a    3
dtype: int32

In [283]:
s_mi.iloc[s_mi.index.isin(['a', 'c', 'e'], level=1)]

0  a    0
   c    2
1  a    3
   c    5
dtype: int32

### DataFrame
DataFrame also has an [`isin()`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.isin.html#pandas.DataFrame.isin) method. When calling `isin`, pass a set of values as either an **array** or **dict**. If values is an array, `isin` returns a DataFrame of booleans that is the same shape as the original DataFrame, with True wherever the element is in the sequence of values.

In [284]:
# Flat list of candidate values to test membership against.
values = ['a', 'b', 1, 3]
# Small mixed-dtype frame: one numeric column and two string columns.
df = pd.DataFrame({
    'vals': [1, 2, 3, 4],
    'ids': ['a', 'b', 'f', 'n'],
    'ids2': ['a', 'n', 'c', 'n'],
})
df

Unnamed: 0,vals,ids,ids2
0,1,a,a
1,2,b,n
2,3,f,c
3,4,n,n


In [285]:
df.isin(values)

Unnamed: 0,vals,ids,ids2
0,True,True,True
1,False,True,False
2,True,False,False
3,False,False,False


In [286]:
values = {'ids': ['a', 'b'], 'vals': [1, 3]}
df.isin(values)

Unnamed: 0,vals,ids,ids2
0,True,True,False
1,False,True,False
2,True,False,False
3,False,False,False


Combine DataFrame’s `isin` with the `any()` and `all()` methods to quickly select subsets of your data that meet given criteria. To select a row where each column meets its own criterion:

In [287]:
values = {'ids': ['a', 'b'], 'ids2': ['a', 'c'], 'vals': [1, 3]}
pre_row_mask = df.isin(values)
df[pre_row_mask]

Unnamed: 0,vals,ids,ids2
0,1.0,a,a
1,,b,
2,3.0,,c
3,,,


In [288]:
# Reduce across the columns (axis=1): the result has one boolean per row,
# True only where every column of that row matched in pre_row_mask.
# The axis is passed by keyword so the call does not rely on argument position.
row_mask = pre_row_mask.all(axis=1)
row_mask

0     True
1    False
2    False
3    False
dtype: bool

In [289]:
df[row_mask]

Unnamed: 0,vals,ids,ids2
0,1,a,a


## The `where()` Method and Masking

Selecting values from a Series with a boolean vector generally returns a subset of the data. To guarantee that selection output has the same shape as the original data, you can use the `where` method in `Series` and `DataFrame`.

To return only the selected rows:

In [290]:
s[s > 0]

3    1
2    2
1    3
0    4
dtype: int64

In [291]:
s.where(s > 0)

4    NaN
3    1.0
2    2.0
1    3.0
0    4.0
dtype: float64

In [293]:
dates = pd.date_range('1/1/2000', periods=8)
df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
2000-01-01,0.476236,-0.010844,1.911654,0.859734
2000-01-02,-0.427821,1.062463,-0.042227,-0.791288
2000-01-03,0.13746,0.831306,1.013059,-0.002273
2000-01-04,-1.161963,-1.295759,-0.034901,-0.720689
2000-01-05,-1.897321,-0.746535,-0.810896,-0.57405
2000-01-06,1.529619,-0.07739,0.828964,0.970553
2000-01-07,0.74333,-1.02483,-0.438879,0.156383
2000-01-08,1.404251,-0.54839,0.479488,-0.100793


In [294]:
df[df < 0]

Unnamed: 0,A,B,C,D
2000-01-01,,-0.010844,,
2000-01-02,-0.427821,,-0.042227,-0.791288
2000-01-03,,,,-0.002273
2000-01-04,-1.161963,-1.295759,-0.034901,-0.720689
2000-01-05,-1.897321,-0.746535,-0.810896,-0.57405
2000-01-06,,-0.07739,,
2000-01-07,,-1.02483,-0.438879,
2000-01-08,,-0.54839,,-0.100793


In addition, `where` takes an optional `other` argument for replacement of values where the condition is False, in the returned copy.

In [295]:
df.where(df < 0, -df)

Unnamed: 0,A,B,C,D
2000-01-01,-0.476236,-0.010844,-1.911654,-0.859734
2000-01-02,-0.427821,-1.062463,-0.042227,-0.791288
2000-01-03,-0.13746,-0.831306,-1.013059,-0.002273
2000-01-04,-1.161963,-1.295759,-0.034901,-0.720689
2000-01-05,-1.897321,-0.746535,-0.810896,-0.57405
2000-01-06,-1.529619,-0.07739,-0.828964,-0.970553
2000-01-07,-0.74333,-1.02483,-0.438879,-0.156383
2000-01-08,-1.404251,-0.54839,-0.479488,-0.100793


You may wish to set values based on some boolean criteria. This can be done intuitively like so:

In [297]:
# NOTE: at this point `s` holds the values 0..4 (no negatives), so the mask
# selects nothing and s2 comes out equal to s. The pattern only has a visible
# effect on data that contains negative values.
s2 = s.copy()
s2[s2 < 0] = 0
s2

4    0
3    1
2    2
1    3
0    4
dtype: int64

In [298]:
df2 = df.copy()
df2[df2 < 0] = 0
df2

Unnamed: 0,A,B,C,D
2000-01-01,0.476236,0.0,1.911654,0.859734
2000-01-02,0.0,1.062463,0.0,0.0
2000-01-03,0.13746,0.831306,1.013059,0.0
2000-01-04,0.0,0.0,0.0,0.0
2000-01-05,0.0,0.0,0.0,0.0
2000-01-06,1.529619,0.0,0.828964,0.970553
2000-01-07,0.74333,0.0,0.0,0.156383
2000-01-08,1.404251,0.0,0.479488,0.0


In [None]:
df_orig = df.copy()