In [13]:
import numpy as np
import pandas as pd
import random

# Data selection & Indexing

## Series

In [14]:
# Ten sample integers; with no index given, pandas labels them 0..9.
series = pd.Series(
    data=[3, 62, 75, 83, 47, 43, 39, 16, 19, 2],
)

In [15]:
series

0     3
1    62
2    75
3    83
4    47
5    43
6    39
7    16
8    19
9     2
dtype: int64

### Access by Position / Slice

In [16]:
# Select the first element by position. .iloc makes the positional intent explicit;
# plain series[0] is a label lookup that only works here because the default
# integer index happens to contain the label 0.
series.iloc[0]

3

In [17]:
# A plain integer slice in [] selects by position; the end (6) is excluded.
series[3:6]

3    83
4    47
5    43
dtype: int64

In [18]:
# Equivalent to the plain slice series[3:6], but explicitly positional.
series.iloc[3:6]
# Note: .iloc takes square brackets [], not call parentheses ().

3    83
4    47
5    43
dtype: int64

### Access by label

In [19]:
# Relabel the series with the first len(series) capital letters (A, B, C, ...).
series.index = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ"[:len(series)])

In [20]:
series

A     3
B    62
C    75
D    83
E    47
F    43
G    39
H    16
I    19
J     2
dtype: int64

In [21]:
series[3:6]
# An integer slice is still positional on a letter-labelled Series,
# and the end is excluded just like a Python list slice.

D    83
E    47
F    43
dtype: int64

In [22]:
series['D':'F']
# Slicing by label: unlike a positional slice, the end label ('F') is included!

D    83
E    47
F    43
dtype: int64

In [23]:
series[['D':'F', 'I':'J']]
# Intentional error: a:b slice syntax is only valid directly inside [],
# not as an element of a list, so several ranges cannot be combined this way.

SyntaxError: invalid syntax (<ipython-input-23-a585bc35575e>, line 1)

In [24]:
# Take each label range separately (end labels included) and stitch the pieces together.
pd.concat([series.loc[first:last] for first, last in (('D', 'F'), ('I', 'J'))])

D    83
E    47
F    43
I    19
J     2
dtype: int64

In [25]:
# Relabel again, this time with letters that repeat (A and T occur more than once),
# so the index is no longer unique.
series.index = list("GATTACAXYZ"[:len(series)])

In [26]:
series

G     3
A    62
T    75
T    83
A    47
C    43
A    39
X    16
Y    19
Z     2
dtype: int64

In [27]:
# 'G' occurs exactly once in the index, so .loc returns a single value.
series.loc['G']

3

In [28]:
# A label slice needs an unambiguous end point. 'A' appears several times in
# the index, so pandas raises KeyError. The error is the point of this cell:
# catch it and print it so the notebook still runs from top to bottom.
try:
    series.loc['G':'A']
except KeyError as err:
    print(f"{type(err).__name__}: {err}")

KeyError: "Cannot get right slice bound for non-unique label: 'A'"

In [29]:
series.loc['X':'Z']
# Labels that occur only once can still be used as slice bounds,
# even though the index as a whole is non-unique.

X    16
Y    19
Z     2
dtype: int64

## DataFrames, 2D Data

In [30]:
# Load the sample DataFrame from JSON; the path is relative to the directory the notebook runs in.
df = pd.read_json('../data/sampledf.json')

In [31]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,79,19,21,99,35,59,44,25,75,58
1,25,39,89,66,9,41,6,69,63,3
2,37,64,31,69,61,97,5,11,76,57
3,74,61,100,6,58,80,95,50,15,51
4,79,60,83,85,16,5,16,69,5,20
5,45,26,73,73,100,60,21,19,95,12
6,12,29,18,98,62,68,92,29,74,96
7,36,32,22,4,66,25,63,51,59,14
8,55,53,89,13,84,87,74,3,2,64
9,46,74,36,54,21,12,68,33,80,25


In [32]:
df[2]
# A single key in [] selects the column labelled 2 (returned as a Series).

0     21
1     89
2     31
3    100
4     83
5     73
6     18
7     22
8     89
9     36
Name: 2, dtype: int64

In [33]:
df[2:4]
# Careful: a slice in [] selects rows (here rows 2 and 3), not columns!

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
2,37,64,31,69,61,97,5,11,76,57
3,74,61,100,6,58,80,95,50,15,51


In [34]:
df.iloc[2:4, 2:4]
# Rectangular segment by position: rows 2-3 and columns 2-3 (ends excluded).

Unnamed: 0,2,3
2,31,69
3,100,6


In [35]:
df.iloc[:, 2:4]
# Column slice by position: ":" keeps every row, 2:4 keeps columns 2 and 3.

Unnamed: 0,2,3
0,21,99
1,89,66
2,31,69
3,100,6
4,83,85
5,73,73
6,18,98
7,22,4
8,89,13
9,36,54


In [36]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,79,19,21,99,35,59,44,25,75,58
1,25,39,89,66,9,41,6,69,63,3
2,37,64,31,69,61,97,5,11,76,57
3,74,61,100,6,58,80,95,50,15,51
4,79,60,83,85,16,5,16,69,5,20
5,45,26,73,73,100,60,21,19,95,12
6,12,29,18,98,62,68,92,29,74,96
7,36,32,22,4,66,25,63,51,59,14
8,55,53,89,13,84,87,74,3,2,64
9,46,74,36,54,21,12,68,33,80,25


In [37]:
# Give the rows string labels R00, R01, ... (zero-padded to two digits).
df.index = [f"R{row_no:02d}" for row_no in range(df.shape[0])]

In [38]:
# Give the columns string labels C00, C01, ... (zero-padded to two digits).
df.columns = [f"C{col_no:02d}" for col_no in range(df.shape[1])]

In [39]:
df

Unnamed: 0,C00,C01,C02,C03,C04,C05,C06,C07,C08,C09
R00,79,19,21,99,35,59,44,25,75,58
R01,25,39,89,66,9,41,6,69,63,3
R02,37,64,31,69,61,97,5,11,76,57
R03,74,61,100,6,58,80,95,50,15,51
R04,79,60,83,85,16,5,16,69,5,20
R05,45,26,73,73,100,60,21,19,95,12
R06,12,29,18,98,62,68,92,29,74,96
R07,36,32,22,4,66,25,63,51,59,14
R08,55,53,89,13,84,87,74,3,2,64
R09,46,74,36,54,21,12,68,33,80,25


In [40]:
# A single label in [] selects that column.
df['C05']

R00    59
R01    41
R02    97
R03    80
R04     5
R05    60
R06    68
R07    25
R08    87
R09    12
Name: C05, dtype: int64

In [41]:
# A label slice in [] selects rows, and the end label ('R05') is included.
df['R02':'R05']

Unnamed: 0,C00,C01,C02,C03,C04,C05,C06,C07,C08,C09
R02,37,64,31,69,61,97,5,11,76,57
R03,74,61,100,6,58,80,95,50,15,51
R04,79,60,83,85,16,5,16,69,5,20
R05,45,26,73,73,100,60,21,19,95,12


In [42]:
df.loc['R02':'R05', 'C04':'C05']
# Rectangular segment by label: rows R02..R05 and columns C04..C05, both ends included.

Unnamed: 0,C04,C05
R02,61,97
R03,58,80
R04,16,5
R05,100,60


## Boolean Index

A boolean index is an array of True/False values, one per row: [True, False, True, True, False, False, True, …]. Rows whose value is True are selected.

Note: despite the name, a boolean index is not one of the pandas Index types.

In [43]:
df['C04']

R00     35
R01      9
R02     61
R03     58
R04     16
R05    100
R06     62
R07     66
R08     84
R09     21
Name: C04, dtype: int64

In [44]:
# Comparing a column with a scalar gives a boolean Series: one True/False per row.
df['C04'] > 60

R00    False
R01    False
R02     True
R03    False
R04    False
R05     True
R06     True
R07     True
R08     True
R09    False
Name: C04, dtype: bool

In [48]:
# Keep only the rows whose C04 value is greater than 60.
df.loc[df['C04'].gt(60)]

Unnamed: 0,C00,C01,C02,C03,C04,C05,C06,C07,C08,C09
R02,37,64,31,69,61,97,5,11,76,57
R05,45,26,73,73,100,60,21,19,95,12
R06,12,29,18,98,62,68,92,29,74,96
R07,36,32,22,4,66,25,63,51,59,14
R08,55,53,89,13,84,87,74,3,2,64


In [46]:
df.loc[df['C04'].lt(60) | df['C04'].gt(80)]  # OR: C04 below 60 or above 80

Unnamed: 0,C00,C01,C02,C03,C04,C05,C06,C07,C08,C09
R00,79,19,21,99,35,59,44,25,75,58
R01,25,39,89,66,9,41,6,69,63,3
R03,74,61,100,6,58,80,95,50,15,51
R04,79,60,83,85,16,5,16,69,5,20
R05,45,26,73,73,100,60,21,19,95,12
R08,55,53,89,13,84,87,74,3,2,64
R09,46,74,36,54,21,12,68,33,80,25


In [47]:
df.loc[df['C04'].lt(60) & df['C04'].mod(2).eq(0)]  # AND: C04 below 60 and even

Unnamed: 0,C00,C01,C02,C03,C04,C05,C06,C07,C08,C09
R03,74,61,100,6,58,80,95,50,15,51
R04,79,60,83,85,16,5,16,69,5,20
