## Pandas Indexing and Subsetting

This notebook focuses on understanding indexing and subsetting in Pandas dataframes.

1. How to define a dataframe using a dictionary.
2. How to set the index and access attributes of a dataframe.
3. How to access specific rows and columns of a dataframe, as well as slices of them.

In [1]:
import numpy as np
import pandas as pd
import random
import string

In [2]:
##create a dictionary that stores each student's randomly generated id,
##roll number, and math, physics & chemistry scores.
##both random generators are seeded locally so that restarting the kernel
##and re-running the notebook always produces the same ids and scores.

SEED = 42
N_STUDENTS = 30

id_rng = random.Random(SEED)
score_rng = np.random.default_rng(SEED)
id_alphabet = string.ascii_uppercase + string.digits

scores_dict = {
    ##5-character ids drawn from uppercase letters and digits
    'id': [''.join(id_rng.choices(id_alphabet, k=5))
           for _ in range(N_STUDENTS)],
    ##roll numbers run from 1 to N_STUDENTS
    'roll': np.arange(N_STUDENTS) + 1,
    ##each score is an integer drawn uniformly from 0 to 99
    'math_scores': score_rng.integers(100, size=N_STUDENTS),
    'physics_scores': score_rng.integers(100, size=N_STUDENTS),
    'chemistry_scores': score_rng.integers(100, size=N_STUDENTS)
}

print(scores_dict)

{'id': ['K43QX', '20WMM', '2GU05', 'V1ZYB', 'WWNMT', '95UDH', 'S6C1N', '2TQQB', 'OH18E', '2996M', 'MP5CK', 'Y3BIK', 'CAG6L', 'BK3D5', 'O6KJA', 'A19A0', 'LTJFJ', 'FSEC9', 'ED6EN', '2KZZA', 'N1IEN', '3VJ9D', 'R0UHX', '16BSL', 'ZAWP6', '3XLCZ', '1DMW6', 'QXCRK', 'Q6FET', 'Q732V'], 'roll': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]), 'math_scores': array([ 2, 88,  0, 58, 85, 71, 69, 72, 27, 36, 58, 96,  2, 68, 57,  3, 89,
       54, 20, 10, 35, 93,  0, 66, 80,  4, 62, 48,  7, 43]), 'physics_scores': array([69, 19, 22, 56, 99, 10, 39, 28, 66, 99, 66, 97, 37, 49, 44,  0, 38,
       53, 56, 25,  3, 68, 45, 31, 49, 42, 84,  9, 40, 31]), 'chemistry_scores': array([89, 23, 52, 55,  5, 13, 29, 26, 47, 89, 64, 66, 68, 96,  1, 88, 64,
       82, 19, 55, 15, 14, 44, 46,  3,  8, 54, 72, 88, 61])}


In [3]:
##build a pandas dataframe from scores_dict:
##each dictionary key becomes a column, each list/array its values

df = pd.DataFrame.from_dict(scores_dict)
df.head(n=5)


Unnamed: 0,id,roll,math_scores,physics_scores,chemistry_scores
0,K43QX,1,2,69,89
1,20WMM,2,88,19,23
2,2GU05,3,0,22,52
3,V1ZYB,4,58,56,55
4,WWNMT,5,85,99,5


In [11]:
##make id column the index of the dataframe
##the indexed frame is built straight from scores_dict so that this cell
##can be re-run safely: calling df.set_index('id') on an already indexed
##df raises a KeyError because 'id' is no longer a column.
df = pd.DataFrame(scores_dict).set_index('id')
df.head()

Unnamed: 0_level_0,roll,math_scores,physics_scores,chemistry_scores
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
K43QX,1,2,69,89
20WMM,2,88,19,23
2GU05,3,0,22,52
V1ZYB,4,58,56,55
WWNMT,5,85,99,5


In [12]:
##select a single column by its name; the result is a Series
##that keeps the dataframe's index
column_name = 'math_scores'
df[column_name]

id
K43QX     2
20WMM    88
2GU05     0
V1ZYB    58
WWNMT    85
95UDH    71
S6C1N    69
2TQQB    72
OH18E    27
2996M    36
MP5CK    58
Y3BIK    96
CAG6L     2
BK3D5    68
O6KJA    57
A19A0     3
LTJFJ    89
FSEC9    54
ED6EN    20
2KZZA    10
N1IEN    35
3VJ9D    93
R0UHX     0
16BSL    66
ZAWP6    80
3XLCZ     4
1DMW6    62
QXCRK    48
Q6FET     7
Q732V    43
Name: math_scores, dtype: int64

In [13]:
##dataframe as a 2d array
##to_numpy() is the accessor recommended by pandas over the older
##.values attribute; it returns the same 2d array of the column
##values (the index is not included).
df.to_numpy()

array([[ 1,  2, 69, 89],
       [ 2, 88, 19, 23],
       [ 3,  0, 22, 52],
       [ 4, 58, 56, 55],
       [ 5, 85, 99,  5],
       [ 6, 71, 10, 13],
       [ 7, 69, 39, 29],
       [ 8, 72, 28, 26],
       [ 9, 27, 66, 47],
       [10, 36, 99, 89],
       [11, 58, 66, 64],
       [12, 96, 97, 66],
       [13,  2, 37, 68],
       [14, 68, 49, 96],
       [15, 57, 44,  1],
       [16,  3,  0, 88],
       [17, 89, 38, 64],
       [18, 54, 53, 82],
       [19, 20, 56, 19],
       [20, 10, 25, 55],
       [21, 35,  3, 15],
       [22, 93, 68, 14],
       [23,  0, 45, 44],
       [24, 66, 31, 46],
       [25, 80, 49,  3],
       [26,  4, 42,  8],
       [27, 62, 84, 54],
       [28, 48,  9, 72],
       [29,  7, 40, 88],
       [30, 43, 31, 61]])

## Indexers in Pandas - iloc and loc

In [14]:
##access the row at integer position 10
##iloc positions are zero-based, so this is the 11th row (roll 11)
df.iloc[10]

roll                11
math_scores         58
physics_scores      66
chemistry_scores    64
Name: MP5CK, dtype: int64

In [15]:
##access only the math score for the row at integer position 10
##(the 11th row, since iloc is zero-based); column position 1 is
##math_scores
df.iloc[10, 1]

58

In [16]:
##take rows at positions 0-4 and columns at positions 0-1;
##the stop position of an iloc slice is excluded
df.iloc[0:5, 0:2]

Unnamed: 0_level_0,roll,math_scores
id,Unnamed: 1_level_1,Unnamed: 2_level_1
K43QX,1,2
20WMM,2,88
2GU05,3,0
V1ZYB,4,58
WWNMT,5,85


In [18]:
##access values using labels
##loc slices by label and, unlike iloc, includes both end points.
##the ids are randomly generated, so the starting label is read from
##the index (third row) instead of being hard-coded; a literal id
##raises a KeyError whenever it is missing from the regenerated data.
start_id = df.index[2]
df.loc[start_id:, :'physics_scores']

Unnamed: 0_level_0,roll,math_scores,physics_scores
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2GU05,3,0,22
V1ZYB,4,58,56
WWNMT,5,85,99
95UDH,6,71,10
S6C1N,7,69,39
2TQQB,8,72,28
OH18E,9,27,66
2996M,10,36,99
MP5CK,11,58,66
Y3BIK,12,96,97
