### Set-Up

In [1]:
import pandas as pd
import numpy as np

In [2]:
# load the dataset
adult_income = pd.read_csv("adult-income.csv")
adult_income = adult_income.rename(columns={"39": "Age", " State-gov": "Workclass", " 77516": "Final_weight", 
                             " Bachelors": "Education", " 13": "Education_num", " Never-married": "Marital_status", 
                             " Adm-clerical": "Occupation", " Not-in-family": "Relationship", " White": "Race", 
                             " Male": "Gender", " 2174": "Capital_gain", " 0": "Capital_loss", 
                             " 40": "hours-per-week", " United-States": "Native_country", 
                             " <=50K": "Income_bracket"})
# just to check data loaded
adult_income.head()

Unnamed: 0,Age,Workclass,Final_weight,Education,Education_num,Marital_status,Occupation,Relationship,Race,Gender,Capital_gain,Capital_loss,hours-per-week,Native_country,Income_bracket
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


### Selection using Boolean Array (Boolean Indexing)

In [4]:
# create boolean array
b_arr = adult_income["Education_num"] >= 12
# print the first 10 values
print(b_arr.head(10))

0     True
1    False
2    False
3     True
4     True
5    False
6    False
7     True
8     True
9    False
Name: Education_num, dtype: bool


In [6]:
# selection of subset
temp_df = adult_income[b_arr]
 
# display first few elements
temp_df.head()

Unnamed: 0,Age,Workclass,Final_weight,Education,Education_num,Marital_status,Occupation,Relationship,Race,Gender,Capital_gain,Capital_loss,hours-per-week,Native_country,Income_bracket
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
7,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
8,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [8]:
temp_df = adult_income[adult_income["Education_num"] >= 12]
temp_df.head()

Unnamed: 0,Age,Workclass,Final_weight,Education,Education_num,Marital_status,Occupation,Relationship,Race,Gender,Capital_gain,Capital_loss,hours-per-week,Native_country,Income_bracket
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
7,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
8,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [13]:
temp_df = adult_income.loc[adult_income["Education_num"]>=12, :]
temp_df.head()

Unnamed: 0,Age,Workclass,Final_weight,Education,Education_num,Marital_status,Occupation,Relationship,Race,Gender,Capital_gain,Capital_loss,hours-per-week,Native_country,Income_bracket
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
7,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
8,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [21]:
subset = adult_income[adult_income["Education"] == " Bachelors"]
subset.nlargest(5, "Capital_gain")

Unnamed: 0,Age,Workclass,Final_weight,Education,Education_num,Marital_status,Occupation,Relationship,Race,Gender,Capital_gain,Capital_loss,hours-per-week,Native_country,Income_bracket
1527,52,Private,118025,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,99999,0,50,United-States,>50K
1681,43,Private,176270,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,99999,0,60,United-States,>50K
1764,49,Private,159816,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,White,Female,99999,0,20,United-States,>50K
2318,65,Self-emp-inc,139272,Bachelors,13,Married-civ-spouse,Sales,Husband,White,Male,99999,0,60,United-States,>50K
6034,78,Self-emp-not-inc,316261,Bachelors,13,Never-married,Exec-managerial,Not-in-family,White,Male,99999,0,20,United-States,>50K


### Criterias using logical operations — and, or, not

In [24]:
# create boolean array
b_arr = (adult_income["Education_num"] >= 12) & (adult_income["Gender"] == " Female")
 
# selection of subset
temp_df = adult_income[b_arr]
 
# display first few elements
temp_df.head()

Unnamed: 0,Age,Workclass,Final_weight,Education,Education_num,Marital_status,Occupation,Relationship,Race,Gender,Capital_gain,Capital_loss,hours-per-week,Native_country,Income_bracket
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
7,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
11,23,Private,122272,Bachelors,13,Never-married,Adm-clerical,Own-child,White,Female,0,0,30,United-States,<=50K
18,43,Self-emp-not-inc,292175,Masters,14,Divorced,Exec-managerial,Unmarried,White,Female,0,0,45,United-States,>50K


In [33]:
# create boolean array
b_arr = (adult_income["Capital_gain"] > 1000) | (adult_income["Income_bracket"] == " >50K")
 
# selection of subset
temp_df = adult_income[b_arr]
 
# display first 10 elements
temp_df.head(10)

Unnamed: 0,Age,Workclass,Final_weight,Education,Education_num,Marital_status,Occupation,Relationship,Race,Gender,Capital_gain,Capital_loss,hours-per-week,Native_country,Income_bracket
6,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
7,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
8,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K
9,37,Private,280464,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,80,United-States,>50K
10,30,State-gov,141297,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K
13,40,Private,121772,Assoc-voc,11,Married-civ-spouse,Craft-repair,Husband,Asian-Pac-Islander,Male,0,0,40,?,>50K
18,43,Self-emp-not-inc,292175,Masters,14,Divorced,Exec-managerial,Unmarried,White,Female,0,0,45,United-States,>50K
19,40,Private,193524,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,60,United-States,>50K
24,56,Local-gov,216851,Bachelors,13,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,40,United-States,>50K
26,54,?,180211,Some-college,10,Married-civ-spouse,?,Husband,Asian-Pac-Islander,Male,0,0,60,South,>50K


In [35]:
# create boolean array
b_arr = ~(adult_income["Education"] == ' Doctorate') 
 
# selection of subset
temp_df = adult_income[b_arr]
 
# display first 10 elements
temp_df.head(30)

Unnamed: 0,Age,Workclass,Final_weight,Education,Education_num,Marital_status,Occupation,Relationship,Race,Gender,Capital_gain,Capital_loss,hours-per-week,Native_country,Income_bracket
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
5,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
6,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
7,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
8,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K
9,37,Private,280464,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,80,United-States,>50K


In [37]:
b_arr = (adult_income["Education_num"] < 10) & (adult_income["Income_bracket"] == ' >50K')
subset = adult_income[b_arr]
# print shape
print('shape of subset DataFrame')
print(subset.shape)

shape of subset DataFrame
(1919, 15)


### Selection based on datatype

In [38]:
# select numeric dtypes
df_temp = adult_income.select_dtypes(include='number')
 
# display first few elements
df_temp.head()

Unnamed: 0,Age,Final_weight,Education_num,Capital_gain,Capital_loss,hours-per-week
0,50,83311,13,0,0,13
1,38,215646,9,0,0,40
2,53,234721,7,0,0,40
3,28,338409,13,0,0,40
4,37,284582,14,0,0,40
