### Set-up

In [3]:
import pandas as pd
import numpy as np

In [4]:
adult_income = pd.read_csv("adult-income.csv")
adult_income = adult_income.rename(columns={"39": "Age", " State-gov": "Workclass", " 77516": "Final_weight", 
                             " Bachelors": "Education", " 13": "Education_num", " Never-married": "Marital_status", 
                             " Adm-clerical": "Occupation", " Not-in-family": "Relationship", " White": "Race", 
                             " Male": "Gender", " 2174": "Capital_gain", " 0": "Capital_loss", 
                             " 40": "hours-per-week", " United-States": "Native_country", 
                             " <=50K": "Income_bracket"})
adult_income.head()

Unnamed: 0,Age,Workclass,Final_weight,Education,Education_num,Marital_status,Occupation,Relationship,Race,Gender,Capital_gain,Capital_loss,hours-per-week,Native_country,Income_bracket
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


### Modifying specific values in a DataFrame

In [5]:
print(adult_income.loc[2, :])

Age                                53
Workclass                     Private
Final_weight                   234721
Education                        11th
Education_num                       7
Marital_status     Married-civ-spouse
Occupation          Handlers-cleaners
Relationship                  Husband
Race                            Black
Gender                           Male
Capital_gain                        0
Capital_loss                        0
hours-per-week                     40
Native_country          United-States
Income_bracket                  <=50K
Name: 2, dtype: object


In [6]:
adult_income.loc[2, 'Education'] = 'College'
 
# print row index 2 to verify modification
print(adult_income.loc[2, :])

Age                                53
Workclass                     Private
Final_weight                   234721
Education                     College
Education_num                       7
Marital_status     Married-civ-spouse
Occupation          Handlers-cleaners
Relationship                  Husband
Race                            Black
Gender                           Male
Capital_gain                        0
Capital_loss                        0
hours-per-week                     40
Native_country          United-States
Income_bracket                  <=50K
Name: 2, dtype: object


### Modifying values based on criteria (Boolean indexing)

In [7]:
adult_income.loc[adult_income["Education_num"] >= 16, :]

Unnamed: 0,Age,Workclass,Final_weight,Education,Education_num,Marital_status,Occupation,Relationship,Race,Gender,Capital_gain,Capital_loss,hours-per-week,Native_country,Income_bracket
19,40,Private,193524,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,60,United-States,>50K
62,42,Private,116632,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,45,United-States,>50K
88,43,Federal-gov,410867,Doctorate,16,Never-married,Prof-specialty,Not-in-family,White,Female,0,0,50,United-States,>50K
95,48,Self-emp-not-inc,191277,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,1902,60,United-States,>50K
188,58,State-gov,109567,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,1,United-States,>50K
412,46,Private,188386,Doctorate,16,Married-civ-spouse,Exec-managerial,Husband,White,Male,15024,0,60,United-States,>50K
540,29,Private,133937,Doctorate,16,Never-married,Prof-specialty,Own-child,White,Male,0,0,40,United-States,<=50K
573,27,Private,158156,Doctorate,16,Never-married,Prof-specialty,Not-in-family,White,Female,0,0,70,United-States,<=50K
638,52,Self-emp-not-inc,289436,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,60,United-States,>50K
644,50,Self-emp-inc,195322,Doctorate,16,Separated,Prof-specialty,Not-in-family,White,Male,0,0,40,United-States,>50K


In [8]:
adult_income.loc[adult_income["Education_num"] >= 16, 'Education'] = 'Ph.D.'
 
# Print the rows to verify the modification
adult_income.loc[adult_income["Education_num"] >= 16, :]

Unnamed: 0,Age,Workclass,Final_weight,Education,Education_num,Marital_status,Occupation,Relationship,Race,Gender,Capital_gain,Capital_loss,hours-per-week,Native_country,Income_bracket
19,40,Private,193524,Ph.D.,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,60,United-States,>50K
62,42,Private,116632,Ph.D.,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,45,United-States,>50K
88,43,Federal-gov,410867,Ph.D.,16,Never-married,Prof-specialty,Not-in-family,White,Female,0,0,50,United-States,>50K
95,48,Self-emp-not-inc,191277,Ph.D.,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,1902,60,United-States,>50K
188,58,State-gov,109567,Ph.D.,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,1,United-States,>50K
412,46,Private,188386,Ph.D.,16,Married-civ-spouse,Exec-managerial,Husband,White,Male,15024,0,60,United-States,>50K
540,29,Private,133937,Ph.D.,16,Never-married,Prof-specialty,Own-child,White,Male,0,0,40,United-States,<=50K
573,27,Private,158156,Ph.D.,16,Never-married,Prof-specialty,Not-in-family,White,Female,0,0,70,United-States,<=50K
638,52,Self-emp-not-inc,289436,Ph.D.,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,60,United-States,>50K
644,50,Self-emp-inc,195322,Ph.D.,16,Separated,Prof-specialty,Not-in-family,White,Male,0,0,40,United-States,>50K


In [9]:
adult_income.loc[adult_income["Education"] == ' HS-grad', 'Education_num'] = 10
print('new mean')
print(adult_income.mean())
print('')
adult_income.head()

new mean
Age                   38.581634
Final_weight      189781.814373
Education_num         10.403102
Capital_gain        1077.615172
Capital_loss          87.306511
hours-per-week        40.437469
dtype: float64



Unnamed: 0,Age,Workclass,Final_weight,Education,Education_num,Marital_status,Occupation,Relationship,Race,Gender,Capital_gain,Capital_loss,hours-per-week,Native_country,Income_bracket
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,10,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,College,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


### Modifying columns

In [10]:
adult_income.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32560 entries, 0 to 32559
Data columns (total 15 columns):
Age               32560 non-null int64
Workclass         32560 non-null object
Final_weight      32560 non-null int64
Education         32560 non-null object
Education_num     32560 non-null int64
Marital_status    32560 non-null object
Occupation        32560 non-null object
Relationship      32560 non-null object
Race              32560 non-null object
Gender            32560 non-null object
Capital_gain      32560 non-null int64
Capital_loss      32560 non-null int64
hours-per-week    32560 non-null int64
Native_country    32560 non-null object
Income_bracket    32560 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [11]:
adult_income["Income_bracket"] = adult_income["Income_bracket"].astype('category')
 
# check the datatypes
print("Check the dtypes after converting")
adult_income.info()

Check the dtypes after converting
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32560 entries, 0 to 32559
Data columns (total 15 columns):
Age               32560 non-null int64
Workclass         32560 non-null object
Final_weight      32560 non-null int64
Education         32560 non-null object
Education_num     32560 non-null int64
Marital_status    32560 non-null object
Occupation        32560 non-null object
Relationship      32560 non-null object
Race              32560 non-null object
Gender            32560 non-null object
Capital_gain      32560 non-null int64
Capital_loss      32560 non-null int64
hours-per-week    32560 non-null int64
Native_country    32560 non-null object
Income_bracket    32560 non-null category
dtypes: category(1), int64(6), object(8)
memory usage: 3.5+ MB


In [12]:
adult_income["Workclass"] = adult_income["Workclass"].astype('category')
adult_income["Education"] = adult_income["Education"].astype('category')
adult_income["Marital_status"] = adult_income["Marital_status"].astype('category')
adult_income["Occupation"] = adult_income["Occupation"].astype('category')
adult_income["Relationship"] = adult_income["Relationship"].astype('category')
adult_income["Race"] = adult_income["Race"].astype('category')
adult_income["Gender"] = adult_income["Gender"].astype('category')
adult_income["Native_country"] = adult_income["Native_country"].astype('category')

# check the datatypes
print("Check the dtypes after converting")
print("")
adult_income.info()

Check the dtypes after converting

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32560 entries, 0 to 32559
Data columns (total 15 columns):
Age               32560 non-null int64
Workclass         32560 non-null category
Final_weight      32560 non-null int64
Education         32560 non-null category
Education_num     32560 non-null int64
Marital_status    32560 non-null category
Occupation        32560 non-null category
Relationship      32560 non-null category
Race              32560 non-null category
Gender            32560 non-null category
Capital_gain      32560 non-null int64
Capital_loss      32560 non-null int64
hours-per-week    32560 non-null int64
Native_country    32560 non-null category
Income_bracket    32560 non-null category
dtypes: category(9), int64(6)
memory usage: 1.8 MB


In [14]:
adult_income["hours_per_day"] = adult_income["hours-per-week"] / 5
adult_income.head()

Unnamed: 0,Age,Workclass,Final_weight,Education,Education_num,Marital_status,Occupation,Relationship,Race,Gender,Capital_gain,Capital_loss,hours-per-week,Native_country,Income_bracket,hours_per_day
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,2.6
1,38,Private,215646,HS-grad,10,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,8.0
2,53,Private,234721,College,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,8.0
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,8.0
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K,8.0


In [15]:
data = np.exp(adult_income["Education_num"])
data.head()

0    4.424134e+05
1    2.202647e+04
2    1.096633e+03
3    4.424134e+05
4    1.202604e+06
Name: Education_num, dtype: float64

In [17]:
adult_income["product"] = adult_income["hours-per-week"] * adult_income["Education_num"]
adult_income.head()

Unnamed: 0,Age,Workclass,Final_weight,Education,Education_num,Marital_status,Occupation,Relationship,Race,Gender,Capital_gain,Capital_loss,hours-per-week,Native_country,Income_bracket,hours_per_day,product
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,2.6,169
1,38,Private,215646,HS-grad,10,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,8.0,400
2,53,Private,234721,College,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,8.0,280
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,8.0,520
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K,8.0,560


In [21]:
adult_income["edu_with_income"] = (adult_income["Education"] + "-" + adult_income["Income_bracket"])
adult_income.head()

TypeError: Series cannot perform the operation +