## Researching Adult Age Data with Pandas

In [3]:
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [77]:
data = pd.read_csv('Data/adult_data.csv', names = ['age', 'workclass','fnlwgt','education','education-num',
                                                   'marital','occupation','relationship','race','sex',
                                                   'capital-gain','capital-loss','hours','native-country', 'Earnings'
                                                  ], index_col = False)
whitespacesToTakeOut = ['sex','native-country','Earnings']
for columnName in whitespacesToTakeOut:
    data[columnName] = data[columnName].apply(lambda x: x.strip())    #Strip out whitespace from 'sex' column

## Questions
### 1. How many men and women (sex feature) are represented in this dataset?


In [78]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital,occupation,relationship,race,sex,capital-gain,capital-loss,hours,native-country,Earnings
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [79]:
data.sex.value_counts()

Male      21790
Female    10771
Name: sex, dtype: int64

In [80]:
data['native-country'].value_counts()

United-States                 29170
Mexico                          643
?                               583
Philippines                     198
Germany                         137
Canada                          121
Puerto-Rico                     114
El-Salvador                     106
India                           100
Cuba                             95
England                          90
Jamaica                          81
South                            80
China                            75
Italy                            73
Dominican-Republic               70
Vietnam                          67
Guatemala                        64
Japan                            62
Poland                           60
Columbia                         59
Taiwan                           51
Haiti                            44
Iran                             43
Portugal                         37
Nicaragua                        34
Peru                             31
France                      

### 2. What is the average age (age feature) of women?

In [81]:
age_avg = data[data['sex'] == 'Female']['age'].mean()
print("The average age of women is %s" % age_avg)

The average age of women is 36.85823043357163


### 3. What is the percentage of German citizens (native-country feature)?

In [82]:
native_country = data['native-country']

germanCitizensPercentage = data[native_country== 'Germany']['native-country'].count() / \
native_country.count()
print("The percentage of German Citizens is %s" % germanCitizensPercentage)

The percentage of German Citizens is 0.00420748748503


### 4-5. What are the mean and standard deviation of age for those who earn more than 50K per year (salary feature) and those who earn less than 50K per year?


In [98]:
EarningsStats_50kGreater = data[data['Earnings']== '>50K']['age'].describe()[['mean','std']]
EarningsStats_50kLess =  data[data['Earnings']== '<=50K']['age'].describe()[['mean','std']]
print("Mean and Standard Deviation for earnings greater than 50k\n %s" % EarningsStats_50kGreater)
print("Mean and Standard Deviation for earnings greater than 50k\n %s" % EarningsStats_50kLess)

Mean and Standard Deviation for earnings greater than 50k
 mean    44.249841
std     10.519028
Name: age, dtype: float64
Mean and Standard Deviation for earnings greater than 50k
 mean    36.783738
std     14.020088
Name: age, dtype: float64


### 6. Is it true that people who earn more than 50K have at least high school education? (education – Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters or Doctorate feature)


In [106]:
data[['age','Earnings','education']].pivot_table(index = 'Earnings', columns = ['education'],aggfunc ='count')

Unnamed: 0_level_0,age,age,age,age,age,age,age,age,age,age,age,age,age,age,age,age
education,10th,11th,12th,1st-4th,5th-6th,7th-8th,9th,Assoc-acdm,Assoc-voc,Bachelors,Doctorate,HS-grad,Masters,Preschool,Prof-school,Some-college
Earnings,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
<=50K,871.0,1115.0,400.0,162.0,317.0,606.0,487.0,802.0,1021.0,3134.0,107.0,8826.0,764.0,51.0,153.0,5904.0
>50K,62.0,60.0,33.0,6.0,16.0,40.0,27.0,265.0,361.0,2221.0,306.0,1675.0,959.0,,423.0,1387.0


### 7. Display age statistics for each race (race feature) and each gender (sex feature). Use groupby() and describe(). Find the maximum age of men of Amer-Indian-Eskimo race.


In [137]:
raceBreakdown = data[['age','race']].groupby(by = 'race').describe()
print("Age Descriptive Statistics by Gender")
print(raceBreakdown)

genderBreakdown = data[['age','sex']].groupby(by = 'sex').describe()
print("Age Descriptive Statistics by Gender")
print(genderBreakdown)

print("\n\n")
print("The maximum age of men of American Indian-Eskimo race is %s" % pd.DataFrame(raceBreakdown).reset_index().iloc[0,8])

Age Descriptive Statistics by Gender
                         age                                                \
                       count       mean        std   min   25%   50%   75%   
race                                                                         
 Amer-Indian-Eskimo    311.0  37.173633  12.447130  17.0  28.0  35.0  45.5   
 Asian-Pac-Islander   1039.0  37.746872  12.825133  17.0  28.0  36.0  45.0   
 Black                3124.0  37.767926  12.759290  17.0  28.0  36.0  46.0   
 Other                 271.0  33.457565  11.538865  17.0  25.0  31.0  41.0   
 White               27816.0  38.769881  13.782306  17.0  28.0  37.0  48.0   

                           
                      max  
race                       
 Amer-Indian-Eskimo  82.0  
 Asian-Pac-Islander  90.0  
 Black               90.0  
 Other               77.0  
 White               90.0  
Age Descriptive Statistics by Gender
            age                                                    
         

### 8. Among whom is the proportion of those who earn a lot (>50K) greater: married or single men (marital-status feature)? Consider as married those who have a marital-status starting with Married (Married-civ-spouse, Married-spouse-absent or Married-AF-spouse), the rest are considered bachelors.


In [159]:
maritalCount = 0
nonMaritcalCount = 0

peopleEarnMoreThan50 = data[data['Earnings']=='>50K']
totalCount = peopleEarnMoreThan50['marital'].count()

for key, val in dict(peopleEarnMoreThan50.marital.value_counts()).items():
    if "Mar" in key:
        maritalCount = maritalCount + val
    else:
        nonMaritcalCount = nonMaritcalCount + val
maritalProportion = nonMaritcalCount / totalCount
nonMaritalProportion = maritalCount / totalCount

print('Proportion of Martial Individuals who earn a lot: %s' % maritalProportion)
print('Proportion of Non-Martial Individuals who earn a lot: %s' % nonMaritalProportion)

Proportion of Martial Individuals who earn a lot: 0.140925902308
Proportion of Martial Individuals who do not earn a lot: 0.859074097692


### 9. What is the maximum number of hours a person works per week (hours-per-week feature)? How many people work such a number of hours, and what is the percentage of those who earn a lot (>50K) among them?


In [189]:
maxHoursWorked = data['hours'].max()
print("Maximum hours a person works per week %s" % maxHoursWorked)

thosewhoWork99 = data[data['hours']==maxHoursWorked]['hours'].count()
print("\nNumber of people who work %s: %s" % (maxHoursWorked,thosewhoWork99))

those99AndGreater50K_pt1 = data[data['hours']==maxHoursWorked]
those99AndGreater50K_pt2 = those99AndGreater50K_pt1[those99AndGreater50K_pt1['Earnings'] == '>50K']['hours'].count() / peopleEarnMoreThan50['hours'].count() * 100
print("\n Percentage of Those who earned >50k among People who work %s hours: %s" % (those99AndGreater50K_pt2, those99AndGreater50K_pt2),'%')

Maximum hours a person works per week 99

Number of people who work 99: 85

 Percentage of Those who earned >50k among People who work 0.318836883051 hours: 0.318836883051 %


## Resources


### Data Dictionary
1. age
2. workclass
3. fnlwgt
4. education
5. education-num
6. 'marital
7. occupation
8. relationship
9. race
10. sex
11. capital-gain
12. capital-loss
13. hours
14. native-country
15. earnings