In [1]:
import numpy as np
import pandas as pd

## Analyzing Demographic Data

### Questions to be answered are:

- How many people of each race are represented in this dataset? This should be a Pandas series with race names as the index labels. (race column)
- What is the average age of men?
- What is the percentage of people who have a Bachelor's degree?
- What percentage of people with advanced education (Bachelors, Masters, or Doctorate) make more than 50K?
- What percentage of people without advanced education make more than 50K?
- What is the minimum number of hours a person works per week?
- What percentage of the people who work the minimum number of hours per week have a salary of more than 50K?
- What country has the highest percentage of people that earn >50K and what is that percentage?
- Identify the most popular occupation for those who earn >50K in India.

In [2]:
df = pd.read_csv("adult.data.csv")

In [52]:
df.sample(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
24337,62,Local-gov,68268,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States,<=50K
25979,67,Private,158301,HS-grad,9,Divorced,Transport-moving,Not-in-family,White,Male,0,0,60,United-States,<=50K
23825,62,?,83439,7th-8th,4,Married-civ-spouse,?,Husband,White,Male,0,0,40,United-States,<=50K
10371,37,Private,164898,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,52,United-States,<=50K
2666,20,Private,118462,HS-grad,9,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,43,United-States,<=50K


In [4]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

In [36]:
df.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [5]:
df.groupby(['race'])['sex'].count()


race
Amer-Indian-Eskimo      311
Asian-Pac-Islander     1039
Black                  3124
Other                   271
White                 27816
Name: sex, dtype: int64

### What is the average age of men?

In [6]:
men = df[df.sex == 'Male']['age'].mean()
men

39.43354749885268

### What is the percentage of people who have a bachelor's degree?

In [7]:
bachelor_percentage = df[df.education == 'Bachelors']['education'].count() / df['education'].count()
bachelor_percentage

0.16446055096587942

### What percentage of people with advanced education (Bachelors, Masters, or Doctorate) make more than 50K?

In [28]:
education_salary = df[['education', 'salary']]
education_salary


advanced = education_salary[(education_salary['education'] == 'Bachelors') | (education_salary['education'] == 'Masters') | (education_salary['education'] == 'Doctorate')]
above_50 = advanced[advanced['salary'] == '>50K']
above_50.count() / advanced.count()

education    0.465358
salary       0.465358
dtype: float64

### - What percentage of people without advanced education make more than 50K?

In [34]:
non_advanced = education_salary.drop(advanced.index)
non_advanced[non_advanced['salary'] == '>50K'].count() / non_advanced.count()

education    0.173714
salary       0.173714
dtype: float64

### What is the minimum number of hours a person works per week?

In [35]:
df['hours-per-week'].min()

1

### What percentage of the people who work the minimum number of hours per week have a salary of more than 50K?

In [63]:

salary_hours = df[['salary', 'hours-per-week']]
one = salary_hours[salary_hours['hours-per-week'] == 1]
one[one['salary']=='>50K'].count() / one.count()

salary            0.1
hours-per-week    0.1
dtype: float64

### What country has the highest percentage of people that earn >50K and what is that percentage?

### Identify the most popular occupation for those who earn >50K in India.