* In this challenge you must analyze demographic data using Pandas. 
* You are given a dataset of demographic data that was extracted from the 1994 Census database.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('adult.data.csv')

df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
df.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


* __How many people of each race are represented in this dataset?__ 
* __This should be a Pandas series with race names as the index labels.__

In [4]:
df['race'].value_counts()              # 1st required

White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: race, dtype: int64

In [5]:
# Average age of men:

age_mean = df.groupby('sex', as_index=False).age.mean()
average_age_men = float(format(age_mean.iloc[1].age, '.1f'))

average_age_men

39.4

In [6]:
df['education'].value_counts()

HS-grad         10501
Some-college     7291
Bachelors        5355
Masters          1723
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           646
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6th           333
1st-4th           168
Preschool          51
Name: education, dtype: int64

In [7]:
# percentage of people who have bachelors' degree:

bachelors_percentage = float(format((5355/32561) * 100, '.1f'))

bachelors_percentage

16.4

In [8]:
df['salary'].value_counts()

<=50K    24720
>50K      7841
Name: salary, dtype: int64

* __What percentage of people with advanced education (`Bachelors`, `Masters`, or `Doctorate`) make more than 50K?__

In [9]:
df['education'].describe()

count       32561
unique         16
top       HS-grad
freq        10501
Name: education, dtype: object

In [10]:
# with and without bachelors:

higher_education = df[df['education'].isin(['Bachelors','Masters','Doctorate'])]

lower_education = df[~df['education'].isin(['Bachelors','Masters','Doctorate'])]



In [11]:
higher_education.shape

(7491, 15)

In [12]:
# count of people with greater than 50k:

higher_education_rich = int(higher_education[higher_education['salary'] == '>50K']['salary'].count())
lower_education_rich = int(lower_education[lower_education['salary'] == '>50K']['salary'].count())


In [13]:
print(type(higher_education_rich))
print(type(lower_education_rich))

<class 'int'>
<class 'int'>


In [14]:
# percentage with greater than 50k:

higher_education_rich = float(format((higher_education_rich / higher_education.shape[0] * 100), '.1f'))

lower_education_rich = float(format((lower_education_rich / lower_education.shape[0] * 100), '.1f'))

In [15]:
print(lower_education_rich)
print(higher_education_rich)

17.4
46.5


In [16]:
# minimum num of hours a person works per week:

min_work_hours = df['hours-per-week'].min()

min_work_hours

1

In [17]:
df['hours-per-week'].describe()

count    32561.000000
mean        40.437456
std         12.347429
min          1.000000
25%         40.000000
50%         40.000000
75%         45.000000
max         99.000000
Name: hours-per-week, dtype: float64

In [18]:
# What percentage of the people who work the minimum number of hours per week have a salary of >50K?

num_min_workers = df[df['hours-per-week'] == 1]

num_min_workers.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
189,58,State-gov,109567,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,1,United-States,>50K
1036,66,Self-emp-inc,150726,9th,5,Married-civ-spouse,Exec-managerial,Husband,White,Male,1409,0,1,?,<=50K
1262,69,?,195779,Assoc-voc,11,Widowed,?,Not-in-family,White,Female,0,0,1,United-States,<=50K
5590,78,?,363134,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,0,1,United-States,<=50K
5632,45,?,189564,Masters,14,Married-civ-spouse,?,Wife,White,Female,0,0,1,United-States,<=50K


In [19]:
number =  num_min_workers['hours-per-week'].count()

number

20

In [20]:
rich_percentage_num= num_min_workers[num_min_workers['salary'] == '>50K']['salary'].count()

rich_percentage_num

2

In [21]:
# What percentage of the people who work the minimum number of hours per week have a salary of more than 50K?

rich_percentage = (rich_percentage_num / number) * 100

rich_percentage

10.0

* __What country has the highest percentage of people that earn >50K and what is that percentage?__

In [26]:
df['native-country'].describe()

count             32561
unique               42
top       United-States
freq              29170
Name: native-country, dtype: object

In [27]:
df['native-country'].value_counts()

United-States                 29170
Mexico                          643
?                               583
Philippines                     198
Germany                         137
Canada                          121
Puerto-Rico                     114
El-Salvador                     106
India                           100
Cuba                             95
England                          90
Jamaica                          81
South                            80
China                            75
Italy                            73
Dominican-Republic               70
Vietnam                          67
Guatemala                        64
Japan                            62
Poland                           60
Columbia                         59
Taiwan                           51
Haiti                            44
Iran                             43
Portugal                         37
Nicaragua                        34
Peru                             31
France                      

In [28]:
rich = df[df['salary'].isin(['>50K'])]

rich

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K
10,37,Private,280464,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,80,United-States,>50K
11,30,State-gov,141297,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32539,71,?,287372,Doctorate,16,Married-civ-spouse,?,Husband,White,Male,0,0,10,United-States,>50K
32545,39,Local-gov,111499,Assoc-acdm,12,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,20,United-States,>50K
32554,53,Private,321865,Masters,14,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,>50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K


In [29]:
rich['native-country'].value_counts()

United-States         7171
?                      146
Philippines             61
Germany                 44
India                   40
Canada                  39
Mexico                  33
England                 30
Italy                   25
Cuba                    25
Japan                   24
China                   20
Taiwan                  20
Iran                    18
South                   16
Poland                  12
France                  12
Puerto-Rico             12
Jamaica                 10
El-Salvador              9
Greece                   8
Cambodia                 7
Yugoslavia               6
Hong                     6
Vietnam                  5
Ireland                  5
Portugal                 4
Haiti                    4
Ecuador                  4
Scotland                 3
Guatemala                3
Thailand                 3
Hungary                  3
Trinadad&Tobago          2
Columbia                 2
Peru                     2
Nicaragua                2
D

In [30]:
rich_country = rich['native-country'].value_counts().idxmax()

rich_country

'United-States'

In [31]:
total = int(df[df['native-country'] == 'United-States']['native-country'].count())

In [32]:
num = int(rich[rich['native-country']== 'United-States']['native-country'].count())

In [33]:
rich_country_percentage = int((num/total) * 100)

rich_country_percentage

24

In [34]:
india_rich = rich[rich['native-country'] == 'India']

india_rich.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
11,30,State-gov,141297,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K
968,48,Private,164966,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K
1327,52,Private,168381,HS-grad,9,Widowed,Other-service,Unmarried,Asian-Pac-Islander,Female,0,0,40,India,>50K
7258,42,State-gov,102343,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,72,India,>50K
7285,54,State-gov,93449,Masters,14,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K


In [35]:
top_IN_occupation = india_rich['occupation'].value_counts().idxmax()

top_IN_occupation

'Prof-specialty'

In [36]:
highest_earning_country = (df[df['salary'] == '>50k']['native-country'].value_counts()/ df['native-country'].value_counts() * 100).sort_values(ascending=False).fillna(0).idxmax()

highest_earning_country

'United-States'

In [37]:
highest_earning_country

'United-States'

In [39]:
# freecodecamp file to upload in the project:


def calculate_demographic_data(print_data=True):
    # Read data from file
    df = pd.read_csv('adult.data.csv')

    # How many of each race are represented in this dataset? This should be a Pandas series with race names as the index labels.
    race_count = df['race'].value_counts()

    # What is the average age of men?
    age_mean = df.groupby('sex', as_index=False).age.mean()
    average_age_men = age_mean.iloc[1].age

    # What is the percentage of people who have a Bachelor's degree?
    percentage_bachelors = (5355/32561) * 100

    # What percentage of people with advanced education (`Bachelors`, `Masters`, or `Doctorate`) make more than 50K?
    # What percentage of people without advanced education make more than 50K?

    # with and without `Bachelors`, `Masters`, or `Doctorate`
    higher_education = df[df['education'].isin(['Bachelors','Masters', 'Doctorate'])]
    lower_education = df[~df['education'].isin(['Bachelors', 'Masters', 'Doctorate'])]
    

    # percentage with salary >50K
    higher_education_rich = higher_education[higher_education['salary']== '>50K']['salary'].count()  / higher_education.shape[0]  * 100
    lower_education_rich = lower_education[lower_education['salary']== '>50K']['salary'].count()  / higher_education.shape[0]  * 100

    # What is the minimum number of hours a person works per week (hours-per-week feature)?
    min_work_hours = df['hours-per-week'].min()

    # What percentage of the people who work the minimum number of hours per week have a salary of >50K?
    num_min_workers = df[df['hours-per-week'] == 1]['hours-per-week'].count()
    
    rich_percent_num = num_min_workers[num_min_workers['salary'] == '>50K']['salary'].count()
    
    rich_percentage = rich_percent_num / num_min_workers * 100

    # What country has the highest percentage of people that earn >50K?
    rich = df[df['salary'].isin(['>50K'])]
    
    highest_earning_country = rich['native-country'].value_counts().idxmax()
    total = df[df['native-country'] == 'United-States']['native-country'].count()
    num = rich[rich['native-country']== 'United-States']['native-country'].count()
    
    
    highest_earning_country_percentage = (num/total) * 100

    # Identify the most popular occupation for those who earn >50K in India.
    india_rich = rich[rich['native-country'] == 'India']
    
    top_IN_occupation = india_rich['occupation'].value_counts().idxmax()

    # DO NOT MODIFY BELOW THIS LINE

    if print_data:
        print("Number of each race:\n", race_count) 
        print("Average age of men:", average_age_men)
        print(f"Percentage with Bachelors degrees: {percentage_bachelors}%")
        print(f"Percentage with higher education that earn >50K: {higher_education_rich}%")
        print(f"Percentage without higher education that earn >50K: {lower_education_rich}%")
        print(f"Min work time: {min_work_hours} hours/week")
        print(f"Percentage of rich among those who work fewest hours: {rich_percentage}%")
        print("Country with highest percentage of rich:", highest_earning_country)
        print(f"Highest percentage of rich people in country: {highest_earning_country_percentage}%")
        print("Top occupations in India:", top_IN_occupation)

    return {
        'race_count': race_count,
        'average_age_men': average_age_men,
        'percentage_bachelors': percentage_bachelors,
        'higher_education_rich': higher_education_rich,
        'lower_education_rich': lower_education_rich,
        'min_work_hours': min_work_hours,
        'rich_percentage': rich_percentage,
        'highest_earning_country': highest_earning_country,
        'highest_earning_country_percentage':
        highest_earning_country_percentage,
        'top_IN_occupation': top_IN_occupation
    }
