In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set_style('darkgrid')

In [2]:
df = pd.read_csv('adult.data.csv')
df.head(2)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K


In [3]:
# checking the data rows and columns number
df.shape

(32561, 15)

In [4]:
# checking the data info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


> data types are fine

In [10]:
# checking duplicated values
print("Number of duplicated values is: ",df.duplicated().sum())

Number of duplicated values is:  24


In [9]:
# checking null values
df.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
salary            0
dtype: int64

> there is no null data in this dataset

In [16]:
def data_preprocessing(data):
    # removing the duplicated values 
    new_df = data[data.duplicated() != True]
    
    # fixing columns name ( replace '-' with '_')
    new_df.columns = [column.replace('-', '_') for column in new_df.columns]
    
    # returning the new data_frame
    return new_df
demographic = data_preprocessing(df)

## Answering some data questions 

In [51]:
demographic.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'salary'],
      dtype='object')

In [100]:
def calculate_demographic_data(print_data=True, data= None):
    # Read data from file
    df = data
    
    # How many of each race are represented in this dataset? This should be a Pandas series with race names as the index labels.
    race_count = df.race.value_counts()

    # What is the average age of men?
    average_age_men = round(demographic[demographic.sex == 'Male'].age.mean(), 0)

    # What is the percentage of people who have a Bachelor's degree?
    num_ppl_w_ba = demographic.query('education == "Bachelors"').shape[0]
    percentage_bachelors = round(num_ppl_w_ba/demographic.shape[0] *100 , 0)

    # What percentage of people with advanced education (`Bachelors`, `Masters`, or `Doctorate`) make more than 50K?
    # What percentage of people without advanced education make more than 50K?

    # with and without `Bachelors`, `Masters`, or `Doctorate`
    # with adv edu
    higher_education = demographic.query('education == "Bachelors" or education == "Masters" or education == "Doctorate"')
    num_adv_eppl_make_over_50k = higher_education.query('salary == ">50K"').shape[0]
    
    # without adv edu
    lower_education = demographic.query('education != "Bachelors" or education != "Masters" or education != "Doctorate"')
    num_none_adv_eppl_make_over_50k = lower_education.query('salary == ">50K"').shape[0]
    
    # percentage with salary >50K
    higher_education_rich = round((num_adv_eppl_make_over_50k / higher_education.shape[0]) *100, 0)
    lower_education_rich = round((num_none_adv_eppl_make_over_50k / lower_education.shape[0]) *100, 0)

    # What is the minimum number of hours a person works per week (hours-per-week feature)?
    min_work_hours = None

    # What percentage of the people who work the minimum number of hours per week have a salary of >50K?
    num_min_workers = None

    rich_percentage = None

    # What country has the highest percentage of people that earn >50K?
    highest_earning_country = None
    highest_earning_country_percentage = None

    # Identify the most popular occupation for those who earn >50K in India.
    top_IN_occupation = None

    # DO NOT MODIFY BELOW THIS LINE

    if print_data:
        # print("Number of each race: \n", race_count, sep="") 
        # print("Average age of men:", average_age_men)
        # print(f"Percentage with Bachelors degrees: {percentage_bachelors}%")
        print(f"Percentage with higher education that earn >50K: {higher_education_rich}%")
        print(f"Percentage without higher education that earn >50K: {lower_education_rich}%")
        # print(f"Min work time: {min_work_hours} hours/week")
        # print(f"Percentage of rich among those who work fewest hours: {rich_percentage}%")
        # print("Country with highest percentage of rich:", highest_earning_country)
        # print(f"Highest percentage of rich people in country: {highest_earning_country_percentage}%")
        # print("Top occupations in India:", top_IN_occupation)

    # return {
    #     'race_count': race_count,
    #     'average_age_men': average_age_men,
    #     'percentage_bachelors': percentage_bachelors,
    #     'higher_education_rich': higher_education_rich,
    #     'lower_education_rich': lower_education_rich,
    #     'min_work_hours': min_work_hours,
    #     'rich_percentage': rich_percentage,
    #     'highest_earning_country': highest_earning_country,
    #     'highest_earning_country_percentage':
    #     highest_earning_country_percentage,
    #     'top_IN_occupation': top_IN_occupation
    # }

calculate_demographic_data(data = demographic , print_data=True)

Percentage with higher education that earn >50K: 47.0%
Percentage without higher education that earn >50K: 24.0%


## Test cells

In [61]:
demographic.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'salary'],
      dtype='object')