In [98]:
import pandas as pd


def calculate_demographic_data(print_data=True):
    """Compute demographic summary statistics from ``adult.data.csv``.

    The CSV is read from the current working directory and must provide the
    columns used below: ``age``, ``workclass``, ``education``, ``occupation``,
    ``race``, ``sex``, ``hours-per-week``, ``native-country`` and ``salary``.

    Parameters
    ----------
    print_data : bool, default True
        When true, print the exploratory dump (columns, ``describe()``, the
        India subset) followed by the computed statistics.

    Returns
    -------
    dict
        The computed statistics keyed by ``race_count``, ``average_age_men``,
        ``percentage_bachelors``, ``higher_education_rich``,
        ``lower_education_rich``, ``min_work_hours``, ``rich_percentage``,
        ``highest_earning_country``, ``highest_earning_country_percentage``
        and ``top_IN_occupation``. Percentages are on a 0-100 scale.
    """
    # Read data from file
    df = pd.read_csv("adult.data.csv")

    # Boolean masks shared by several questions; built once and reused.
    is_rich = df['salary'] == '>50K'
    from_india = df['native-country'] == 'India'
    has_advanced_degree = df['education'].isin(['Bachelors', 'Masters', 'Doctorate'])

    # How many of each race are represented in this dataset?
    # value_counts() already returns a Series indexed by race name.
    race_count = df['race'].value_counts()

    # What is the average age of men?
    average_age_men = round(df.loc[df['sex'] == 'Male', 'age'].mean(), 1)

    # What is the percentage of people who have a Bachelor's degree?
    # The mean of a boolean mask is the fraction of True values.
    percentage_bachelors = round((df['education'] == 'Bachelors').mean() * 100, 1)

    # What percentage of people with / without advanced education
    # (`Bachelors`, `Masters`, or `Doctorate`) make more than 50K?
    higher_education_rich = round(is_rich[has_advanced_degree].mean() * 100, 1)
    lower_education_rich = round(is_rich[~has_advanced_degree].mean() * 100, 1)

    # What is the minimum number of hours a person works per week?
    min_work_hours = df['hours-per-week'].min()

    # What percentage of the people who work the minimum number of hours per
    # week have a salary of >50K? (Rounded to a whole number; 0 if nobody matches.)
    works_min_hours = df['hours-per-week'] == min_work_hours
    rich_percentage = round(is_rich[works_min_hours].mean() * 100) if works_min_hours.any() else 0

    # What country has the highest percentage of people that earn >50K?
    # Averaging the boolean mask per country gives the >50K share directly;
    # countries without any >50K earner get 0.0 rather than NaN.
    rich_share_by_country = is_rich.groupby(df['native-country']).mean()
    highest_earning_country = rich_share_by_country.idxmax()
    highest_earning_country_percentage = round(rich_share_by_country.max() * 100, 1)

    # Identify the most popular occupation for those who earn >50K in India.
    # mode() is empty when no such rows exist, so fall back to None instead of
    # raising IndexError.
    india_rich_occupations = df.loc[is_rich & from_india, 'occupation'].mode()
    top_IN_occupation = india_rich_occupations.iloc[0] if not india_rich_occupations.empty else None

    # ---------------------
    # TEST (exploratory values, only used by the printout below)
    # .sum() counts the True entries of the mask; .count() would count every
    # non-null row and report the size of the whole dataset.
    india = from_india.sum()
    list_india = df.loc[from_india]
    num_workclass_types = list_india['workclass'].nunique()  # number of workclass
    india_workclasses = list_india['workclass'].unique()  # list of workclass

    # ---------------------
    # DO NOT MODIFY BELOW THIS LINE

    if print_data:
        print("\nColumn\n",df.columns,"\n")

        print("Describe\n",df.describe(),"\n")
        print("\nSize",df.size,"\n")
        print("India total",india,"\n")
        print("Des\n",list_india,"\n")
        print("How many work class in india\n",num_workclass_types,"\n")
        print("List of work class\n",india_workclasses,"\n")
        print("Number of each race:\n", race_count) 
        print("Average age of men:", average_age_men)
        print(f"Percentage with Bachelors degrees: {percentage_bachelors}%")
        print(f"Percentage with higher education that earn >50K: {higher_education_rich}%")
        print(f"Percentage without higher education that earn >50K: {lower_education_rich}%")
        print(f"Min work time: {min_work_hours} hours/week")
        print(f"Percentage of rich among those who work fewest hours: {rich_percentage}%")
        print("Country with highest percentage of rich:", highest_earning_country)
        print(f"Highest percentage of rich people in country: {highest_earning_country_percentage}%")
        print("Top occupations in India:", top_IN_occupation)

    # Hand the results back so callers (and print_data=False runs) can use them.
    return {
        'race_count': race_count,
        'average_age_men': average_age_men,
        'percentage_bachelors': percentage_bachelors,
        'higher_education_rich': higher_education_rich,
        'lower_education_rich': lower_education_rich,
        'min_work_hours': min_work_hours,
        'rich_percentage': rich_percentage,
        'highest_earning_country': highest_earning_country,
        'highest_earning_country_percentage': highest_earning_country_percentage,
        'top_IN_occupation': top_IN_occupation,
    }


# Run the analysis and print the full report; the trailing ';' keeps the
# notebook from echoing the call's return value underneath the printout.
calculate_demographic_data(print_data=True);


Column
 Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object') 

Describe
                 age        fnlwgt  education-num  capital-gain  capital-loss  \
count  32561.000000  3.256100e+04   32561.000000  32561.000000  32561.000000   
mean      38.581647  1.897784e+05      10.080679   1077.648844     87.303830   
std       13.640433  1.055500e+05       2.572720   7385.292085    402.960219   
min       17.000000  1.228500e+04       1.000000      0.000000      0.000000   
25%       28.000000  1.178270e+05       9.000000      0.000000      0.000000   
50%       37.000000  1.783560e+05      10.000000      0.000000      0.000000   
75%       48.000000  2.370510e+05      12.000000      0.000000      0.000000   
max       90.000000  1.484705e+06      16.000000  99999.000000   4356.000000   

      

In [None]:
# Preview the first few rows of the raw dataset.
# The frame is read here because the `df` used by calculate_demographic_data
# is local to that function and is not available at notebook scope.
PREVIEW_ROWS = 5
pd.read_csv("adult.data.csv").head(PREVIEW_ROWS)
