In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px

### Import and General Data Info

In [23]:
# Load the dataset from a CSV file; the path is relative to the notebook's working directory.
df = pd.read_csv('./datasets/adult.data.csv')
# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [24]:
# Print the column names, non-null counts, dtypes and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [25]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [26]:
# List the column labels; several contain hyphens, so they need bracket access (df['col-name']).
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

### Insights

In [27]:
# Number of people of each race represented in the dataset, most frequent first.
df['race'].value_counts()

White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: race, dtype: int64

In [28]:
# Number of people of each sex represented in the dataset, most frequent first.
df['sex'].value_counts()

Male      21790
Female    10771
Name: sex, dtype: int64

In [29]:
# Mean age over the rows whose `sex` is 'Male'.
is_male = df['sex'] == 'Male'
df.loc[is_male, 'age'].mean()

39.43354749885268

In [30]:
# Share of people (in percent) whose `education` is exactly 'Bachelors'.
total_rows = len(df)
bachelors_count = df['education'].eq('Bachelors').sum()
bachelors_count / total_rows * 100

16.44605509658794

In [31]:
# Percentage of people with advanced education (Bachelors, Masters, or Doctorate) who make more than 50K

# A single membership mask replaces the separate per-degree filters. Note that
# `education == ('Bachelors' or 'Masters' or 'Doctorate')` would NOT work: the
# parenthesised `or` chain evaluates to just 'Bachelors'.
is_advanced_educated = df['education'].isin(['Bachelors', 'Masters', 'Doctorate'])
is_high_earner = df['salary'] == '>50K'

# Denominator: everyone holding one of the three degrees.
num_high_advanced_educated = df.loc[is_advanced_educated, 'salary'].count()
# Numerator: the subset of those people whose salary is '>50K'.
num_high_advanced_educated_earners = df.loc[is_advanced_educated & is_high_earner, 'salary'].count()

percent_high_advanced_educated_earners = num_high_advanced_educated_earners / num_high_advanced_educated * 100
percent_high_advanced_educated_earners

46.535843011613935

In [32]:
# Percentage of people without advanced education who make more than 50K

# "Without advanced education" = education is none of Bachelors / Masters / Doctorate.
is_non_advanced_educated = ~df['education'].isin(['Bachelors', 'Masters', 'Doctorate'])
is_high_earner = df['salary'] == '>50K'

# NOTE: the misspelled name `num_non_advanced_edcated` is kept as-is because the next cell reads it.
num_non_advanced_edcated = df.loc[is_non_advanced_educated, 'salary'].count()

num_high_non_advanced_educated_earners = df.loc[is_non_advanced_educated & is_high_earner, 'salary'].count()

percent_high_non_advanced_educated_earners = num_high_non_advanced_educated_earners / num_non_advanced_edcated * 100
percent_high_non_advanced_educated_earners

17.3713601914639

In [33]:
# Percentage of people without advanced education whose salary is not '>50K'

# "Without advanced education" = education is none of Bachelors / Masters / Doctorate.
is_non_advanced_educated = ~df['education'].isin(['Bachelors', 'Masters', 'Doctorate'])
is_low_earner = df['salary'] != '>50K'

num_low_non_advanced_educated_earners = df.loc[is_non_advanced_educated & is_low_earner, 'salary'].count()

# The denominator `num_non_advanced_edcated` (sic) is defined in the preceding cell, which must be run first.
percent_low_non_advanced_educated_earners = num_low_non_advanced_educated_earners / num_non_advanced_edcated * 100

percent_low_non_advanced_educated_earners

82.6286398085361

In [34]:
# Minimum number of hours a person works per week.
# Kept in a variable because the minimum-hours salary analysis further down filters on it.
min_work_hours = df['hours-per-week'].min()
min_work_hours

1

##### Additional Insights

In [35]:
# Maximum number of hours a person works per week (largest value in the `hours-per-week` column).
df['hours-per-week'].max()

99

In [36]:
# Average number of hours a person works per week (arithmetic mean of the `hours-per-week` column).
df['hours-per-week'].mean()

40.437455852092995

#### Back to the main topic

In [37]:
# Among the people who work the minimum number of hours per week, the percentage earning more than 50K.
# Despite their `num_` prefix, the two filtered variables below hold DataFrames (the matching rows), not counts.
# (`aslary` in the result name is a typo of `salary`; the name is left unchanged.)
works_min_hours = df['hours-per-week'] == min_work_hours
earns_over_50k = df['salary'] == '>50K'

num_min_workers = df[works_min_hours]
num_min_workers_salary_over50 = df[works_min_hours & earns_over_50k]

min_workers_total = num_min_workers['salary'].count()
min_workers_over50_total = num_min_workers_salary_over50['salary'].count()

percent_min_workers_aslary_over50 = min_workers_over50_total / min_workers_total * 100
percent_min_workers_aslary_over50

10.0

In [38]:
# Percentage of people earning salary over 50k by country

# Numerator: number of '>50K' earners per country. A country with no such
# earner does not appear in this Series at all.
num_salary_over50_by_country = df[df['salary'] == '>50K'].groupby('native-country')['salary'].count()

# Denominator: total number of people per country.
num_by_country = df['native-country'].value_counts()

# A plain `/` would align the two Series and yield NaN for every country that
# is missing from the numerator. `fill_value=0` treats the missing count as 0,
# so those countries correctly report 0.0 % instead of NaN.
percent_salary_over50_by_country = num_salary_over50_by_country.div(num_by_country, fill_value=0) * 100
percent_salary_over50_by_country.sort_values(ascending=False)

Iran                          41.860465
France                        41.379310
India                         40.000000
Taiwan                        39.215686
Japan                         38.709677
Yugoslavia                    37.500000
Cambodia                      36.842105
Italy                         34.246575
England                       33.333333
Canada                        32.231405
Germany                       32.116788
Philippines                   30.808081
Hong                          30.000000
Greece                        27.586207
China                         26.666667
Cuba                          26.315789
?                             25.042882
Scotland                      25.000000
United-States                 24.583476
Hungary                       23.076923
Ireland                       20.833333
South                         20.000000
Poland                        20.000000
Thailand                      16.666667
Ecuador                       14.285714


In [39]:
# Country with the highest percentage of people earning >50K, followed by that percentage.
top_country = percent_salary_over50_by_country.idxmax()
top_percentage = percent_salary_over50_by_country.max()
print(top_country, top_percentage)

Iran 41.86046511627907


In [40]:
# Most frequent occupation among rows with salary '>50K' and native-country 'India'.
earns_over_50k = df['salary'] == '>50K'
native_country_is_india = df['native-country'] == 'India'

top_demographics_over50_india = df[earns_over_50k & native_country_is_india]
top_demographics_over50_india['occupation'].value_counts().idxmax()

'Prof-specialty'

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=67830035-8b85-459b-8213-52c1ff5a6b3a' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>