In [1]:
# External imports
import pandas as pd

In [2]:
# Load the adult dataset CSV (relative path, resolved against the working directory)
df = pd.read_csv("adult.data.csv")
# Preview the first five rows to sanity-check the parse
df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# Show the dtype pandas assigned to each column of the loaded frame
df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
salary            object
dtype: object

In [4]:
# For every column, report whether at least one value is missing
df.isnull().any(axis=0)

age               False
workclass         False
fnlwgt            False
education         False
education-num     False
marital-status    False
occupation        False
relationship      False
race              False
sex               False
capital-gain      False
capital-loss      False
hours-per-week    False
native-country    False
salary            False
dtype: bool

In [58]:
# Count the distinct race labels present in the data
race_labels = df["race"].unique()
number_races = len(race_labels)
print(f"The number of races is: {number_races}")
# Rows per race label, largest group first
df.groupby("race")["race"].count().sort_values(ascending=False)

The number of races is: 5


race
White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: race, dtype: int64

In [6]:
# Print the mean age separately for each of the two sex labels
for sex in ("Male", "Female"):
    ages_for_sex = df.loc[df["sex"] == sex, "age"]
    print(f"The average age of {sex} is: {ages_for_sex.mean()}")

The average age of Male is: 39.43354749885268
The average age of Female is: 36.85823043357163


In [7]:
# Share of rows whose education label is exactly "Bachelors"
is_bachelors = df["education"] == "Bachelors"
educa = df.loc[is_bachelors, "education"]
bachelors_count = educa.count()
education_count = df["education"].count()
print(f"The percentage of people who have a Bachelor's degree is {bachelors_count*100/education_count}%")

The percentage of people who have a Bachelor's degree is 16.446055096587944%


In [65]:
# Mean education-num within each education label, to see how the
# text labels line up with the numeric column
df.groupby("education")["education-num"].agg("mean")

education
10th             6.0
11th             7.0
12th             8.0
1st-4th          2.0
5th-6th          3.0
7th-8th          4.0
9th              5.0
Assoc-acdm      12.0
Assoc-voc       11.0
Bachelors       13.0
Doctorate       16.0
HS-grad          9.0
Masters         14.0
Preschool        1.0
Prof-school     15.0
Some-college    10.0
Name: education-num, dtype: float64

In [8]:
# Split the salary labels by education level. "Advanced" here means
# education-num >= 13; going by the education/education-num table shown
# in this notebook that is Bachelors, Masters, Prof-school and Doctorate.
is_advanced = df["education-num"] >= 13
is_not_advanced = df["education-num"] < 13
hi_edu = df.loc[is_advanced, "salary"]
lo_edu = df.loc[is_not_advanced, "salary"]

# Number of ">50K" rows in each group, then that number as a percentage
# of the group's size
hi_edu_rich = hi_edu[hi_edu == ">50K"].count()
lo_edu_rich = lo_edu[lo_edu == ">50K"].count()
hi_edu_per = hi_edu_rich*100/hi_edu.count()
lo_edu_per = lo_edu_rich*100/lo_edu.count()

print(f"The percentage of people with advanced education that make more than 50K is {hi_edu_per}%")
print(f"The percentage of people without advanced education that make more than 50K is {lo_edu_per}%")

The percentage of people with advanced education that make more than 50K is 48.45667534399405%
The percentage of people without advanced education that make more than 50K is 16.052910916959256%


In [9]:
# Smallest value recorded in the hours-per-week column
hours_per_week = df["hours-per-week"]
min_hours = hours_per_week.min()
print(f"The minimum number of hours a person works per week is {min_hours}")

The minimum number of hours a person works per week is 1


In [71]:
# Salary labels of the people whose hours-per-week equals min_hours
# (min_hours is not computed here; it comes from the preceding cell)
works_min_hours = df["hours-per-week"] == min_hours
peo_h_min = df.loc[works_min_hours, "salary"]
# Percentage of that group labelled ">50K"
rich_min_count = peo_h_min[peo_h_min == ">50K"].count()
min_h_per = rich_min_count*100/peo_h_min.count()
print(f"The percentage of the people who work the minimum number of hours per week have a salary of >50K is {min_h_per}%")

The percentage of the people who work the minimum number of hours per week have a salary of >50K is 10.0%


In [31]:
# Per native-country fraction of people whose salary label is ">50K".
# The numerator only contains countries with at least one ">50K" row, so
# the index-aligned division yields NaN for countries that have none.
rich_per_country = df[df.salary == ">50K"].groupby("native-country").salary.count()
people_per_country = df.groupby("native-country").salary.count()
country_hi_per = rich_per_country/people_per_country

# idxmax skips NaN entries and returns the first label holding the maximum,
# replacing the manual "mask on max, take index[0]" lookup.
country_hi = country_hi_per.idxmax()

print(f"The country with the highest percentage of people that earn >50K is {country_hi} with a {country_hi_per[country_hi]*100}%")

The country with the highest percentage of people that earn >50K is Iran with a 41.86046511627907%


In [48]:
# Rows for people whose native-country is "India" and whose salary label is ">50K",
# counted per occupation.
india_rich = (df["native-country"] == "India") & (df.salary == ">50K")
occu_max_num = df[india_rich].groupby("occupation").occupation.count()
# idxmax returns the first occupation label with the highest count,
# replacing the manual "mask on max, take index[0]" lookup.
occu_max = occu_max_num.idxmax()
print(f"The most popular occupation for those who earn >50K in India is {occu_max}")

The most popular occupation for those who earn >50K in India is Prof-specialty
