# UCI Adult Income Dataset - Exploratory and Descriptive Analysis
This notebook is focused on the exploratory and descriptive analysis of the cleaned UCI Adult Income dataset.

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os
import plotly.express as px

## Define and Create Paths

In [3]:
# The project root is taken to be the parent of the current working directory
current_dir = os.getcwd()
project_root_dir = os.path.dirname(current_dir)

# Data folders: <root>/data, <root>/data/raw and <root>/data/processed
data_dir = os.path.join(project_root_dir, 'data')
raw_dir, processed_dir = (
    os.path.join(data_dir, subfolder) for subfolder in ('raw', 'processed')
)

# Output folders: <root>/results and <root>/docs
results_dir, docs_dir = (
    os.path.join(project_root_dir, subfolder) for subfolder in ('results', 'docs')
)

# Make sure every folder exists; exist_ok avoids an error when it is already there
for directory in (raw_dir, processed_dir, results_dir, docs_dir):
    os.makedirs(directory, exist_ok=True)

## Read in the data

In [4]:
# Load the cleaned dataset from the processed-data folder and preview the first 10 rows
adult_data_filename = os.path.join(processed_dir, "adult_cleaned.csv")
adult_df = pd.read_csv(filepath_or_buffer=adult_data_filename)
adult_df.head(n=10)

Unnamed: 0,age,workclass,fnlwgt,education_num,marital_status,relationship,race,sex,capital_gain,capital_loss,hours_per_week,income,education_level,occupation_grouped,native_region,age_group
0,39,government,77516,13,single,not-in-family,white,male,2174,0,40,<=50k,tertiary,white collar,north america,36-45
1,50,self-employed,83311,13,married,husband,white,male,0,0,13,<=50k,tertiary,white collar,north america,46-60
2,38,private,215646,9,divorced or separated,not-in-family,white,male,0,0,40,<=50k,secondary-school graduate,blue collar,north america,36-45
3,53,private,234721,7,married,husband,black,male,0,0,40,<=50k,secondary,blue collar,north america,46-60
4,28,private,338409,13,married,wife,black,female,0,0,40,<=50k,tertiary,white collar,central america,26-35
5,37,private,284582,14,married,wife,white,female,0,0,40,<=50k,tertiary,white collar,north america,36-45
6,49,private,160187,5,divorced or separated,not-in-family,black,female,0,0,16,<=50k,secondary,service,central america,46-60
7,52,self-employed,209642,9,married,husband,white,male,0,0,45,>50k,secondary-school graduate,white collar,north america,46-60
8,31,private,45781,14,single,not-in-family,white,female,14084,0,50,>50k,tertiary,white collar,north america,26-35
9,42,private,159449,13,married,husband,white,male,5178,0,40,>50k,tertiary,white collar,north america,36-45


## Check the shape of the dataset and datatypes

In [5]:
# Dimensions of the loaded frame as (number of rows, number of columns)
adult_df.shape

(32515, 16)

In [6]:
# Print each column's name, non-null count and dtype
adult_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32515 entries, 0 to 32514
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   age                 32515 non-null  int64 
 1   workclass           32515 non-null  object
 2   fnlwgt              32515 non-null  int64 
 3   education_num       32515 non-null  int64 
 4   marital_status      32515 non-null  object
 5   relationship        32515 non-null  object
 6   race                32515 non-null  object
 7   sex                 32515 non-null  object
 8   capital_gain        32515 non-null  int64 
 9   capital_loss        32515 non-null  int64 
 10  hours_per_week      32515 non-null  int64 
 11  income              32515 non-null  object
 12  education_level     32515 non-null  object
 13  occupation_grouped  32515 non-null  object
 14  native_region       32515 non-null  object
 15  age_group           32515 non-null  object
dtypes: int64(6), object(10

### Summary Statistics
#### Numerical Variables

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) with default settings
numeric_summary = adult_df.describe()
numeric_summary

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32515.0,32515.0,32515.0,32515.0,32515.0,32515.0
mean,38.590374,189791.2,10.081593,1079.173428,87.427341,40.441089
std,13.638535,105576.6,2.571943,7390.403187,403.231777,12.34983
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117830.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237047.5,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


#### Categorical Variables

In [8]:
# Summary of the object-typed columns: count, number of unique values, top value and its frequency
categorical_summary = adult_df.describe(include='object')
categorical_summary

Unnamed: 0,workclass,marital_status,relationship,race,sex,income,education_level,occupation_grouped,native_region,age_group
count,32515,32515,32515,32515,32515,32515,32515,32515,32515,32515
unique,7,4,6,5,2,2,7,5,6,7
top,private,married,husband,white,male,<=50k,secondary-school graduate,white collar,north america,26-35
freq,22652,14984,13178,27773,21760,24679,10485,16533,30019,8501


In [9]:
# Absolute number of records per workclass category
workclass_counts = adult_df['workclass'].value_counts()
workclass_counts

private          22652
self-employed     3656
government        2257
local-gov         2093
unknown           1836
voluntary           14
unemployed           7
Name: workclass, dtype: int64

In [10]:
# Proportion of records per marital status (normalize=True returns fractions, not counts)
marital_status_shares = adult_df['marital_status'].value_counts(normalize=True)
marital_status_shares

married                  0.460833
single                   0.327664
divorced or separated    0.180963
widowed                  0.030540
Name: marital_status, dtype: float64

In [11]:
# Proportion of records per relationship category
relationship_shares = adult_df['relationship'].value_counts(normalize=True)
relationship_shares

husband           0.405290
not-in-family     0.254775
own-child         0.155590
unmarried         0.105951
wife              0.048224
other-relative    0.030171
Name: relationship, dtype: float64

In [12]:
# Proportion of records per race category
race_shares = adult_df['race'].value_counts(normalize=True)
race_shares

white                        0.854160
black                        0.096017
asian or pacific islander    0.031924
american indian or eskimo    0.009565
other                        0.008335
Name: race, dtype: float64

## Income Distribution

In [13]:
# Count the records in each income bracket, then turn the result into a
# two-column frame (income, total) for plotting
income_group_sizes = adult_df.groupby('income').size()
adult_df_income = income_group_sizes.reset_index(name='total')
adult_df_income

Unnamed: 0,income,total
0,<=50k,24679
1,>50k,7836


In [15]:
pip install -U kaleido

Note: you may need to restart the kernel to use updated packages.


In [16]:
pip install -U plotly

Note: you may need to restart the kernel to use updated packages.


In [17]:
# Pie chart of the record totals per income bracket
fig = px.pie(
    adult_df_income,
    names='income',
    values='total',
    title='Overall Income Distribution',
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig.show()