In [1]:
import numpy as np
import pandas as pd

# Show up to 100 columns when a DataFrame is rendered.
# Use the full registered option name: pandas treats the key as a regular
# expression, so a near-miss spelling only works while it happens to match
# exactly one option.
pd.set_option("display.max_columns", 100)
# render matplotlib figures inline in the notebook output
%matplotlib inline
# we don't like warnings
# to see them again, comment out `import warnings` and the
# warnings filter call at the end of this cell
import warnings

import matplotlib.pyplot as plt
import seaborn as sns

# Install a blanket "ignore" filter: no category or module is given, so it
# applies to every warning (including deprecation notices) raised afterwards.
warnings.filterwarnings("ignore")

In [2]:
# Base URL of the raw `data/` folder in the Yorko/mlcourse.ai GitHub repository (main branch).
DATA_URL = "https://raw.githubusercontent.com/Yorko/mlcourse.ai/main/data/"

In [3]:
# Read adult.data.csv from the remote data folder into a DataFrame,
# then preview its first five rows.
data = pd.read_csv(f"{DATA_URL}adult.data.csv")
data.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Number of rows for each value of the `sex` column.
sex_counts = data['sex'].value_counts()
sex_counts

sex
Male      21790
Female    10771
Name: count, dtype: int64

In [5]:
# Mean age over the rows where `sex` is 'Female'
# (row mask and column picked in a single .loc lookup).
data.loc[data['sex'] == 'Female', 'age'].mean()

np.float64(36.85823043357163)

In [7]:
# Share of rows (in percent) whose `native-country` is 'Germany'.
is_german = data['native-country'] == 'Germany'
german_count = int(is_german.sum())  # True counts as 1, so the sum is the row count
total = len(data)
percentage_germans = (german_count / total) * 100
percentage_germans

0.42074874850281013

In [8]:
# Mean and standard deviation of `age` within each `salary` group,
# written as a named aggregation (output column = aggregation function).
data.groupby('salary')['age'].agg(mean='mean', std='std')

Unnamed: 0_level_0,mean,std
salary,Unnamed: 1_level_1,Unnamed: 2_level_1
<=50K,36.783738,14.020088
>50K,44.249841,10.519028


In [9]:
# Education levels treated as "higher education" for this check.
higher_ed = ['Bachelors', 'Prof-school', 'Assoc-acdm', 'Assoc-voc', 'Masters', 'Doctorate']

# Counterexamples to the claim "everyone earning >50K has one of the
# education levels above": high earners whose education is not in the list.
earns_over_50k = data['salary'] == '>50K'
lacks_higher_ed = ~data['education'].isin(higher_ed)
invalid = data[earns_over_50k & lacks_higher_ed]

# True only when there are no counterexamples, i.e. the claim holds.
len(invalid) == 0

False

In [10]:
# Age statistics for every (race, sex) group.
# A notebook cell renders only its last expression, so this table has to be
# shown explicitly; as a bare expression in the middle of the cell its result
# would be computed and then thrown away. `display` is the built-in that
# IPython/Jupyter provides in every notebook session.
age_stats_by_race_sex = data.groupby(['race', 'sex'])['age'].describe()
display(age_stats_by_race_sex)

# Max age of Amer-Indian-Eskimo men (last expression, rendered as the cell output)
data.loc[(data['race'] == 'Amer-Indian-Eskimo') & (data['sex'] == 'Male'), 'age'].max()

np.int64(82)

In [12]:
# Work on an independent copy of the male rows so that adding a column below
# does not touch `data`.
data_men = data[data['sex'] == 'Male'].copy()

# Label each man 'Married' when his marital-status starts with 'Married',
# otherwise 'Single'. The vectorized .str accessor replaces a per-row Python
# lambda; a missing marital-status stays missing (and is therefore left out of
# the groupby below) instead of raising AttributeError.
data_men['married'] = (
    data_men['marital-status']
    .str.startswith('Married')
    .map({True: 'Married', False: 'Single'})
)

# Within each marital group, the fraction of men whose salary is '>50K'.
data_men.groupby('married')['salary'].value_counts(normalize=True).unstack()['>50K']

married
Married    0.440514
Single     0.084495
Name: >50K, dtype: float64

In [14]:
# Largest value found in the `hours-per-week` column.
hours_per_week = data['hours-per-week']
max_hours = hours_per_week.max()

# Rows that report exactly that maximum, how many there are, and the
# percentage of them whose salary is '>50K'.
workers_max_hours = data[hours_per_week == max_hours]
num_max_workers = len(workers_max_hours)
earns_over_50k = workers_max_hours['salary'] == '>50K'
percent_earning_50k = earns_over_50k.mean() * 100

max_hours, num_max_workers, percent_earning_50k

(np.int64(99), 85, np.float64(29.411764705882355))

In [16]:
# Mean `hours-per-week` for every (native-country, salary) pair, reshaped so
# that countries are rows and the salary groups are columns.
country_hours = (
    data.groupby(['native-country', 'salary'])['hours-per-week']
    .mean()
    .unstack(level='salary')
)

# Average working hours in Japan, one value per salary group
country_hours.loc['Japan', :]

salary
<=50K    41.000000
>50K     47.958333
Name: Japan, dtype: float64