In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('adult.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [2]:
df.dtypes

age                 int64
workclass          object
fnlwgt              int64
education          object
educational-num     int64
marital-status     object
occupation         object
relationship       object
race               object
gender             object
capital-gain        int64
capital-loss        int64
hours-per-week      int64
native-country     object
income             object
dtype: object

In [3]:
# First we look for missing values
df.isnull().sum()

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

In [4]:
# Replace '?' with nan
df.replace('?', np.nan, inplace = True)
df.isnull().sum()

age                   0
workclass          1836
fnlwgt                0
education             0
educational-num       0
marital-status        0
occupation         1843
relationship          0
race                  0
gender                0
capital-gain          0
capital-loss          0
hours-per-week        0
native-country      583
income                0
dtype: int64

In [5]:
df.describe(include=['object'])

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,gender,native-country,income
count,30725,32561,32561,30718,32561,32561,32561,31978,32561
unique,8,16,7,14,6,5,2,41,2
top,Private,HS-grad,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,<=50K
freq,22696,10501,14976,4140,13193,27816,21790,29170,24720


In [6]:
# Replace nan value for column workclass, occupation & native-country
df['workclass'].replace(np.nan, 'Private', inplace = True)
df['occupation'].replace(np.nan, 'Prof-specialty', inplace = True)
df['native-country'].replace(np.nan, 'United-States', inplace = True)

df.isnull().sum()

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

In [7]:
# Remove duplicate data
print(df.count())

df = df.drop_duplicates()
print(df.count())

age                32561
workclass          32561
fnlwgt             32561
education          32561
educational-num    32561
marital-status     32561
occupation         32561
relationship       32561
race               32561
gender             32561
capital-gain       32561
capital-loss       32561
hours-per-week     32561
native-country     32561
income             32561
dtype: int64
age                32537
workclass          32537
fnlwgt             32537
education          32537
educational-num    32537
marital-status     32537
occupation         32537
relationship       32537
race               32537
gender             32537
capital-gain       32537
capital-loss       32537
hours-per-week     32537
native-country     32537
income             32537
dtype: int64


In [8]:
# 1. How many men and women (sex feature) are represented in this dataset?
df['gender'].value_counts()

Male      21775
Female    10762
Name: gender, dtype: int64

In [9]:
# 2. What is the average age (age feature) of women?
df_group = df[['gender', 'age']]
df_group = df_group.groupby(['gender'], as_index = False).mean()
df_group.iloc[0]

gender       Female
age       36.864709
Name: 0, dtype: object

In [10]:
# 3. What is the percentage of German citizens (native-country feature)?
x= df['native-country'].value_counts()/df['native-country'].count() * 100

print('Germany:', x['Germany'])

Germany: 0.4210591019454774


In [79]:
# 4. What are the mean and standard deviation of age for those who earn more than 50K per year (salary feature) and those
# who earn less than 50K per year?
print('Those who earn more than 50K:')
print('Mean:', df.loc[df['income'] == '>50K','age'].mean())
print('Standard deviation:', df.loc[df['income'] == '>50K', 'age'].std())
print('')
print('Those who earn less than 50K:')
print('Mean:', df.loc[df['income'] == '<=50K', 'age'].mean())
print('Standard deviation:', df.loc[df['income'] == '<=50K', 'age'].std())

Those who earn more than 50K:
Mean: 44.25092486286516
Standard deviation: 10.52010208815647

Those who earn less than 50K:
Mean: 36.78739169163495
Standard deviation: 14.017334627959187


In [82]:
# 5. Is it true that people who earn more than 50K have at least a high school education? (education – Bachelors, Prof-school, 
# Assoc-acdm, Assoc-voc, Masters or Doctorate feature)

df.loc[df['income'] == '>50K','education'].value_counts()

Bachelors       2221
HS-grad         1674
Some-college    1386
Masters          959
Prof-school      423
Assoc-voc        361
Doctorate        306
Assoc-acdm       265
10th              62
11th              60
7th-8th           40
12th              33
9th               27
5th-6th           16
1st-4th            6
Name: education, dtype: int64

No, it's not true that people who earn more than 50K have at least a high school education

In [84]:
# 6. Display age statistics for each race (race feature) and each gender (sex feature). Use groupby() and describe(). Find
# the maximum age of men of Amer-Indian-Eskimo race.
my_dfgroup = df[['age', 'race', 'gender']]
print(my_dfgroup.groupby(['race', 'gender']).describe())

                               age                                          \
                             count       mean        std   min   25%   50%   
race               gender                                                    
Amer-Indian-Eskimo Female    119.0  37.117647  13.114991  17.0  27.0  36.0   
                   Male      192.0  37.208333  12.049563  17.0  28.0  35.0   
Asian-Pac-Islander Female    346.0  35.089595  12.300845  17.0  25.0  33.0   
                   Male      692.0  39.000000  12.746669  18.0  29.0  37.0   
Black              Female   1555.0  37.854019  12.637197  17.0  28.0  37.0   
                   Male     1567.0  37.699426  12.880714  17.0  27.0  36.0   
Other              Female    109.0  31.678899  11.631599  17.0  23.0  29.0   
                   Male      162.0  34.654321  11.355531  17.0  26.0  32.0   
White              Female   8633.0  36.819646  14.331045  17.0  25.0  35.0   
                   Male    19162.0  39.656560  13.436249  17.0  

The maximum age of men of Amer-Indian-Eskimo race is 82.0

In [85]:
# 7. Among whom is the proportion of those who earn a lot (>50K) greater: married or single men (marital-status feature)? 
# Consider as married those who have a marital-status starting with Married (Married-civ-spouse, Married-spouse-absent or 
# Married-AF-spouse), the rest are considered bachelors.

df['marital-status'].unique()

array(['Never-married', 'Married-civ-spouse', 'Divorced',
       'Married-spouse-absent', 'Separated', 'Married-AF-spouse',
       'Widowed'], dtype=object)

In [87]:
mapping={'Never-married':'Bachelors', 'Divorced':'Bachelors','Separated':'Bachelors','Widowed':'Bachelors',
         'Married-civ-spouse':'Married', 'Married-spouse-absent':'Married','Married-AF-spouse':'Married'}

df['status'] = df['marital-status'].map(mapping)
df['status'].value_counts()

Bachelors    17126
Married      15411
Name: status, dtype: int64

In [93]:
print(df.loc[df['income'] == '>50K', 'status'].value_counts())

Married      6734
Bachelors    1105
Name: status, dtype: int64


In [110]:
# 8. What is the maximum number of hours a person works per week (hours-per-week feature)? How many people work such a number
# of hours, and what is the percentage of those who earn a lot (>50K) among them?
x = df['hours-per-week'].max()
print('Maximum number of hours:', x)

Maximum number of hours: 99


In [126]:
y = df.loc[df['hours-per-week'] == x,'hours-per-week'].count()
print('Number of people:', y)

Number of people: 85


In [131]:
print('The percentage of rich people:', df[(df['hours-per-week'] == x) &  (df['income'] == '>50K')].shape[0] / y * 100)

The percentage of rich people: 29.411764705882355


In [141]:
# 9. Count the average time of work (hours-per-week) for those who earn a little and a lot (salary) for each country 
# (native-country). What will these be for Japan?
mygroup = df[['hours-per-week', 'income', 'native-country']]
x = mygroup.groupby(['income','native-country'], as_index = False).mean()
print('Average time of work:')
print(x)

Average time of work:
   income   native-country  hours-per-week
0   <=50K         Cambodia       41.416667
1   <=50K           Canada       37.914634
2   <=50K            China       37.381818
3   <=50K         Columbia       38.684211
4   <=50K             Cuba       37.985714
..    ...              ...             ...
75   >50K         Thailand       58.333333
76   >50K  Trinadad&Tobago       40.000000
77   >50K    United-States       45.506630
78   >50K          Vietnam       39.200000
79   >50K       Yugoslavia       49.500000

[80 rows x 3 columns]


In [142]:
for row in x.values.tolist():
    if row[1] == 'Japan':
        print(row)

['<=50K', 'Japan', 41.0]
['>50K', 'Japan', 47.958333333333336]


In [147]:
# 10. Find out the total number of hours worked and mean salary as per different occupations.
x = df[['occupation','hours-per-week']]
x.groupby(['occupation']).sum()

Unnamed: 0_level_0,hours-per-week
occupation,Unnamed: 1_level_1
Adm-clerical,141545
Armed-Forces,366
Craft-repair,173205
Exec-managerial,182890
Farming-fishing,46618
Handlers-cleaners,51933
Machine-op-inspct,81505
Other-service,114213
Priv-house-serv,4820
Prof-specialty,234139
