In [108]:
#import libraries
import pandas as pd

In [109]:
#read the data adult.csv
with open("adult.csv", "r") as csv_file:
    adult_data = pd.read_csv(csv_file, delimiter = ",")

In [110]:
#quick check of your data
#Printing first few lines to check
print(adult_data.head())

#number of non-header rows in data for personal reference:
data_length = adult_data.shape[0]-1
print(f"The number of non-header data entries is: {data_length}")

   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country salary  
0          2174             0              40  United-States  <=50K  
1             0             0             

In [111]:
#How many males and females are represented in this dataset

#Gender will occur once per line, so we could do a simple count.

adult_data['sex'].value_counts()
#If "male" is directly counted, it will include all values.


Male      21790
Female    10771
Name: sex, dtype: int64

In [112]:
#What is the average age of women

# adult_data.loc[adult_data['sex']=='Female', 'age'].mean()

adult_data[(adult_data['sex']=='Male') * (adult_data['marital-status'] == 'Never-married')]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
13,32,Private,205019,Assoc-acdm,12,Never-married,Sales,Not-in-family,Black,Male,0,0,50,United-States,<=50K
16,25,Self-emp-not-inc,176756,HS-grad,9,Never-married,Farming-fishing,Own-child,White,Male,0,0,35,United-States,<=50K
17,32,Private,186824,HS-grad,9,Never-married,Machine-op-inspct,Unmarried,White,Male,0,0,40,United-States,<=50K
26,19,Private,168294,HS-grad,9,Never-married,Craft-repair,Own-child,White,Male,0,0,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32537,30,Private,345898,HS-grad,9,Never-married,Craft-repair,Not-in-family,Black,Male,0,0,46,United-States,<=50K
32548,65,Self-emp-not-inc,99359,Prof-school,15,Never-married,Prof-specialty,Not-in-family,White,Male,1086,0,60,United-States,<=50K
32553,32,Private,116138,Masters,14,Never-married,Tech-support,Not-in-family,Asian-Pac-Islander,Male,0,0,11,Taiwan,<=50K
32555,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K


In [113]:
#How many are male and never-married

filtered_data = adult_data[(adult_data['sex'] == 'Male') * (adult_data['marital-status']=='Never-married')]

filtered_data.shape[0]


5916

In [114]:
#What is the proportion of Cuban citizens (native-country feature) in the dataset?
(adult_data['native-country']=='Cuba').sum()/adult_data.shape[0]

0.0029176008107859096

In [115]:
#What are mean value and standard deviation of the age of those who recieve more than 50K per year (salary feature)
adult_data.loc[adult_data['salary']=='>50K', 'age'].mean()

44.24984058155847

In [116]:
#Is it true that people who receive more than 50k have at least Master's education?
adult_data.loc[adult_data['salary']=='>50K', 'education'].unique()

array(['HS-grad', 'Masters', 'Bachelors', 'Some-college', 'Assoc-voc',
       'Doctorate', 'Prof-school', 'Assoc-acdm', '7th-8th', '12th',
       '10th', '11th', '9th', '5th-6th', '1st-4th'], dtype=object)

In [117]:
#Display statistics of age for each race (race feature) and each gender. Use groupby() and describe(). 
for (race,sex), sub_df in adult_data.groupby(['race', 'sex']):
    # print(race,sex)
    print(f"Race: {race}, Sex: {sex}")
    print(sub_df.describe())

Race: Amer-Indian-Eskimo, Sex: Female
              age         fnlwgt  education-num  capital-gain  capital-loss  \
count  119.000000     119.000000     119.000000    119.000000    119.000000   
mean    37.117647  112950.731092       9.697479    544.605042     14.462185   
std     13.114991   93207.974077       2.334540   2451.591587    157.763811   
min     17.000000   12285.000000       2.000000      0.000000      0.000000   
25%     27.000000   31387.000000       9.000000      0.000000      0.000000   
50%     36.000000   87950.000000      10.000000      0.000000      0.000000   
75%     46.000000  163027.500000      11.000000      0.000000      0.000000   
max     80.000000  445168.000000      16.000000  15024.000000   1721.000000   

       hours-per-week  
count      119.000000  
mean        36.579832  
std         11.046509  
min          4.000000  
25%         35.000000  
50%         40.000000  
75%         40.000000  
max         84.000000  
Race: Amer-Indian-Eskimo, Sex: Mal

In [118]:
#List all records that have a 'C' in the native-country


adult_data[adult_data['native-country'].apply(lambda country: 'C' in country)]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
81,52,Private,276515,Bachelors,13,Married-civ-spouse,Other-service,Husband,White,Male,0,0,40,Cuba,<=50K
112,56,Self-emp-not-inc,335605,HS-grad,9,Married-civ-spouse,Other-service,Husband,White,Male,0,1887,50,Canada,>50K
228,75,Private,314209,Assoc-voc,11,Widowed,Adm-clerical,Not-in-family,White,Female,0,0,20,Columbia,<=50K
255,42,Self-emp-not-inc,303044,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,Asian-Pac-Islander,Male,0,0,40,Cambodia,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32112,32,Self-emp-inc,209691,Bachelors,13,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,50,Canada,<=50K
32115,29,Private,119359,HS-grad,9,Married-civ-spouse,Prof-specialty,Wife,Asian-Pac-Islander,Female,0,0,10,China,>50K
32129,48,Private,350440,Some-college,10,Married-civ-spouse,Craft-repair,Other-relative,Asian-Pac-Islander,Male,0,0,40,Cambodia,>50K
32204,23,Local-gov,162551,Bachelors,13,Never-married,Prof-specialty,Own-child,Asian-Pac-Islander,Female,0,0,35,China,<=50K


In [119]:
#Replace values in a column - 'working-class' with new category labels.#Replace anyone with "State-gov" with "Government" and anyone with "Self-emp-not-inc" as "Private" and "Private" as "Private"
#.map()
#first specify dictionary
newVal = {"State-gov":"Government", "Private": "Private", "Self-emp-not-inc": "Private"}
adult_data['workclass'] = adult_data['workclass'].map(newVal)
adult_data.head(25)



Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,Government,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Private,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Private,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K
