In [1]:
import numpy as np
import pandas as pd

# Allow up to 100 columns to be shown when a wide frame is displayed.
# The canonical option key is spelled with an underscore; the previous
# "display.max.columns" only worked because pandas treats the key as a regex
# pattern in which "." matches any character.
pd.set_option("display.max_columns", 100)

# Importing necessary libraries for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Suppressing warnings
import warnings
warnings.filterwarnings("ignore")

# Read adult.data.csv from the mlcourse.ai data folder into a DataFrame
DATA_URL = "https://raw.githubusercontent.com/Yorko/mlcourse.ai/main/data/"
dataset_location = DATA_URL + "adult.data.csv"
data = pd.read_csv(dataset_location)

# Quick sanity check: print the leading rows as plain text
first_rows = data.head()
print(first_rows)

   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country salary  
0          2174             0              40  United-States  <=50K  
1             0             0             

In [3]:
# ----- QUESTION 1 -----
# Count how many rows carry each value of the `sex` feature.
print("\nQuestion 1:")
sex_counts = data['sex'].value_counts()
print(sex_counts)


Question 1:
sex
Male      21790
Female    10771
Name: count, dtype: int64


In [4]:
# ----- QUESTION 2 -----
# Mean of the `age` feature over the rows where `sex` equals 'Female'.
print("\nQuestion 2:")
is_female = data['sex'] == 'Female'
average_age_women = data.loc[is_female, 'age'].mean()
print(f"Average age of women: {average_age_women}")


Question 2:
Average age of women: 36.85823043357163


In [5]:
# ----- QUESTION 3 -----
# Percentage of rows whose `native-country` feature equals 'Germany'.
print("\nQuestion 3:")
total_people = len(data)
germans = len(data[data['native-country'] == 'Germany'])
percentage_germans = (germans / total_people) * 100
print(f"Percentage of German citizens: {percentage_germans:.2f}%")



Question 3:
Percentage of German citizens: 0.42%


In [6]:

# ----- QUESTION 4-5 -----
# Mean and standard deviation of `age` for the '>50K' and '<=50K' salary groups.
print("\nQuestion 4-5:")

# Select the ages of each salary group once, then summarise them
ages_above_50k = data.loc[data['salary'] == '>50K', 'age']
ages_below_50k = data.loc[data['salary'] == '<=50K', 'age']

mean_age_above_50k = ages_above_50k.mean()
std_age_above_50k = ages_above_50k.std()
mean_age_below_50k = ages_below_50k.mean()
std_age_below_50k = ages_below_50k.std()

print(f"Mean age (above 50K): {mean_age_above_50k:.2f}, Std (above 50K): {std_age_above_50k:.2f}")
print(f"Mean age (below 50K): {mean_age_below_50k:.2f}, Std (below 50K): {std_age_below_50k:.2f}")


Question 4-5:
Mean age (above 50K): 44.25, Std (above 50K): 10.52
Mean age (below 50K): 36.78, Std (below 50K): 14.02


In [7]:

# ----- QUESTION 6 -----
# Is it true that people who earn more than 50K have at least high school education?
print("\nQuestion 6:")

# Education levels that count as "at least high school". 'HS-grad' and
# 'Some-college' belong here: without them the check silently becomes
# "more than high school", which is a different question from the one asked.
high_education = [
    'HS-grad', 'Some-college',
    'Bachelors', 'Prof-school', 'Assoc-acdm', 'Assoc-voc', 'Masters', 'Doctorate',
]
higher_earners = data[data['salary'] == '>50K']

# Vectorized check; bool() keeps `education_check` a plain Python bool
education_check = bool(higher_earners['education'].isin(high_education).all())
print(f"All higher earners have at least high school education: {education_check}")


Question 6:
All higher earners have at least high school education: False


In [8]:

# ----- QUESTION 7 -----
# Summary statistics of `age` for every (race, sex) combination, followed by
# the maximum age among rows with race 'Amer-Indian-Eskimo' and sex 'Male'.
print("\nQuestion 7:")
age_by_race_and_sex = data.groupby(['race', 'sex'])['age']
age_stats = age_by_race_and_sex.describe()
print(age_stats)

is_amer_indian_eskimo_man = (data['race'] == 'Amer-Indian-Eskimo') & (data['sex'] == 'Male')
max_age_amer_indian_eskimo = data.loc[is_amer_indian_eskimo_man, 'age'].max()
print(f"Max age of Amer-Indian-Eskimo men: {max_age_amer_indian_eskimo}")


Question 7:
                             count       mean        std   min   25%   50%  \
race               sex                                                       
Amer-Indian-Eskimo Female    119.0  37.117647  13.114991  17.0  27.0  36.0   
                   Male      192.0  37.208333  12.049563  17.0  28.0  35.0   
Asian-Pac-Islander Female    346.0  35.089595  12.300845  17.0  25.0  33.0   
                   Male      693.0  39.073593  12.883944  18.0  29.0  37.0   
Black              Female   1555.0  37.854019  12.637197  17.0  28.0  37.0   
                   Male     1569.0  37.682600  12.882612  17.0  27.0  36.0   
Other              Female    109.0  31.678899  11.631599  17.0  23.0  29.0   
                   Male      162.0  34.654321  11.355531  17.0  26.0  32.0   
White              Female   8642.0  36.811618  14.329093  17.0  25.0  35.0   
                   Male    19174.0  39.652498  13.436029  17.0  29.0  38.0   

                             75%   max  
race     

In [9]:
# ----- QUESTION 8 -----
# Compare the share of '>50K' earners between married and single men.
print("\nQuestion 8:")

# Any `marital-status` in this list is labelled 'Married'; everything else is 'Single'
married_statuses = ['Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse']
is_married = data['marital-status'].isin(married_statuses)

# Note: this adds a `marital_category` column to `data` itself
data['marital_category'] = np.where(is_married, 'Married', 'Single')

men_data = data[data['sex'] == 'Male']
married_men_salary = men_data.loc[men_data['marital_category'] == 'Married', 'salary']
single_men_salary = men_data.loc[men_data['marital_category'] == 'Single', 'salary']

# Normalised counts give the fraction of each salary value within the group
married_proportion = married_men_salary.value_counts(normalize=True)['>50K']
single_proportion = single_men_salary.value_counts(normalize=True)['>50K']

print(f"Proportion of >50K earners among married men: {married_proportion:.2f}")
print(f"Proportion of >50K earners among single men: {single_proportion:.2f}")



Question 8:
Proportion of >50K earners among married men: 0.44
Proportion of >50K earners among single men: 0.08


In [10]:
# ----- QUESTION 9 -----
# Largest `hours-per-week` value, how many rows have it, and what percentage
# of those rows have a '>50K' salary.
print("\nQuestion 9:")
max_hours = data['hours-per-week'].max()

# Rows that report exactly the maximum number of weekly hours
max_hours_workers = data[data['hours-per-week'] == max_hours]
num_people_max_hours = len(max_hours_workers)

high_earning_max_hours_workers = max_hours_workers[max_hours_workers['salary'] == '>50K']
percent_earning_above_50k = (len(high_earning_max_hours_workers) / num_people_max_hours) * 100

print(f"Max hours worked per week: {max_hours}")
print(f"Number of people working {max_hours} hours per week: {num_people_max_hours}")
print(f"Percentage of those earning >50K among them: {percent_earning_above_50k:.2f}%")


Question 9:
Max hours worked per week: 99
Number of people working 99 hours per week: 85
Percentage of those earning >50K among them: 29.41%


In [11]:
# ----- QUESTION 10 -----
# Mean `hours-per-week` for every (native-country, salary) pair
hours_by_country_and_salary = data.groupby(['native-country', 'salary'])['hours-per-week']
avg_hours_per_country = hours_by_country_and_salary.mean()

# Print one line per pair. Each key is a (native-country, salary) tuple; rows
# whose country is "?" are printed under the placeholder label "Native-country".
print("\nAverage hours per week by country and salary:")
for country, hours in avg_hours_per_country.items():
    if country[0] != "?":
        print(f"{country[0]} ({country[1]}): {hours:.2f} hours/week")
        continue
    print("Native-country ({})".format(country[1]), f": {hours:.2f} hours/week")



Average hours per week by country and salary:
Native-country (<=50K) : 40.16 hours/week
Native-country (>50K) : 45.55 hours/week
Cambodia (<=50K): 41.42 hours/week
Cambodia (>50K): 40.00 hours/week
Canada (<=50K): 37.91 hours/week
Canada (>50K): 45.64 hours/week
China (<=50K): 37.38 hours/week
China (>50K): 38.90 hours/week
Columbia (<=50K): 38.68 hours/week
Columbia (>50K): 50.00 hours/week
Cuba (<=50K): 37.99 hours/week
Cuba (>50K): 42.44 hours/week
Dominican-Republic (<=50K): 42.34 hours/week
Dominican-Republic (>50K): 47.00 hours/week
Ecuador (<=50K): 38.04 hours/week
Ecuador (>50K): 48.75 hours/week
El-Salvador (<=50K): 36.03 hours/week
El-Salvador (>50K): 45.00 hours/week
England (<=50K): 40.48 hours/week
England (>50K): 44.53 hours/week
France (<=50K): 41.06 hours/week
France (>50K): 50.75 hours/week
Germany (<=50K): 39.14 hours/week
Germany (>50K): 44.98 hours/week
Greece (<=50K): 41.81 hours/week
Greece (>50K): 50.62 hours/week
Guatemala (<=50K): 39.36 hours/week
Guatemala (>