# Demographic Data Analyzer

### Step 1: Loading the Data

In [1]:
import pandas as pd

# Load the data
df = pd.read_csv("adult.data.csv")

# Display the first 5 rows
df.head()



Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


### Step 2: Clean the Data

In [2]:
 # First, defining the column names:
column_names = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country", "salary"
]

# Reloading the data with column names:
df = pd.read_csv("adult.data.csv", header=None, names=column_names)

# Strip extra whitespace from strings:
# Remove leading/trailing spaces from string values
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

# Preview cleaned data:
df.head()


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
1,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
2,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
3,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
4,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K


In [3]:
# Since the CSV already has headers in row 0
df = pd.read_csv("adult.data.csv")

# Then apply the trimming:
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

# Previewing it again:
df.head()


  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


### Step 3: Data Analysis.

In [4]:
# Step 3.1: How many people of each race are represented in this dataset?
# Using the value_counts() method:
race_count = df["race"].value_counts()
print(race_count)


race
White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: count, dtype: int64


In [5]:
# Step 3.2: What is the average age of men?
# Filter by sex, then use .mean():
average_age_men = df[df["sex"] == "Male"]["age"].mean()
print(round(average_age_men, 1))  # Rounded to 1 decimal place


39.4


In [7]:
#  Step 3.3: Percentage of people with advanced education (Bachelors, Masters, Doctorate) earning >50K
# Define advanced education list with Masters and Doctorate included
advanced_edu = ["Bachelors", "Masters", "Doctorate"]

# Filter people with advanced education
higher_edu = df[df["education"].isin(advanced_edu)]

# Filter those earning >50K
higher_edu_rich = higher_edu[higher_edu["salary"] == ">50K"]

# Calculate percentage
percentage_higher_edu_rich = (len(higher_edu_rich) / len(higher_edu)) * 100

print(round(percentage_higher_edu_rich, 1))


46.5


In [9]:
# Step 3.5 — Percentage of people without advanced education making >50K

# Step 3.5: Percentage without advanced education making >50K
advanced_edu = ["Bachelors", "Masters", "Doctorate"]

lower_edu = df[~df["education"].isin(advanced_edu)]
lower_edu_rich = lower_edu[lower_edu["salary"] == ">50K"]

percentage_lower_edu_rich = (len(lower_edu_rich) / len(lower_edu)) * 100
print(round(percentage_lower_edu_rich, 1))


17.4


In [10]:
# Step 3.6: What is the minimum number of hours a person works per week?

min_work_hours = df["hours-per-week"].min()
print(min_work_hours)


1


In [11]:
# Step 3.7: What percentage of people who work the minimum number of hours per week earn >50K?

# Filter people who work minimum hours
min_workers = df[df["hours-per-week"] == min_work_hours]

# Filter those earning >50K among minimum workers
rich_min_workers = min_workers[min_workers["salary"] == ">50K"]

# Calculate percentage
percentage_rich_min_workers = (len(rich_min_workers) / len(min_workers)) * 100
print(round(percentage_rich_min_workers, 1))


10.0


In [12]:
# Step 3.8: Which country has the highest percentage of people earning >50K, and what is that percentage?

# Total people per country
country_counts = df["native-country"].value_counts()

# People earning >50K per country
rich_country_counts = df[df["salary"] == ">50K"]["native-country"].value_counts()

# Calculate percentage rich per country
rich_percentage_by_country = (rich_country_counts / country_counts) * 100

# Country with highest percentage of rich
highest_earning_country = rich_percentage_by_country.idxmax()
highest_earning_country_percentage = rich_percentage_by_country.max()

print(highest_earning_country)
print(round(highest_earning_country_percentage, 1))


Iran
41.9


In [13]:
# Step 3.9: What is the most popular occupation for those who earn >50K in India?

rich_india = df[(df["native-country"] == "India") & (df["salary"] == ">50K")]
most_popular_occupation_india = rich_india["occupation"].mode()[0]
print(most_popular_occupation_india)


Prof-specialty
