In [1]:
# Import dependencies
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
from pymongo import MongoClient

In [2]:
# Load the raw baby-names CSV (path is relative to the notebook) into a DataFrame
raw_csv_path = '../Resources/Popular_Baby_Names.csv'
baby_names_df = pd.read_csv(raw_csv_path)

# Leave the frame as the last expression so the notebook renders a preview of it
baby_names_df

Unnamed: 0,Year of Birth,Gender,Ethnicity,Child's First Name,Count,Rank
0,2011,FEMALE,HISPANIC,GERALDINE,13,75
1,2011,FEMALE,HISPANIC,GIA,21,67
2,2011,FEMALE,HISPANIC,GIANNA,49,42
3,2011,FEMALE,HISPANIC,GISELLE,38,51
4,2011,FEMALE,HISPANIC,GRACE,36,53
...,...,...,...,...,...,...
69209,2012,MALE,BLACK NON HISP,CAYDEN,19,52
69210,2013,FEMALE,WHITE NON HISPANIC,Margaret,25,67
69211,2013,FEMALE,WHITE NON HISPANIC,Tamar,10,82
69212,2013,FEMALE,WHITE NON HISPANIC,Amanda,13,79


# Step 1 - Analyze Dataframe

In [3]:
# Print a summary of the DataFrame: index range, column names, non-null counts,
# dtypes, and memory usage. Useful for spotting missing values and wrong dtypes.
baby_names_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69214 entries, 0 to 69213
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Year of Birth       69214 non-null  int64 
 1   Gender              69214 non-null  object
 2   Ethnicity           69214 non-null  object
 3   Child's First Name  69214 non-null  object
 4   Count               69214 non-null  int64 
 5   Rank                69214 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 3.2+ MB


In [4]:
# Show the DataFrame's dimensions as a (rows, columns) tuple
baby_names_df.shape

(69214, 6)

In [5]:
# Tally the number of rows (name records) per birth year, ordered by year.
# NOTE(review): in the recorded output 2011-2014 have roughly 7x as many rows as
# later years -- possibly duplicated records in the source file; confirm.
year_counts = (
    baby_names_df['Year of Birth']
    .value_counts()
    .sort_index()
)

# Print the tally under a heading
print("Count of names for each year:", year_counts, sep="\n")

Count of names for each year:
Year of Birth
2011    13715
2012    13875
2013    13920
2014    14165
2015     2045
2016     2063
2017     1973
2018     1964
2019     1935
2020     1769
2021     1790
Name: count, dtype: int64


In [6]:
# Tally the number of rows for each Gender value across the whole DataFrame,
# sorted by gender label
gender_column = baby_names_df['Gender']
gender_count = gender_column.value_counts().sort_index()

# Print the tally under a heading
print("Count by Gender:", gender_count, sep="\n")

Count by Gender:
Gender
FEMALE    35299
MALE      33915
Name: count, dtype: int64


In [7]:
# Tally the number of rows for each Ethnicity label across the whole DataFrame,
# sorted by label so near-duplicate spellings end up next to each other
ethnicity_column = baby_names_df['Ethnicity']
ethnicity_count = ethnicity_column.value_counts().sort_index()

# Print the tally under a heading
print("Count by Ethnicity:", ethnicity_count, sep="\n")

Count by Ethnicity:
Ethnicity
ASIAN AND PACI                 2483
ASIAN AND PACIFIC ISLANDER     9383
BLACK NON HISP                 2446
BLACK NON HISPANIC            10052
HISPANIC                      20365
WHITE NON HISP                 4843
WHITE NON HISPANIC            19642
Name: count, dtype: int64


In [8]:
# Tally the number of rows for each distinct first name, sorted by name.
# Counting is case-sensitive, so the same name in different casing shows up
# as separate entries.
name_count = (
    baby_names_df["Child's First Name"]
    .value_counts()
    .sort_index()
)

# Print the tally under a heading
print("Count by Name:", name_count, sep="\n")

Count by Name:
Child's First Name
AALIYAH    28
AARAV      14
AARON      53
AAYAN       7
ABBY       14
           ..
Zola        1
Zora        1
Zoya       12
Zuri       23
Zyaire      3
Name: count, Length: 3302, dtype: int64


# Step 2 - Preprocess Ethnicity Column

In [9]:
# Truncated ethnicity labels mapped to the full spelling they should become
ethnicity_map = dict([
    ('ASIAN AND PACI', 'ASIAN AND PACIFIC ISLANDER'),
    ('BLACK NON HISP', 'BLACK NON HISPANIC'),
    ('WHITE NON HISP', 'WHITE NON HISPANIC'),
])

# Swap each truncated label for its full form; labels not in the map are kept as-is
ethnicity_column = baby_names_df['Ethnicity']
baby_names_df['Ethnicity'] = ethnicity_column.replace(ethnicity_map)

# Print the per-label row counts to confirm the consolidation
consolidated_counts = baby_names_df['Ethnicity'].value_counts().sort_index()
print("\nAfter consolidation:")
print(consolidated_counts)


After consolidation:
Ethnicity
ASIAN AND PACIFIC ISLANDER    11866
BLACK NON HISPANIC            12498
HISPANIC                      20365
WHITE NON HISPANIC            24485
Name: count, dtype: int64


# Step 3 - Convert First Names to Upper Case

In [10]:
# Rename the column to a shorter label
baby_names_df = baby_names_df.rename(columns={"Child's First Name": "First Name"})

# Check current unique count of first names (case-sensitive, before standardizing)
print(f"Current unique name count: {baby_names_df['First Name'].nunique()}")

# Convert to uppercase so differently-cased spellings of a name collapse into one value
baby_names_df['First Name'] = baby_names_df['First Name'].str.upper()

# Check new unique count
print(f"New unique name count after standardization: {baby_names_df['First Name'].nunique()}")

# Verify the change by spot-checking a random sample. random_state is fixed so
# the same ten rows are shown on every run (keeps Restart & Run All reproducible).
print(baby_names_df['First Name'].sample(10, random_state=42))

Current unique name count: 3302
New unique name count after standardization: 2021
14255       NOOR
35850     ZARIAH
67022    MICHAEL
35419     SAWYER
51137     THOMAS
6918       SARAH
61958       JOEL
40526    MARIANA
33401      LAURA
16956       MILA
Name: First Name, dtype: object


# Step 4 - Save preprocessed dataframe to a new CSV file

In [12]:
# Save the cleaned DataFrame to a new CSV file, omitting the index column.
# The output folder is created first because to_csv does not create missing
# directories and raises OSError when the target directory does not exist.
cleaned_csv_path = Path('../Output/baby_names_cleaned.csv')
cleaned_csv_path.parent.mkdir(parents=True, exist_ok=True)
baby_names_df.to_csv(cleaned_csv_path, index=False)