In [17]:
import os
from pathlib import Path

import pandas as pd

In [32]:
# Directory containing the processed CSV. Set the BABYNAMES_DATA_DIR
# environment variable to run this notebook on another machine; when it is
# unset, the original location is used, so existing runs behave as before.
processed_path = Path(
    os.environ.get(
        'BABYNAMES_DATA_DIR',
        r'C:\Users\Nick\Documents\Projects\babynames\data\processed',
    )
)
# Read the full CSV into a DataFrame.
babynames = pd.read_csv(processed_path / 'ssa_babynames_1880_2024.csv')

In [33]:
# Cast all three columns in one pass: `sex` becomes categorical, while
# `year` and `count` are converted to integers.
babynames = babynames.astype({'sex': 'category', 'year': int, 'count': int})

In [34]:
# Structural overview: dimensions, column names, and per-column dtypes.
# A single print with a newline separator emits the same text as three
# consecutive print calls.
print(
    f'Shape: {babynames.shape}',
    f'\nColumns: {list(babynames.columns)}',
    f'\nDtypes:\n{babynames.dtypes}',
    sep='\n',
)

Shape: (2149477, 4)

Columns: ['name', 'sex', 'count', 'year']

Dtypes:
name       object
sex      category
count       int64
year        int64
dtype: object


In [5]:
# Preview the first ten rows (left as the cell's last expression so the
# notebook renders it as a table).
babynames.iloc[:10]

Unnamed: 0,name,sex,count,year
0,Mary,F,7065,1880
1,Anna,F,2604,1880
2,Emma,F,2003,1880
3,Elizabeth,F,1939,1880
4,Minnie,F,1746,1880
5,Margaret,F,1578,1880
6,Ida,F,1472,1880
7,Alice,F,1414,1880
8,Bertha,F,1320,1880
9,Sarah,F,1288,1880


In [6]:
# Number of null entries in each column.
null_counts = babynames.isna().sum()
print(f'Missing values:\n{null_counts}')

Missing values:
name     0
sex      0
count    0
year     0
dtype: int64


In [7]:
# Span and coverage of the `year` column: earliest year, latest year, and
# how many distinct years appear.
year_values = babynames['year']
first_year, last_year = year_values.min(), year_values.max()
print(
    'Year distribution:',
    f'  Range: {first_year} - {last_year}',
    f'  Unique years: {year_values.nunique()}',
    sep='\n',
)

Year distribution:
  Range: 1880 - 2024
  Unique years: 145


In [8]:
# Row count for each value of the `sex` column.
sex_column = babynames['sex']
sex_dist = sex_column.value_counts()
print(f'Sex distribution:\n{sex_dist}')

Sex distribution:
sex
F    1263426
M     886051
Name: count, dtype: int64


In [9]:
# Sum `count` per name over the whole table, then keep the 20 largest totals.
name_totals = babynames.groupby('name')['count'].sum()
top_names = name_totals.sort_values(ascending=False).head(20)
print('Top 20 names overall:\n', top_names)

Top 20 names overall:
 name
James          5262396
John           5196210
Robert         4866007
Michael        4440391
William        4205026
Mary           4154332
David          3682683
Joseph         2672746
Richard        2585535
Charles        2441151
Thomas         2360128
Christopher    2074303
Daniel         1982857
Elizabeth      1687089
Matthew        1652565
Patricia       1578408
George         1494369
Anthony        1489090
Jennifer       1476028
Linda          1458588
Name: count, dtype: int64


In [10]:
# Sum of `count` for each year; show the first and last ten years.
yearly_counts = babynames.groupby('year')['count'].sum()
print('Total births by year (first 10, last 10):')
print(yearly_counts.head(10), yearly_counts.tail(10), sep='\n')

Total births by year (first 10, last 10):
year
1880    201484
1881    192688
1882    221533
1883    216946
1884    243461
1885    240854
1886    255316
1887    247392
1888    299474
1889    288946
Name: count, dtype: int64
year
2015    3700912
2016    3668698
2017    3576195
2018    3515897
2019    3470933
2020    3340577
2021    3387949
2022    3383135
2023    3311196
2024    3328501
Name: count, dtype: int64


In [11]:
# Floor each year to the first year of its decade (e.g. 1987 -> 1980).
babynames['decade'] = (babynames['year'] // 10) * 10

# `sex` is a categorical column, and grouping on a categorical without an
# explicit `observed=` raises a pandas FutureWarning because the default is
# changing. Passing observed=False keeps the current behaviour (every
# decade/sex combination is present in the result) and silences the warning.
top_by_decade = (
    babynames
    .groupby(['decade', 'sex'], observed=False)['count']
    .sum()
    .unstack()  # pivot `sex` into columns: one row per decade
)
# The values printed are the summed `count` per decade and sex.
print('Top counts by decade and sex:\n', top_by_decade)

Top counts by decade and sex:
 sex            F         M
decade                    
1880     1312691   1095403
1890     2221563   1140948
1900     2927552   1357572
1910     8156259   6675195
1920    11952674  11019385
1930    10662972  10566422
1940    14484760  14886639
1950    19234425  20217185
1960    18264847  19262183
1970    15452435  16517372
1980    17175656  18459573
1990    18004961  19479160
2000    18475326  19959058
2010    17497239  18798337
2020     8109819   8641539


  top_by_decade = babynames.groupby(['decade', 'sex'])['count'].sum().unstack()


In [12]:
# Number of distinct names appearing in each year; show the first and last
# ten years.
unique_names_per_year = babynames.groupby('year')['name'].nunique()
print('Unique names per year (first 10, last 10):')
print(
    unique_names_per_year.head(10),
    unique_names_per_year.tail(10),
    sep='\n',
)

Unique names per year (first 10, last 10):
year
1880    1889
1881    1829
1882    2012
1883    1962
1884    2158
1885    2139
1886    2225
1887    2215
1888    2454
1889    2390
Name: name, dtype: int64
year
2015    30662
2016    30475
2017    30101
2018    29693
2019    29512
2020    28864
2021    28991
2022    29303
2023    29054
2024    29225
Name: name, dtype: int64


In [13]:
# Descriptive statistics for the numeric columns of the frame.
summary_stats = babynames.describe()
print(f'\nSummary Statistics:\n{summary_stats}')


Summary Statistics:
              count          year        decade
count  2.149477e+06  2.149477e+06  2.149477e+06
mean   1.730696e+02  1.979665e+03  1.975236e+03
std    1.463576e+03  3.515923e+01  3.533477e+01
min    5.000000e+00  1.880000e+03  1.880000e+03
25%    7.000000e+00  1.956000e+03  1.950000e+03
50%    1.200000e+01  1.990000e+03  1.990000e+03
75%    3.200000e+01  2.008000e+03  2.000000e+03
max    9.969300e+04  2.024000e+03  2.020000e+03
