In [1]:
import os
import re
from pathlib import Path

import pandas as pd

In [2]:
# Root data folder. Defaults to the original author's local checkout; set the
# BABYNAMES_DATA_DIR environment variable to run the notebook on another machine.
data_dir = Path(os.environ.get('BABYNAMES_DATA_DIR', r'C:\Users\Nick\Documents\Projects\babynames\data'))

# Input folder holding the yearly yob*.txt files, and output folder for the combined CSV.
raw_path = data_dir / 'raw' / 'babynames data'
processed_path = data_dir / 'processed'

# Create the output folder (and any missing parents); no error if it already exists.
processed_path.mkdir(parents=True, exist_ok=True)

In [3]:
# Collect the per-year input files; sorted() gives a stable, name-ordered list.
files = sorted(raw_path.glob('yob*.txt'))

if not files:
    # glob() yields nothing for a missing or empty folder, which would otherwise
    # only surface later as an unhelpful "No objects to concatenate" error.
    raise FileNotFoundError(f"No 'yob*.txt' files found in {raw_path}")

print(f'Found {len(files)} files')

Found 145 files


In [4]:
# The files have no header row and no year column: the year is only available
# from the file name (yob<4 digits>...), so it is parsed out and added per file.
year_pattern = re.compile(r'yob(\d{4})')

all_data = []

for file in files:
    year_match = year_pattern.search(file.name)
    if year_match is None:
        # A name such as 'yob_notes.txt' still matches the 'yob*.txt' glob; report it
        # explicitly instead of failing with an AttributeError on None.
        raise ValueError(f'Cannot extract a 4-digit year from file name: {file.name}')

    df = pd.read_csv(file, header=None, names=['name', 'sex', 'count'])
    df['year'] = int(year_match.group(1))
    all_data.append(df)

# Stack the per-year frames into one table with a fresh 0..n-1 index.
babynames = pd.concat(all_data, ignore_index=True)

In [5]:
# Pin the column dtypes in one pass: integer counts and years, categorical sex.
column_dtypes = {
    'count': int,
    'sex': 'category',
    'year': int,
}
babynames = babynames.astype(column_dtypes)

In [6]:
# Quick sanity checks on the combined table: year span, distinct sex values,
# number of negative counts, and total number of missing cells.
year_column = babynames['year']
print('Year range check:', year_column.min(), '-', year_column.max())

print('Sex values:', babynames['sex'].unique())

negative_total = babynames['count'].lt(0).sum()
print('Negative counts:', negative_total)

missing_total = babynames.isna().sum().sum()
print('Missing values:', missing_total)

Year range check: 1880 - 2024
Sex values: ['F', 'M']
Categories (2, object): ['F', 'M']
Negative counts: 0
Missing values: 0


In [7]:
# Write the combined table into the processed folder, omitting the index column.
output_file = processed_path.joinpath('ssa_babynames_1880_2024.csv')
babynames.to_csv(path_or_buf=output_file, index=False)

In [8]:
# Closing summary: row count, where the CSV was written, and the year span covered.
record_total = len(babynames)
first_year = babynames['year'].min()
last_year = babynames['year'].max()

print(f'Total records: {record_total:,}')
print(f'Saved to: {output_file}')
print(f'Years: {first_year} - {last_year}')

Total records: 2,149,477
Saved to: C:\Users\Nick\Documents\Projects\babynames\data\processed\ssa_babynames_1880_2024.csv
Years: 1880 - 2024
