In [6]:
from pathlib import Path

import pandas as pd

In [4]:
# Base directory for the notebook: the kernel's current working directory.
parent_dir = Path.cwd()

In [5]:
# "names" sub-directory of the base directory; passed to parse_baby_names below,
# which looks for yob<year>.txt files inside it.
names_dir = parent_dir / "names"

In [7]:
def parse_baby_names(data_dir: Path, start_year: int = 1880, end_year: int = 2023) -> pd.DataFrame:
    """
    Parse yearly baby-name files (yob<year>.txt) into a single DataFrame.

    Each file is read as a header-less CSV with three fields (name, gender,
    count); the year taken from the file name is added as a fourth column.
    Years whose file is missing are skipped with a printed warning.

    Parameters:
        data_dir (Path): Directory containing yob<year>.txt files
            (a plain string path is accepted as well)
        start_year (int): First year to load (inclusive)
        end_year (int): Last year to load (inclusive)

    Returns:
        pd.DataFrame: Combined DataFrame with columns
            ['Name', 'Gender', 'Count', 'Year'] and a fresh 0..n-1 index.
            If no file is found in the requested range (or the range itself
            is empty), an empty DataFrame with those columns is returned.
    """
    columns = ["Name", "Gender", "Count", "Year"]
    # Coerce so that callers may pass a str; a Path passes through unchanged.
    data_dir = Path(data_dir)
    yearly_frames = []

    for year in range(start_year, end_year + 1):
        file_path = data_dir / f'yob{year}.txt'
        if not file_path.exists():
            print(f"Warning: {file_path} does not exist. Skipping.")
            continue

        # The files carry no header row, so column names are supplied here.
        df = pd.read_csv(file_path, header=None, names=columns[:3])
        df["Year"] = year
        yearly_frames.append(df)

    # pd.concat raises "ValueError: No objects to concatenate" on an empty
    # list; return a well-formed empty frame instead so callers can proceed.
    if not yearly_frames:
        return pd.DataFrame(columns=columns)

    return pd.concat(yearly_frames, ignore_index=True)

In [8]:
# Load every available year in the function's default range (1880-2023)
# from names_dir into one combined DataFrame.
baby_names_df = parse_baby_names(names_dir)

In [11]:
# Preview the first rows to sanity-check the parsed columns.
baby_names_df.head()

Unnamed: 0,Name,Gender,Count,Year
0,Mary,F,7065,1880
1,Anna,F,2604,1880
2,Emma,F,2003,1880
3,Elizabeth,F,1939,1880
4,Minnie,F,1746,1880


In [None]:
# Number of non-null values in each column (a quick check for missing data).
baby_names_df.count()