# In [23]:
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime

# In [24]:
# Directory scanned when main() is called without arguments.
_DEFAULT_SOURCE_DIR = r'C:\Users\brian\Project\Project\Names\data\babynames'


def _load_yearly_counts(source_dir) -> pd.DataFrame:
    """Read every ``*.txt`` file in *source_dir* into one long DataFrame.

    Each file is parsed as a header-less CSV with the columns ``name``,
    ``sex`` and ``n``.  A ``year`` column is added from the file name: the
    stem with ``"yob"`` removed must be an integer (e.g. ``yob1999.txt``).

    Raises:
        FileNotFoundError: if the directory contains no ``*.txt`` files.
    """
    source_files = sorted(Path(source_dir).glob("*.txt"))
    if not source_files:
        raise FileNotFoundError(f"no .txt files found in {source_dir!s}")

    frames = []
    for path in source_files:
        frame = pd.read_csv(
            path,
            header=None,
            names=["name", "sex", "n"],
            # Names are free text: without this, a name spelled like one of
            # pandas' NA tokens ("NA", "None", "null", ...) would be read as
            # NaN and silently dropped by the groupby steps later on.
            keep_default_na=False,
        )
        frame["year"] = int(path.stem.replace("yob", ""))
        frames.append(frame)

    return pd.concat(frames, ignore_index=True)


def _summarize_names(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse the per-year rows into one row per (name, sex).

    Returns a frame with the columns ``name``, ``sex``, ``year_min``,
    ``year_max``, ``n_sum`` and ``year_pop``, sorted by sex, then by total
    count (descending), then by name.  ``year_pop`` is the year in which the
    name reached its highest yearly count; ties go to the latest such year.
    """
    keys = ["name", "sex"]

    # Keep only the rows whose yearly count equals the per-name maximum,
    # then take the most recent of those years.
    peak_counts = df.groupby(keys, as_index=False)["n"].max()
    popular_years = (
        df.merge(peak_counts, how="inner", on=keys + ["n"])
        .groupby(keys, as_index=False)["year"]
        .max()
        .rename(columns={"year": "year_pop"})
    )

    # Named aggregation yields flat column names directly, so no MultiIndex
    # flattening is needed afterwards.
    totals = (
        df.groupby(keys, as_index=False)
        .agg(
            year_min=("year", "min"),
            year_max=("year", "max"),
            n_sum=("n", "sum"),
        )
        .sort_values(by=["sex", "n_sum", "name"], ascending=[True, False, True])
    )

    return totals.merge(popular_years, how="left", on=keys)


def _write_gender_rankings(summary: pd.DataFrame, output_dir: Path) -> None:
    """Write ``boys.csv`` and ``girls.csv`` from the per-name summary.

    Each file holds the rows of one sex with two extra columns: ``rank``
    (dense rank by ``n_sum``, highest first) and ``n_percent`` (cumulative
    share of ``n_sum`` in the current row order, in percent).
    """
    for sex, stem in (("M", "boys"), ("F", "girls")):
        # Explicit copy: the columns are inserted into an independent frame,
        # never into a slice of ``summary``.
        ranked = summary[summary["sex"] == sex].copy()
        cumulative_share = ranked["n_sum"].cumsum() / ranked["n_sum"].sum()
        ranked.insert(3, "n_percent", 100 * cumulative_share.round(3))
        ranked.insert(
            0, "rank", ranked["n_sum"].rank(method="dense", ascending=False)
        )
        ranked.to_csv(output_dir / f"{stem}.csv", index=False)


def main(source_dir=_DEFAULT_SOURCE_DIR, output_dir=".") -> None:
    """Build the aggregated baby-name CSV files.

    Args:
        source_dir: directory holding the yearly ``yobYYYY.txt`` files.
            Defaults to the location the script was originally written for.
        output_dir: directory the CSV files are written to (created if it
            does not exist).  Defaults to the current working directory.

    Writes:
        big_list.csv    -- every input row, with ``n`` renamed to ``count``.
        decade_list.csv -- big_list plus a ``Decade`` column (year rounded
                           down to a multiple of 10).
        all-names.csv   -- one row per (name, sex): first/last year, total
                           count and most popular year.
        boys.csv / girls.csv -- all-names split by sex, with rank and
                           cumulative percentage.

    Raises:
        FileNotFoundError: if *source_dir* contains no ``*.txt`` files.
    """
    df = _load_yearly_counts(source_dir)

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    big_list = df.rename(columns={"n": "count"})
    big_list.to_csv(output_dir / "big_list.csv", index=False)

    # Vectorised floor to the decade; computed once for the whole column.
    decade_list = big_list.assign(Decade=big_list["year"] // 10 * 10)
    decade_list.to_csv(output_dir / "decade_list.csv", index=False)

    summary = _summarize_names(df)
    summary.to_csv(output_dir / "all-names.csv", index=False)

    _write_gender_rankings(summary, output_dir)


# Run the pipeline only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

