In [1]:
import csv

In [3]:
# Preview the raw file: print up to the first five rows exactly as csv.reader
# returns them (lists of unparsed strings, header row included).
# newline="" hands line-ending handling to the csv module, which is how the
# csv documentation says files should be opened for reading.
with open("pop_estimates.csv", newline="") as f:
    data = csv.reader(f)
    # Pairing with range(5) caps the preview at five rows and simply stops
    # early on a shorter file; a bare next(data) would raise StopIteration.
    for _, row in zip(range(5), data):
        print(row)

['Geographic Area', 'July 1, 2001 Estimate', 'July 1, 2000 Estimate', 'April 1, 2000 Population Estimates Base']
['United States', ' 284,796,887 ', ' 282,124,631 ', ' 281,421,906 ']
['Alabama', ' 4,464,356 ', ' 4,451,493 ', ' 4,447,100 ']
['Alaska', ' 634,892 ', ' 627,601 ', ' 626,932 ']
['Arizona', ' 5,307,331 ', ' 5,165,274 ', ' 5,130,632 ']


In [5]:
def name_int(value: "str | int") -> int:
    """Convert a comma-grouped number string such as ``" 4,464,356 "`` to an int.

    Surrounding whitespace is stripped and every ``","`` is removed before the
    text is handed to ``int()``. A value that is already an ``int`` is returned
    unchanged, so data that has been parsed once can be validated again.

    Args:
        value: The raw text to parse, or an integer to pass through.

    Returns:
        The parsed integer.

    Raises:
        ValueError: If ``value`` cannot be parsed. The underlying error is
            attached as ``__cause__``.
    """
    # bool is a subclass of int, so exclude it explicitly: True/False are not
    # valid numbers here and keep failing as they did before.
    if isinstance(value, int) and not isinstance(value, bool):
        return value
    try:
        return int(value.strip().replace(",", ""))
    # Catch only parsing-related failures (wrong type, missing str methods,
    # non-numeric text) instead of a bare ``except`` that would also swallow
    # KeyboardInterrupt / SystemExit.
    except (AttributeError, TypeError, ValueError) as exc:
        raise ValueError("data could not be parsed into a valid integer") from exc

In [6]:
from typing import Annotated
from pydantic import BeforeValidator

# Integer field type for pydantic models whose raw input is a comma-grouped,
# whitespace-padded string: the BeforeValidator passes the raw value through
# name_int first, and the result is then validated as a plain int.
FunkyInt = Annotated[int, BeforeValidator(name_int)]

In [7]:
from pydantic import BaseModel

# One row of pop_estimates.csv as a validated record.
# Field names match the `fieldnames` passed to csv.DictReader in this notebook,
# so a DictReader row can be given to Estimate.model_validate() directly.
class Estimate(BaseModel):
    # 'Geographic Area' column, e.g. 'United States' or 'Alabama'.
    area: str
    # 'July 1, 2001 Estimate' column, parsed from text like ' 4,464,356 '.
    july_1_2001: FunkyInt
    # 'July 1, 2000 Estimate' column.
    july_1_2000: FunkyInt
    # 'April 1, 2000 Population Estimates Base' column.
    april_1_2000: FunkyInt

In [8]:
# Preview the file as dicts keyed by the Estimate field names.
# Because `fieldnames` is supplied, DictReader does not consume the file's own
# header line: it comes back as the first "row" (visible in the output below).
# newline="" hands line-ending handling to the csv module, which is how the
# csv documentation says files should be opened for reading.
with open("pop_estimates.csv", newline="") as f:
    data = csv.DictReader(f, fieldnames=["area", "july_1_2001", "july_1_2000", "april_1_2000"])
    # Pairing with range(5) caps the preview at five rows and simply stops
    # early on a shorter file; a bare next(data) would raise StopIteration.
    for _, row in zip(range(5), data):
        print(row)

{'area': 'Geographic Area', 'july_1_2001': 'July 1, 2001 Estimate', 'july_1_2000': 'July 1, 2000 Estimate', 'april_1_2000': 'April 1, 2000 Population Estimates Base'}
{'area': 'United States', 'july_1_2001': ' 284,796,887 ', 'july_1_2000': ' 282,124,631 ', 'april_1_2000': ' 281,421,906 '}
{'area': 'Alabama', 'july_1_2001': ' 4,464,356 ', 'july_1_2000': ' 4,451,493 ', 'april_1_2000': ' 4,447,100 '}
{'area': 'Alaska', 'july_1_2001': ' 634,892 ', 'july_1_2000': ' 627,601 ', 'april_1_2000': ' 626,932 '}
{'area': 'Arizona', 'july_1_2001': ' 5,307,331 ', 'july_1_2000': ' 5,165,274 ', 'april_1_2000': ' 5,130,632 '}


In [9]:
def estimates(path: str = "pop_estimates.csv"):
    """Lazily yield one validated ``Estimate`` per data row of a CSV file.

    The file's first line is treated as a header and skipped; every following
    row is mapped onto the ``Estimate`` field names and validated.

    Args:
        path: CSV file to read. Defaults to ``"pop_estimates.csv"``, the file
            used throughout this notebook.

    Yields:
        The result of ``Estimate.model_validate`` for each data row, in file
        order. An empty or header-only file yields nothing.

    Raises:
        OSError: If ``path`` cannot be opened.
        Exception: Whatever ``Estimate.model_validate`` raises for a row that
            fails validation propagates to the caller.
    """
    # newline="" hands line-ending handling to the csv module, as its
    # documentation asks. The file stays open until the generator is exhausted
    # or closed, because rows are read on demand.
    with open(path, newline="") as f:
        data = csv.DictReader(f, fieldnames=["area", "july_1_2001", "july_1_2000", "april_1_2000"])
        # Skip the header row. The default matters: on an empty file a bare
        # next(data) raises StopIteration, which inside a generator is turned
        # into RuntimeError (PEP 479) instead of ending iteration cleanly.
        next(data, None)

        for row in data:
            yield Estimate.model_validate(row)

In [10]:
# Stream every validated record from the CSV and print it, one per line.
# Rows are parsed on demand, so each record is printed as soon as it is read.
for estimate in estimates():
    print(estimate)

area='United States' july_1_2001=284796887 july_1_2000=282124631 april_1_2000=281421906
area='Alabama' july_1_2001=4464356 july_1_2000=4451493 april_1_2000=4447100
area='Alaska' july_1_2001=634892 july_1_2000=627601 april_1_2000=626932
area='Arizona' july_1_2001=5307331 july_1_2000=5165274 april_1_2000=5130632
area='Arkansas' july_1_2001=2692090 july_1_2000=2678030 april_1_2000=2673400
area='California' july_1_2001=34501130 july_1_2000=34000446 april_1_2000=33871648
area='Colorado' july_1_2001=4417714 july_1_2000=4323410 april_1_2000=4301261
area='Connecticut' july_1_2001=3425074 july_1_2000=3410079 april_1_2000=3405565
area='Delaware' july_1_2001=796165 july_1_2000=786234 april_1_2000=783600
area='District of Columbia' july_1_2001=571822 july_1_2000=571066 april_1_2000=572059
area='Florida' july_1_2001=16396515 july_1_2000=16054328 april_1_2000=15982378
area='Georgia' july_1_2001=8383915 july_1_2000=8229823 april_1_2000=8186453
area='Hawaii' july_1_2001=1224398 july_1_2000=1212281 apr

In [11]:
# Materialize all validated records in a list so they can be indexed and reused.
# NOTE(review): this rebinds `data`, which earlier cells used for a csv reader.
data = list(estimates())

In [12]:
# Show the first parsed record.
data[0]

Estimate(area='United States', july_1_2001=284796887, july_1_2000=282124631, april_1_2000=281421906)