<a href="https://colab.research.google.com/github/CaptainPramil/data100/blob/main/data100_lab2_Q1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import requests
import zipfile
from pathlib import Path
import time

In [2]:
def fetch_and_cache(data_url, file, data_dir="data", force=False):
    
    data_dir = Path(data_dir)
    data_dir.mkdir(exist_ok=True)
    file_path = data_dir/Path(file)
    
    if force and file_path.exists():
        file_path.unlink()
    
    if force or not file_path.exists():
        print('Downloading...', end=' ')
        resp = requests.get(data_url)
        with file_path.open('wb') as f:
            f.write(resp.content)
        print('Done!')
    
    else:
        created = time.ctime(file_path.stat().st_ctime)
        print("Using cached version downloaded at", created)
    
    return file_path

In [3]:
data_url = 'https://www.ssa.gov/oact/babynames/state/namesbystate.zip'
namesbystate_path = fetch_and_cache(data_url, 'namesbystate.zip')
zf = zipfile.ZipFile(namesbystate_path, 'r')

column_labels = ['State', 'Sex', 'Year', 'Name', 'Count']

Downloading... Done!


In [6]:
def load_dataframe_from_zip(zf, f):
    with zf.open(f) as fh: 
        return pd.read_csv(fh, header=None, names=column_labels)

states = [
    load_dataframe_from_zip(zf, f)
    for f in sorted(zf.filelist, key=lambda x:x.filename) 
    if f.filename.endswith('.TXT')
]

In [8]:
baby_names = states[0]
for state_df in states[1:]:
    baby_names = pd.concat([baby_names, state_df])
baby_names = baby_names.reset_index().iloc[:, 1:]
ca = baby_names[baby_names['State'] == 'CA']

In [9]:
ca

Unnamed: 0,State,Sex,Year,Name,Count
384980,CA,F,1910,Mary,295
384981,CA,F,1910,Helen,239
384982,CA,F,1910,Dorothy,220
384983,CA,F,1910,Margaret,163
384984,CA,F,1910,Frances,134
...,...,...,...,...,...
772756,CA,M,2019,Zayvion,5
772757,CA,M,2019,Zeek,5
772758,CA,M,2019,Zhaire,5
772759,CA,M,2019,Zian,5


In [15]:
print(ca.shape,'is the shape of table')
print(ca.size,'is the total elements of table')

(387781, 5) is the shape of table
1938905 is the total elements of table


In [18]:
num_of_names_per_year = ca["Year"].value_counts()
num_of_names_per_year.sort_index

<bound method Series.sort_index of 2007    7250
2008    7158
2009    7121
2006    7075
2010    7010
        ... 
1914     710
1913     613
1912     558
1911     393
1910     363
Name: Year, Length: 110, dtype: int64>