# Consolidated Door Counts: 2010 to 2019

The purpose of this notebook is to produce a single csv file that has the door count information for the branches of the Seattle Public Library. The count information is totalled for each branch by month.  The final csv file has the following columns:

- Branch name
- Visitors during normal "open hours"
- Visitors during designated "closed hours"
- Total visitors in the month
- The time period (month) covered

The final compilation of the data is in a gist [here](https://gist.githubusercontent.com/Cameron-Grams/565526991400ade865e48d1aebb7cdd5/raw/61394eb466550c239a56e7f83eef1a66e4130329/door_counts_all)

Use of the compilation url is demonstrated in the Library Use notebook.  

In [1]:
import pandas as pd
import numpy as np
from tabula import read_pdf

from datetime import datetime as dt

In [None]:
# Extract the tables from each yearly door-count PDF (2010-2019) with tabula.
# Each dfNN is later indexed by position (df_list[i]) inside format_year, so it
# is expected to be a list of DataFrames — presumably one per PDF page; TODO
# confirm against the installed tabula-py version.
df10 = read_pdf('./years/y2010.pdf', pages = 'all')
df11 = read_pdf('./years/y2011.pdf', pages = 'all')
df12 = read_pdf('./years/y2012.pdf', pages = 'all')
df13 = read_pdf('./years/y2013.pdf', pages = 'all')
df14 = read_pdf('./years/y2014.pdf', pages = 'all')
df15 = read_pdf('./years/y2015.pdf', pages = 'all')
df16 = read_pdf('./years/y2016.pdf', pages = 'all')
df17 = read_pdf('./years/y2017.pdf', pages = 'all')
df18 = read_pdf('./years/y2018.pdf', pages = 'all')
df19 = read_pdf('./years/y2019.pdf', pages = 'all')

In [None]:
# Keys of months_obj, listed in calendar order; format_year walks this list to
# decide the order in which the monthly slices are concatenated.
# A "_1"/"_2" suffix marks a month whose rows are split across two consecutive
# extracted tables (see the table index in the matching months_obj entry).
months = [
"jan",
"feb_1",
"feb_2",
"mar",
"apr_1",
"apr_2",
"may_1",
"may_2",
"june",
"july_1",
"july_2",
"aug",
"sept_1",
"sept_2",
"oct_1",
"oct_2",
"nov",
"dec_1",
"dec_2"
        ]

In [None]:
# Row layout of a yearly report, keyed by the entries of `months`.
# Each value is (table_index, begin, end, month_number):
#   table_index  - position in the per-year list given to format_year
#                  (used there as df_list[table_index])
#   begin, end   - positional row bounds passed to DataFrame.iloc[begin:end]
#                  in format_month (end is exclusive)
#   month_number - calendar month (1-12) used to build the time_period value
# The slice(s) of every month add up to 28 rows; a month with "_1"/"_2" keys
# takes the tail of one table and the head of the next.
# NOTE(review): slices at the top of a table start at row 1, presumably to skip
# a header row at position 0 — confirm against the extracted tables.
# NOTE(review): format_year applies these same offsets to every year, which
# assumes all yearly PDFs share an identical layout — confirm.
months_obj = {
"jan": (0, 1, 29, 1),
"feb_1": (0, 29, 46, 2),
"feb_2": (1, 1, 12, 2),
"mar": (1, 12, 40, 3),
"apr_1": (1, 40, 47, 4),
"apr_2": (2, 1, 22, 4),
"may_1": (2, 22, 47, 5),
"may_2": (3, 1, 4, 5),
"june": (3, 4, 32, 6),
"july_1": (3, 32, 47, 7),
"july_2": (4, 1, 14, 7),
"aug": (4, 14, 42, 8),
"sept_1": (4, 42, 47, 9),
"sept_2": (5, 1, 24, 9),
"oct_1": (5, 24, 47, 10),
"oct_2": (6, 1, 6, 10),
"nov": (6, 6, 34, 11),
"dec_1": (6, 34, 47, 12),
"dec_2": (7, 1, 16, 12)
        }

In [None]:
def format_month(df, begin, end, month, year):
    """Extract one month's rows from an extracted table and normalise its columns.

    The input frame is left untouched: the rows are sliced first and the
    ``time_period`` column is added to the resulting copy. This matters because
    the same table is passed in more than once when it holds rows for two
    different months.

    Args:
        df: DataFrame for one extracted table. Must contain the columns
            'Branch', 'Unnamed: 0', 'Door Count (Visits)' and 'Unnamed: 1'.
        begin: Position of the first row to keep (inclusive, as in ``iloc``).
        end: Position one past the last row to keep (exclusive, as in ``iloc``).
        month: Month number, parsed with the ``%m`` format.
        year: Four-digit year, parsed with the ``%Y`` format.

    Returns:
        A new DataFrame with the columns 'Branch', 'open_hours_count',
        'closed_hours_count', 'Total_visits' and 'time_period', where
        'time_period' holds the first day of the given month for every row.

    Raises:
        KeyError: if any of the expected source columns is missing.
        ValueError: if ``month``/``year`` cannot be parsed as a date.
    """
    time_period = dt.strptime(f"{month} {year}", "%m %Y")

    # NOTE(review): the source headers are auto-generated during PDF
    # extraction; the mapping to open/closed/total below reflects the report
    # layout and cannot be verified from this notebook alone.
    column_names = {
        'Unnamed: 0': 'open_hours_count',
        'Door Count (Visits)': 'closed_hours_count',
        'Unnamed: 1': 'Total_visits'
    }

    keep_columns = [
        'Branch',
        'open_hours_count',
        'closed_hours_count',
        'Total_visits',
        'time_period'
    ]

    # Slice, rename and assign all return new objects, so the caller's frame
    # never gains (or has overwritten) a 'time_period' column.
    month_rows = (
        df.iloc[begin:end]
        .rename(columns=column_names)
        .assign(time_period=time_period)
    )

    return month_rows[keep_columns]

In [None]:
def audit(df):
    """Report how many rows each branch contributes to ``df``.

    Args:
        df: DataFrame with a 'Branch' column.

    Returns:
        A list of ``(branch, row_count)`` tuples, one per distinct value of
        'Branch', in order of first appearance.
    """
    branch_col = df['Branch']
    return [
        (name, df[branch_col == name].shape[0])
        for name in list(branch_col.unique())
    ]

In [None]:
def format_year(df_list, year, months= months, months_obj = months_obj):
    """Build a single frame for one year from its extracted tables.

    Args:
        df_list: Sequence of DataFrames for the year, indexed by the table
            position stored in ``months_obj``.
        year: Year passed through to ``format_month`` for every slice.
        months: Ordered keys of ``months_obj`` to process.
        months_obj: Mapping of key -> (table index, begin row, end row,
            month number).

    Returns:
        The monthly frames produced by ``format_month``, concatenated in the
        order given by ``months``.
    """
    layouts = (months_obj[key] for key in months)
    monthly_frames = [
        format_month(df_list[table_idx], first_row, stop_row, month_num, year)
        for table_idx, first_row, stop_row, month_num in layouts
    ]
    return pd.concat(monthly_frames)


In [None]:
# Apply the shared row layout to each year's extracted tables, oldest first.
# NOTE: in the 2015 report RBE has only 9 months of data.
(y2010, y2011, y2012, y2013, y2014,
 y2015, y2016, y2017, y2018, y2019) = (
    format_year(tables, report_year)
    for report_year, tables in zip(
        range(2010, 2020),
        (df10, df11, df12, df13, df14, df15, df16, df17, df18, df19),
    )
)

In [None]:
# audit(y2019)

In [None]:
# Yearly frames in chronological order, ready to be stacked into one table.
all_years_dfs = [
    y2010, y2011, y2012, y2013, y2014,
    y2015, y2016, y2017, y2018, y2019,
]

In [None]:
# Stack the yearly frames row-wise. The default ignore_index=False keeps each
# slice's original row labels, so index values repeat across months and years.
all_years = pd.concat(all_years_dfs)

In [None]:
# Write the consolidated table to all_years.csv in the working directory;
# index=False leaves the (repeating) row labels out of the file.
all_years.to_csv(path_or_buf='all_years.csv', index=False)