Created: July 12, 2023

Cleans and aggregates the NCCS core data files on public charities for fiscal years 2000-2019, focusing specifically on health charities as categorized by NTEE major groups.

In [20]:
import pandas as pd
import os

# Columns kept from every NCCS core file (NCCSKEY is intentionally not included).
# DIRSUPESTIMATE is optional: clean() omits it for files that do not provide it.
COLS = [
    # identification
    "EIN", "FISYR", "NAME",
    # location
    "STATE", "NTEE1", "ADDRESS", "CITY", "ZIP",
    # classification
    "SUBSECCD", "NTMAJ5", "NTMAJ10", "NTMAJ12", "MAJGRPB",
    # contributions
    "CONT",
    "DIRSUPESTIMATE",
]

# Directory holding the raw per-year core files, relative to the notebook.
DIR = '../data/working/'

def clean(filename, year):
    """Load one NCCS core CSV and return only the rows/columns of interest.

    The file is read from ``DIR``, its column names are upper-cased, and the
    contributions column is normalised to ``CONT`` (falling back to
    ``P1TCONT`` when ``CONT`` is absent). The result is restricted to the
    columns listed in ``COLS`` and to rows with positive contributions,
    ``SUBSECCD == 3``, ``NTMAJ5 == "HE"`` and a fiscal year in 2000-2019.

    Parameters
    ----------
    filename : str
        Name of the CSV file, resolved relative to ``DIR``.
    year : int
        Not used by the function body; kept so existing callers that pass
        it keep working.

    Returns
    -------
    pandas.DataFrame
        Filtered rows with the columns of ``COLS`` (``DIRSUPESTIMATE`` is
        omitted when the file does not contain it). The original row index
        of the file is preserved.

    Raises
    ------
    KeyError
        If a required column is missing from the file.
    """
    path = os.path.join(DIR, filename)

    optional_col = "DIRSUPESTIMATE"
    # Only parse the columns that can end up in the result; the core files
    # carry many more columns than are needed here.
    wanted = set(COLS) | {"P1TCONT"}
    data = pd.read_csv(
        path,
        low_memory=False,
        usecols=lambda col: str(col).upper() in wanted,
    )

    # for consistency across column names
    data.columns = data.columns.str.upper()
    if "CONT" not in data.columns:
        data = data.rename(columns={"P1TCONT": "CONT"})

    # Drop the optional column by name rather than by its position in COLS.
    keep = [col for col in COLS if col != optional_col or col in data.columns]
    missing = [col for col in keep if col not in data.columns]
    if missing:
        raise KeyError(f"{filename}: missing required column(s) {missing}")
    df = data[keep]

    # TODO: only filter by ntmaj5?
    # note: filtering out before 2000 and after 2019 since data is incomplete
    # NOTE(review): SUBSECCD == 3 presumably selects 501(c)(3) filers and
    # "HE" the health major group - confirm against the NCCS data dictionary.
    mask = (
        (df["CONT"] > 0)
        & (df["SUBSECCD"] == 3)
        & (df["NTMAJ5"] == "HE")
        & (df["FISYR"] < 2020)
        & (df["FISYR"] > 1999)
    )
    return df[mask]

In [21]:
# Collect one filtered frame per fiscal year and combine them with a single
# concat; concatenating inside the loop re-copies all accumulated rows on
# every iteration.
yearly_frames = []

for year in range(2000, 2020):
    # only looking at public charities (the "pc" core files)
    pc = clean(f'nccs.core{year}pc.csv', year)
    yearly_frames.append(pc)

# aggregate
# clean() already restricts FISYR to 2000-2019, so no further year filter
# is applied here.
merged = pd.concat(yearly_frames, ignore_index=True)

# data set across all years
merged.to_csv("../data/distribution/merged_filtered.csv", index=False)

2014
2015
2016
2017
2018
2019
