Created: July 12, 2023

Cleans and aggregates the NCCS core data files on public charities for 2000-2019, with a specific focus on health charities as categorized by NTEE major groups.

In [6]:
import pandas as pd
import os

# Columns kept (in this order) from every core file.
# note: nccskey was dropped as an included col
COLS = [
    "EIN", "FISYR", "NAME", "STATE", "NTEE1", "ADDRESS", "CITY", "ZIP",
    "SUBSECCD", "NTMAJ5", "NTMAJ10", "NTMAJ12", "MAJGRPB", "CONT",
]
DIR = '../data/working/'


def clean(filename, year):
    """Load one core CSV from DIR and return its filtered rows.

    Column names are upper-cased for consistency across files. When a file
    has no ``CONT`` column, ``P1TCONT`` is renamed to ``CONT`` in its place.
    The result holds only the columns in ``COLS`` and only rows where:

    * ``CONT`` > 0
    * ``SUBSECCD`` == 3
    * ``NTMAJ5`` == "HE" and ``NTMAJ10`` == "HE"
    * ``NTMAJ12`` is "EH" or "HE"

    Rows are not filtered by fiscal year, so late filings stay in the data.

    Args:
        filename: CSV file name, joined onto ``DIR``.
        year: Currently unused; kept so existing callers keep working.

    Returns:
        A DataFrame with the columns in ``COLS`` (original row index kept).

    Raises:
        KeyError: If any column in ``COLS`` is absent from the file.
    """
    path = os.path.join(DIR, filename)

    # Parse only the columns that can end up in the result (matched
    # case-insensitively, since the headers are upper-cased below). The core
    # files carry many more columns than are kept here.
    wanted = set(COLS) | {"P1TCONT"}
    data = pd.read_csv(
        path,
        low_memory=False,
        usecols=lambda col: col.upper() in wanted,
    )

    # for consistency across column names
    data.columns = data.columns.str.upper()
    if 'CONT' not in data.columns:
        data = data.rename(columns={'P1TCONT': 'CONT'})

    # Fail with a message that names the file instead of a bare indexing error.
    missing = [col for col in COLS if col not in data.columns]
    if missing:
        raise KeyError(
            "{}: missing expected column(s) {}".format(filename, missing)
        )

    # filter data
    df = data[COLS]

    # TODO: only filter by ntmaj5?
    # note: no longer filtering years - want late filings included in data
    # NOTE(review): EIN/ZIP are left to pandas' dtype inference; if they are
    # parsed as integers, leading zeros are lost - confirm whether that matters.
    keep = (
        (df['CONT'] > 0)
        & (df['SUBSECCD'] == 3)
        & (df['NTMAJ5'] == "HE")
        & (df['NTMAJ10'] == "HE")
        & df['NTMAJ12'].isin(["EH", "HE"])
    )

    return df[keep]

In [8]:
# Clean each year's public-charity ("pc") core file, then combine them all.
# The frames are collected first and concatenated once, so the accumulated
# data is not re-copied on every iteration.
yearly_frames = []

for year in range(2000, 2020):
    str_year = str(year)

    # only looking at public charities
    pc = clean('nccs.core' + str_year + 'pc.csv', year)
    yearly_frames.append(pc)

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
merged = pd.concat(yearly_frames, ignore_index=True)

# data set across all years
merged.to_csv("../data/distribution/merged_filtered.csv", index=False)