In [1]:
#DEVELOPING DATASET FOR EACH YEAR
#Goal: 

#To link [EIN, URL] from IRS to [EIN, NTEE1] from NCCS files, and get Mission or Purpose statements for each EIN.
#To-do list:

#[+] 1. Get [EIN, URL] from IRS. -> df_irs 
#[+] 2. Get [EIN, NTEE1] from NCCS. -> df_nccs 
#[+] 3. Merge [df_irs, df_nccs] on EIN (outer join) -> df_inter -> "link<year>.pkl.gz"
#[+] 4. Visit each URL in df_inter and get data from relevant tabs. 


import pandas as pd
import re, requests, string, os, gzip, pickle, sys
from bs4 import BeautifulSoup as bs
from tqdm import tqdm_notebook as tqdm
from multiprocessing import Pool
#use regex instead of beautifulsoup, if possible.

#Filing years covered by this notebook, newest first (2015 down to 2011).
year_list = list(range(2015, 2010, -1))

#Year processed by this run: the first (most recent) entry of year_list.
year = year_list[0]

#When we create a data frame with pandas ≤ 0.19.2 and pickle it (using pickle.dump), 
#it is not possible to unpickle it using pandas 0.20.1.
#https://github.com/pandas-dev/pandas/issues/16474


#Create the link file only if it does not exist yet; if it exists, jump to step 4.
link_path = "../Data/intermediary/link"+str(year)+".pkl.gz"
if not os.path.isfile(link_path):
    #Step 1. Get [EIN, URL] from IRS
    #Each entry of the 'Filings<year>' column is a record that carries (at least) 'EIN' and 'URL'.
    irsfile = pd.read_json('https://s3.amazonaws.com/irs-form-990/index_'+str(year)+'.json')
    filings = irsfile['Filings'+str(year)]
    df_irs = pd.DataFrame([[s['EIN'], s['URL']] for s in filings], columns=['EIN', 'URL'])


    #Step 2. Get [EIN, NTEE1] from NCCS
    #usecols loads only the two columns that are needed, and yields a standalone frame
    #(not a slice of a bigger one), so the column assignment below raises no SettingWithCopyWarning.
    df_nccs = pd.read_csv('https://nccs-data.urban.org/data/core/'+str(year)+'/nccs.core'+str(year)+'pc.csv',
                          usecols=['EIN', 'NTEE1'])


    #Step 3. Merge df_irs and df_nccs on EIN and make the corresponding table of [EIN, NTEE, IRS_URL]
    #EIN is cast to str so both merge keys have the same type (presumably the IRS index stores EIN as text).
    #NOTE(review): if the NCCS file pads EINs with leading zeros, read_csv parses them as integers and the
    #zeros are lost before this cast, so such EINs would not match the IRS side - confirm against the data.
    df_nccs['EIN'] = df_nccs['EIN'].astype(str)
    #The join is 'outer' on purpose: rows without a URL are kept and handled separately later in the file.
    df_inter = pd.merge(df_nccs, df_irs, how='outer', on=['EIN'])[['EIN', 'NTEE1', 'URL']]
    df_inter.columns = ['EIN', 'NTEE', 'IRS_URL']

    #file size limit on Github - 25 MB
    #compression is passed by keyword: newer pandas releases deprecate passing it positionally.
    df_inter.to_pickle(link_path, compression='gzip')

    #Free the large intermediate objects; the table is reloaded from disk in step 4.
    del irsfile, filings, df_irs, df_nccs, df_inter


#Step 4: Provide list of tags to search from:

#Year	MissionTags	PurposeTags
#2015, 2014, 2013	ActivityOrMissionDesc	OtherExemptPurposeExpendGrp, TotalExemptPurposeExpendGrp
#2012, 2011	ActivityOrMissionDescription	OtherExemptPurposeExpenditures, TotalExemptPurposeExpenditures
#FormType	Mission	Purpose
#990	Yes	No
#990EZ	No	Yes
#990PF	No	No


#The original tag names are converted to lowercase while parsing, e.g. 'ActivityOrMissionDesc' is parsed as 'activityormissiondesc'.
#So, we provide the original tags in lowercase for comparison.
#https://github.com/lecy/Open-Data-for-Nonprofit-Research/blob/master/Build_IRS990_E-Filer_Datasets/Data_Dictionary.md
#year| tag| line#

#alltags = ['ActivityOrMissionDesc', 'ActivityOrMissionDescription',
#           'OtherExemptPurposeExpenGrp', 'OtherExemptPurposeExpenditures',
#           'TotalExemptPurposeExpendGrp', 'TotalExemptPurposeExpenditures'] 
    
    
#Lower-cased element names to look for in every parsed filing.
#NOTE(review): the year/tag table earlier in this file spells the 2013-2015 purpose tag
#'OtherExemptPurposeExpendGrp' (with a 'd'), while this list only had '...expengrp'.
#Both spellings are listed so the tag is matched whichever one the filings use;
#confirm the correct name against the IRS schema and drop the other.
alltags = ['activityormissiondesc', 'activityormissiondescription',
           'otherexemptpurposeexpengrp', 'otherexemptpurposeexpendgrp',
           'otherexemptpurposeexpenditures',
           'totalexemptpurposeexpendgrp', 'totalexemptpurposeexpenditures']


#Load the cached link table and keep only the rows that have a filing URL to visit.
df = pd.read_pickle("../Data/intermediary/link"+str(year)+".pkl.gz", compression='gzip')
has_url = df['IRS_URL'].notna()
df_inter = df[has_url]

#Empty frame that only fixes the column layout of the master data file.
masterdata = pd.DataFrame(
    columns=['EIN', 'NTEE', 'IRS_URL', 'TEXT', 'TEXTTYPE', 'YEAR'],
)

#Uncomment the line below while developing the dataset from scratch: it only writes the header row and creates the file
#masterdata.to_csv(open('../Data/'+str(year)+'/MasterData'+str(year)+'.csv.gzip', 'a'), index=False)

print("length: ", len(df_inter))

def build_coredata(r):
    """Download one chunk of filings and append the extracted text to the master CSV.

    For every row of the module-level ``df_inter`` whose position lies in
    ``[r[0], r[1])``, the document at the row's URL is fetched and parsed, and
    one output row is recorded per element whose name is listed in the
    module-level ``alltags``.  A filing with no matching element still
    produces one row with empty TEXT and TEXTTYPE, so every visited filing
    shows up in the output.  The rows are appended, without a header line, to
    ``../Data/<year>/MasterData<year>.csv.gzip``.

    Args:
        r: Two-item sequence ``[start, stop]`` of row positions in
            ``df_inter``; ``stop`` is exclusive and may exceed the table
            length (the chunk is then simply shorter).

    Returns:
        None.

    Raises:
        requests.RequestException: If a download fails or times out.
    """
    print("Inside the function:", r)

    columns = ['EIN', 'NTEE', 'IRS_URL', 'TEXT', 'TEXTTYPE', 'YEAR']

    # Slicing stops at the end of the table, so a final chunk whose stop
    # overshoots no longer raises IndexError.  It also avoids rebuilding
    # df_inter.values (a full copy of the table) for every single row.
    chunk = df_inter.iloc[r[0]:r[1], :3]

    # Rows are collected in a plain list and converted to a frame once,
    # instead of growing a DataFrame one row at a time.
    rows = []
    for ein, ntee, url in chunk.itertuples(index=False, name=None):
        # The timeout keeps one unresponsive server from stalling the worker forever.
        page = requests.get(url, timeout=30)

        # html.parser lower-cases element names, which is why alltags is lower-case.
        bss = bs(page.text, 'html.parser')

        # NOTE(review): tag.string is None when an element holds more than a
        # single text child (e.g. a group element with sub-elements), so such
        # matches are written with an empty TEXT - confirm this is intended.
        matches = bss.find_all(alltags)
        if matches:
            rows.extend([str(ein), ntee, url, tag.string, tag.name, str(year)]
                        for tag in matches)
        else:
            rows.append([str(ein), ntee, url, '', '', str(year)])

    masterdata = pd.DataFrame(rows, columns=columns)
    print(masterdata)

    # The handle is closed deterministically by the with-block.  Despite the
    # '.gzip' suffix the file is opened in plain text mode, i.e. it holds
    # uncompressed CSV.
    # NOTE(review): when run through a Pool, several workers append to this
    # same file - confirm interleaved writes are acceptable or serialize them.
    with open('../Data/'+str(year)+'/MasterData'+str(year)+'.csv.gzip', 'a') as out:
        masterdata.to_csv(out, header=False, index=False)
    print("written: ", r)


#Number of filings handled by one call to build_coredata.
no_urls = 100
#records = [[i, i+no_urls] for i in range(18000, 19000, no_urls)]
#[start, stop) row positions of each chunk. stop is clamped to the table length so the
#last chunk never points past the final row when len(df_inter) is not a multiple of no_urls.
records = [[i, min(i+no_urls, len(df_inter))] for i in range(0, len(df_inter), no_urls)]

#Number of worker processes used by the pool below.
agents = 4
#Uncomment these two lines to build data
#with Pool(processes=agents) as pool:
#    pool.map(build_coredata, records)
   

#Rows of the link table that have no filing URL; they get the master-data column layout
#(TEXT, TEXTTYPE and YEAR are left empty because no_url does not carry those columns).
no_url = df[df['IRS_URL'].isna()]
masterdata=pd.DataFrame(no_url, columns=['EIN', 'NTEE', 'IRS_URL', 'TEXT', 'TEXTTYPE', 'YEAR'])
#masterdata.to_csv(open('../Data/'+str(year)+'/MasterData'+str(year)+'.csv.gzip', 'a'), index=False)
print("Added data without URLs!")

#Build the final pickle once; skip the step if it already exists.
if not os.path.isfile('../Data/'+str(year)+'/MasterData'+str(year)+'.pkl.gz'):
    #read_csv treats the first line of the file as the header row.
    finaldata = pd.read_csv('../Data/'+str(year)+'/MasterData'+str(year)+'.csv.gzip').drop_duplicates()
    #Drop rows whose NTEE value is the literal 'NTEE' (presumably header lines that were appended
    #more than once). The filter previously used the undefined name df_dist and raised NameError.
    finaldata = finaldata[finaldata.NTEE != 'NTEE']
    finaldata.to_pickle('../Data/'+str(year)+'/MasterData'+str(year)+'.pkl.gz')

length:  261034
Added data without URLs!
