### Import packages

In [4]:
import json 
import requests
import numpy as np
import pandas as pd
import time 
import random
from bs4 import BeautifulSoup as BS

In [5]:
# I created a keys.py file that holds both the API id and the API key; here they are imported as the variables app_id and app_key.
from keys import app_id, app_key

In [6]:
# Using Beautiful Soup, scrape the state abbreviations from the statcan.gc.ca
# classification table. `states` ends up as an array of the unique values found
# in the table's third column (the DataFrame is only an intermediate step).

url='https://www23.statcan.gc.ca/imdb/p3VD.pl?Function=getVD&TVD=53971'
# A timeout keeps the cell from hanging forever if the server never answers.
page = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
page.raise_for_status()
bs = BS(page.content, 'html.parser')
# One list of <td> texts per <tr>; [1:] drops the first row
# (presumably the header row, which has no <td> cells -- TODO confirm).
rows = [[cell.text for cell in row.find_all('td')] for row in bs.table.find_all('tr')][1:]
# Column 2 holds the abbreviations used as the `state` query value further down.
states = pd.DataFrame(rows)[2].unique()

### Making API calls with the Charity Navigator API

In [7]:
# Two accumulators filled by the API calls below:
#   datarating  -> one flat list of the selected fields per complete organization
#   databadlist -> the raw organization payload whenever a selected field is missing
datarating, databadlist = [], []

organizations_url = 'https://api.data.charitynavigator.org/v2/Organizations'
# Candidate pauses in seconds (0.7 .. 1.2), built once instead of on every request.
pause_choices = [tenths / 10 for tenths in range(7, 13)]

# For every state abbreviation, walk through the result pages (1..99). A 200
# response is parsed and each organization is flattened into a row; any other
# status prints the state and page where the loop stopped and moves on to the
# next state.
for state in states:
    for page in range(1, 100):
        # Passing the query as `params` lets requests URL-encode the values
        # (including the credentials); the timeout prevents an indefinite hang.
        response = requests.get(
            organizations_url,
            params={
                'app_id': app_id,
                'app_key': app_key,
                'rated': 'true',
                'state': state,
                'pageSize': 1000,
                'pageNum': page,
            },
            timeout=30,
        )
        if response.status_code != 200:
            # Presumably the API signals "no more pages" this way: in the recorded
            # run every state stopped on page 2 or 3 -- TODO confirm.
            print(state, page)
            break

        for org in response.json():
            try:
                rating = org['currentRating']
                accountability = rating['accountabilityRating']
                financial = rating['financialRating']
                category = org['category']
                cause = org['cause']
                address = org['mailingAddress']
                irs = org['irsClassification']
                # Column order matters: the resulting DataFrame has positional
                # (unnamed) columns.
                datarating.append([
                    org['mission'],
                    org['tagLine'],
                    org['charityName'],
                    category['categoryName'],
                    category['categoryID'],
                    cause['causeName'],
                    cause['causeID'],
                    address['city'],
                    address['stateOrProvince'],
                    address['postalCode'],
                    rating['score'],
                    rating['rating'],
                    org['advisories']['severity'],
                    accountability['score'],
                    accountability['rating'],
                    irs['nteeType'],
                    irs['classification'],
                    irs['affiliation'],
                    irs['foundationStatus'],
                    irs['nteeClassification'],
                    irs['deductibility'],
                    irs['subsection'],
                    irs['assetAmount'],
                    irs['incomeAmount'],
                    financial['score'],
                    financial['rating'],
                ])
            except (LookupError, TypeError):
                # Missing key (LookupError) or a nested section that is null /
                # not a mapping (TypeError). A bare `except:` would also swallow
                # KeyboardInterrupt and make the scrape impossible to stop.
                databadlist.append(org)

        # The timer spaces the requests out at irregular intervals to make the
        # traffic appear as 'human' as possible.
        time.sleep(random.choice(pause_choices))

# Build the DataFrame once, after every state has been processed (it was
# previously rebuilt inside the state loop and undefined when `states` was empty).
df = pd.DataFrame(datarating)
# Optional export so the data can be imported into another notebook:
#df.to_csv('datarating.csv')

AL 2
AK 2
AZ 2
AR 2
CA 3
CO 2
CT 2
DE 2
DC 2
FL 2
GA 2
HI 2
ID 2
IL 2
IN 2
IA 2
KS 2
KY 2
LA 2
ME 2
MD 2
MA 2
MI 2
MN 2
MS 2
MO 2
MT 2
NE 2
NV 2
NH 2
NJ 2
NM 2
NY 2
NC 2
ND 2
OH 2
OK 2
OR 2
PA 2
RI 2
SC 2
SD 2
TN 2
TX 2
UT 2
VT 2
VA 2
WA 2
WV 2
WI 2
WY 2


In [9]:
# The per-state query returned 9005 data points. To see whether additional
# organizations could be pulled, this cell runs the same extraction without the
# `state` filter, paging through the unfiltered results instead.
datarating2, databadlist2 = [], []

organizations_url = 'https://api.data.charitynavigator.org/v2/Organizations'
# Candidate pauses in seconds (0.7 .. 1.2), built once instead of on every request.
pause_choices = [tenths / 10 for tenths in range(7, 13)]

for page in range(1, 100):
    # Passing the query as `params` lets requests URL-encode the values
    # (including the credentials); the timeout prevents an indefinite hang.
    response = requests.get(
        organizations_url,
        params={
            'app_id': app_id,
            'app_key': app_key,
            'rated': 'true',
            'pageSize': 1000,
            'pageNum': page,
        },
        timeout=30,
    )
    if response.status_code != 200:
        # Print the page where paging stopped (page 11 in the recorded run).
        print(page)
        break

    for org in response.json():
        try:
            rating = org['currentRating']
            accountability = rating['accountabilityRating']
            financial = rating['financialRating']
            category = org['category']
            cause = org['cause']
            address = org['mailingAddress']
            irs = org['irsClassification']
            # Column order matters: the resulting DataFrame has positional
            # (unnamed) columns and is later concatenated with `df`.
            datarating2.append([
                org['mission'],
                org['tagLine'],
                org['charityName'],
                category['categoryName'],
                category['categoryID'],
                cause['causeName'],
                cause['causeID'],
                address['city'],
                address['stateOrProvince'],
                address['postalCode'],
                rating['score'],
                rating['rating'],
                org['advisories']['severity'],
                accountability['score'],
                accountability['rating'],
                irs['nteeType'],
                irs['classification'],
                irs['affiliation'],
                irs['foundationStatus'],
                irs['nteeClassification'],
                irs['deductibility'],
                irs['subsection'],
                irs['assetAmount'],
                irs['incomeAmount'],
                financial['score'],
                financial['rating'],
            ])
        except (LookupError, TypeError):
            # Missing key (LookupError) or a nested section that is null /
            # not a mapping (TypeError). A bare `except:` would also swallow
            # KeyboardInterrupt and make the scrape impossible to stop.
            databadlist2.append(org)

    # Irregular pause between requests, as in the per-state scrape.
    time.sleep(random.choice(pause_choices))

df2 = pd.DataFrame(datarating2)
#df2.to_csv('datarating2.csv')

11


In [13]:
# In the above code I was able to pull an additional 8958 data points for a total of 17963 rows

### Creating a Final DataFrame and .csv file

In [14]:
# Stack the per-state frame (df) and the unfiltered frame (df2), drop rows that
# appear in both so every data point is unique (10851 rows remained in the
# recorded run), and renumber the rows.
# NOTE(review): reset_index() without drop=True keeps the old row labels as an
# extra 'index' column, and to_csv() writes the new index as well, so the CSV
# carries two index-like columns -- confirm the downstream notebook expects that.
df_concat = (
    pd.concat([df, df2])
    .drop_duplicates()
    .reset_index()
)

# Persist the combined result as a .csv file.
df_concat.to_csv('final_df.csv')