Get Kiva data
=====

In [17]:
reset -fs

In [18]:
import os
import json
import logging
import requests
import time
from urllib.request import urlopen, Request

from pandas.io.json import json_normalize
import pandas as pd

In [19]:
r = requests.get('http://api.kivaws.org/v1/loans/search.json?country_code=NG&per_page=500')

In [29]:
r.status_code

403

In [21]:
def extract_loans(n_pages, country_iso_code):

    loans_full = pd.DataFrame()
    loans_details = pd.DataFrame()
    
    for n in range(1, n_pages+1):
        print(f"Requesting {country_iso_code} page: {n}")
        n_per_page = 10
        url = f'http://api.kivaws.org/v1/loans/search.json?country_code={country_iso_code}&per_page={n_per_page}&page={n}'
        r = requests.get(url)
        if r.status_code = 403:
            print("Too many requests. Slow down")
        data = json.loads(r.text)
        loans = json_normalize(data['loans'])
        
        if data['paging']['total'] == 0:
            break # Stop if there are no loans
            
        # Take the loan id column from our retrieved loan data, and use it to pull additional details
        # about our loans. We store this in a seperate table loans_details
        
        loan_ids = loans['id'].tolist()
        loan_ids_str = ','.join(str(e) for e in loan_ids)
        r = requests.get('https://api.kivaws.org/v1/loans/'+loan_ids_str+'.json')
        ld_data = json.loads(r.text)
        loans_ext = json_normalize(ld_data['loans'])
        loans_details = loans_details.append(loans_ext, ignore_index=True)
         
        loans_full = loans_full.append(loans,ignore_index=True)
        print("The number of loans on current page: {}".format(len(loans_full.index)))
        time.sleep(1) # Wait a little while to so we don't overload Kiva servers
        
    return loans_full, loans_details

In [22]:
country_codes_without_data = ["DZ", "AO", "SH"]
country_codes_with_data = ["BJ", "BW", "BF", "BI", "CM", "CV", "CF", "TD", "KM", "CG", "CD", "DJ", "EG", "GQ", "ER", "SZ", "ET", "GA", "GM", "GH", "GN", "GW", "CI", "KE", "LS", "LR", "LY", "MG", "MW", "ML", "MR", "MU", "YT", "MA", "MZ", "NA", "NE", "NG", "ST", "RE", "RW", "ST", "SN", "SC", "SL", "SO", "ZA", "SS", "SH", "SD", "SZ", "TZ", "TG", "TN", "UG", "CD", "ZM", "TZ", "ZW"]

In [23]:
loans_full_complete = pd.DataFrame()
loans_details_complete = pd.DataFrame()

for code in country_codes_with_data:
    loans_full, loans_details = extract_loans(n_pages=1, country_iso_code=code)  
    loans_full_complete.append(loans_full)
    loans_details_complete.append(loans_details)
    break

Requesting BJ page: 1


KeyError: 'loans'

In [None]:
loans_details

In [None]:
loans_full_complete.shape

# TODO: Refactor below this cell 

-----

Let's save our data!

In [None]:
path = 'data/'
loans_full.to_csv(path+'loans_full_large.csv')
loans_details.to_csv(path+'loans_details.csv')

We will also pull data about Kiva's partners, who are the intermediaries between Kiva and end recipients for the majority of loans made.

In [None]:
data_path = '~/intro_to_machine_learning/data'
df=pd.read_csv(data_path+'/loans_details.csv', low_memory=False)
df.head()

In [None]:
df=df[df['partner_id'].notnull()]
df['partner_id']=df['partner_id'].astype(int).astype(str)
partner_id=df['partner_id'].unique().tolist()

In [None]:
d = r.get('https://api.kivaws.org/v1/partners/322.json?app_id=org.deltanalytics')

In [None]:
d.headers

In [None]:
d.json();

In [None]:
def extract_loan_partners(partner_ids):
    loan_partner_details=pd.DataFrame()
    for n in partner_ids:
        d = r.get('https://api.kivaws.org/v1/partners/'+n+'.json?app_id=org.deltanalytics')
        data = json.loads(d.text)
        partners=json_normalize(data['partners'])
        loan_partner_details=loan_partner_details.append(partners, ignore_index=True)
    
    return loan_partner_details  

In [None]:
loan_partner_details=extract_loan_partners(partner_id)

In [None]:
loan_partner_details.to_csv('~/intro_to_machine_learning/data/loans_partner_details.csv')

## Pulling each loan's lender details

Now that we have selected a subset of loans to focus in on, we want to pull all of the lenders of these loans. We do so here by first creating a list of the loans we are interested in. 

In [None]:
loan_ids = df['id'].astype(int).astype(str)
loan_ids = loan_ids.unique().tolist()
loan_ids[0:5]

Here we define a function that will extract details of every lender who has contributed to a single loan, and then loop it over the list of loans that we created in the previous step:

In [None]:
def extract_loan_lenders(loan_ids):
    loan_lenders_details=pd.DataFrame()
    for n in loan_ids:
        d = r.get('https://api.kivaws.org/v1/loans/'+n+'/lenders.json?app_id=org.deltanalytics')
        data = json.loads(d.text)
        if len(data['lenders']) != 0:
            lenders=json_normalize(data['lenders'])
        else: lenders = pd.DataFrame()
        lenders['loan_id'] = n
        loan_lenders_details = loan_lenders_details.append(lenders, ignore_index=True)

    return loan_lenders_details  

In [None]:
loan_lenders_details = extract_loan_lenders(loan_ids)

We then write the output to a data frame. Let's take a look at what we've got! 

In [None]:
loan_lenders_details.head(100)

In [None]:
loan_lenders_details.to_csv('~/intro_to_machine_learning/data/loans_lenders_details.csv')