Get Kiva data
=====

In [64]:
reset -fs

In [65]:
import os
import json
import logging
import requests
import time
from urllib.request import urlopen, Request

from pandas.io.json import json_normalize
import pandas as pd

In [66]:
# # # Debugging code
# for code in [None]:
#     print(code)
#     url = f'http://api.kivaws.org/v1/loans/search.json?country_code={code}&per_page=5'
#     r = requests.get(url)
#     # r.status_code
#     print(r.json()['loans'][0])
#     break

In [67]:
# Africian contry codes
country_codes_without_data = ["DZ", "AO", "SH", "CV", "CF", "TD", "KM", "DJ","GQ", "ER", "SZ",  "ET",  "GA", "GM", "GN",  "GW",    "LY",  "MU", "YT", "MA",  "NE",  "ST",  "RE", "ST",   "SC",  "SH", "SD","SZ","TN",  ]

country_codes_with_data = ["BJ", "BW", "BF", "BI", "CM", "CG", "CD", "EG", 
                           "GH", "CI", "KE", "LS", "LR", "MG", "MW", "ML", 
                           "MR", "MZ", "NA", "NG", "RW", "SN", "SL", "SO",
                           "ZA", "SS", "TZ", "TG", "UG", "CD", "ZM", "TZ", "ZW"]

In [68]:
loans_complete = pd.DataFrame()
print("Requesting…")

for country_iso_code in country_codes_with_data:   
#     n_pages = 1
#     for page in range(1, n_pages+1):
        page = 1
        print(f"Country: {country_iso_code}; Page: {page}")
        n_per_page = 10
        url = f'http://api.kivaws.org/v1/loans/search.json?country_code={country_iso_code}&per_page={n_per_page}&page={page}'
        r = requests.get(url)
        if r.status_code == 403:
            raise Exception("Too many requests. Slow down 🐢")
        
        data = json.loads(r.text)
        if data['paging']['total'] == 0:
            break # Stop if there are no loans
            
        loans = json_normalize(data['loans'])
        
        # Take the loan id column from our retrieved loan data, and use it to pull additional details
        # about our loans. We store this in a seperate table loans_details
        loan_ids = loans['id'].tolist()
        loan_ids_str = ','.join(str(e) for e in loan_ids)

        r = requests.get('https://api.kivaws.org/v1/loans/'+loan_ids_str+'.json')
        ld_data = json.loads(r.text)
        loans_ext = json_normalize(ld_data['loans'])

        # Join on two dataframes
        temp = pd.merge(loans, 
                 loans_ext, 
                 how='inner', 
                 on='id', 
                 sort=True,
                 left_on=None, right_on=None, left_index=False, right_index=False, 
                 suffixes=('_x', '_y'), copy=True, indicator=False, validate=None)
        loans_complete = loans_complete.append(temp)
         
#         print("The number of loans on current page: {}".format(len(loans_full.index)))
        time.sleep(1) # Wait a little while to so we don't overload Kiva servers

Requesting…
Country: BJ; Page: 1
Country: BW; Page: 1


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Country: BF; Page: 1
Country: BI; Page: 1
Country: CM; Page: 1
Country: CG; Page: 1
Country: CD; Page: 1
Country: EG; Page: 1
Country: GH; Page: 1
Country: CI; Page: 1
Country: KE; Page: 1
Country: LS; Page: 1
Country: LR; Page: 1
Country: MG; Page: 1
Country: MW; Page: 1
Country: ML; Page: 1
Country: MR; Page: 1
Country: MZ; Page: 1
Country: NA; Page: 1
Country: NG; Page: 1
Country: RW; Page: 1
Country: SN; Page: 1
Country: SL; Page: 1
Country: SO; Page: 1
Country: ZA; Page: 1
Country: SS; Page: 1
Country: TZ; Page: 1
Country: TG; Page: 1
Country: UG; Page: 1
Country: CD; Page: 1
Country: ZM; Page: 1
Too many requests. Slow down 🐢


KeyError: 'paging'

In [104]:
loans_complete.columns = [c.replace('.', '_') for c in loans_complete.columns]

In [105]:
loans_complete.shape

(282, 79)

In [106]:
loans_complete.tail(n=2)

Unnamed: 0,activity_x,activity_y,basket_amount_x,basket_amount_y,bonus_credit_eligibility_x,bonus_credit_eligibility_y,borrower_count,borrowers,description_languages_x,description_languages_y,...,use_x,use_y,video_id_x,video_id_y,video_thumbnailImageId_x,video_thumbnailImageId_y,video_title_x,video_title_y,video_youtubeId_x,video_youtubeId_y
8,Retail,Retail,0.0,0.0,False,False,20,"[{'first_name': ' Venantia', 'last_name': '', ...","[fr, en]","[fr, en]",...,to buy and transport 8 pigs to strengthen her ...,to buy and transport 8 pigs to strengthen her ...,,,,,,,,
9,Textiles,Textiles,0.0,0.0,False,False,26,"[{'first_name': ' Wema', 'last_name': '', 'gen...","[fr, en]","[fr, en]",...,"to stock up with sewing materials (cloth, thre...","to stock up with sewing materials (cloth, thre...",,,,,,,,


In [107]:
cols = loans_complete.columns.tolist()
cols

['activity_x',
 'activity_y',
 'basket_amount_x',
 'basket_amount_y',
 'bonus_credit_eligibility_x',
 'bonus_credit_eligibility_y',
 'borrower_count',
 'borrowers',
 'description_languages_x',
 'description_languages_y',
 'description_texts_en',
 'description_texts_fr',
 'description_texts_pt',
 'funded_amount_x',
 'funded_amount_y',
 'funded_date',
 'id',
 'image_id_x',
 'image_id_y',
 'image_template_id_x',
 'image_template_id_y',
 'journal_totals_bulkEntries',
 'journal_totals_entries',
 'lender_count_x',
 'lender_count_y',
 'loan_amount_x',
 'loan_amount_y',
 'location_country_code_x',
 'location_country_code_y',
 'location_country_x',
 'location_country_y',
 'location_geo_level_x',
 'location_geo_level_y',
 'location_geo_pairs_x',
 'location_geo_pairs_y',
 'location_geo_type_x',
 'location_geo_type_y',
 'location_town_x',
 'location_town_y',
 'name_x',
 'name_y',
 'partner_id_x',
 'partner_id_y',
 'payments',
 'planned_expiration_date_x',
 'planned_expiration_date_y',
 'posted_dat

In [111]:
loans_complete.sector_x.tail(n=2)

8    Retail
9      Arts
Name: sector_x, dtype: object

In [112]:
# Select and rename columns 
#TODO: 
selected_cols = ['id', 
                 'loan_amount_x', 
                 'lender_count_x',
                 'status_x',
                 'funded_date',
                 'funded_amount_x',
                 'terms_repayment_term',
                 'location_country_code_x',
                 'sector_x',  
                 'description_texts_en',
                 'use_x',

                ]
loans_select = loans_complete[selected_cols]
loans_select = loans_select.rename(columns={'id': 'id_number', 
                                            'sector_x': 'sector',
                                            'status_x': 'status',
                                           'loan_amount_x': 'loan_amount',
                                            'lender_count_x': 'lender_count',
                                           'location_country_code_x': 'location_country_code',
                                           'use_x': 'use',
                                            'funded_amount_x': 'funded_amount',
                                            'loan_amount_x': 'loan_amount',
                                            'terms_repayment_term': 'repayment_term'
                                           })


loans_select.tail(n=2)

Unnamed: 0,id_number,loan_amount,lender_count,status,funded_date,funded_amount,repayment_term,location_country_code,sector,description_texts_en,use
8,1567020,5250,2,fundraising,,50,6,CD,Retail,Aimercianne has been the representative of the...,to buy and transport 8 pigs to strengthen her ...
9,1567122,6200,4,fundraising,,100,6,CD,Arts,"Wema is an entrepreneurial woman, owner of a s...","to stock up with sewing materials (cloth, thre..."


In [116]:
loans_select.columns.tolist()

['id_number',
 'loan_amount',
 'lender_count',
 'status',
 'funded_date',
 'funded_amount',
 'repayment_term',
 'location_country_code',
 'sector',
 'description_texts_en',
 'use']

-----

Let's save our data!

In [113]:
path = './'
loans_select.to_csv(path+'loans.csv',
                   index=False)

Not used
-----

We will also pull data about Kiva's partners, who are the intermediaries between Kiva and end recipients for the majority of loans made.

In [None]:
# data_path = '~/intro_to_machine_learning/data'
# df=pd.read_csv(data_path+'/loans_details.csv', low_memory=False)
# df.head()

In [None]:
# df=df[df['partner_id'].notnull()]
# df['partner_id']=df['partner_id'].astype(int).astype(str)
# partner_id=df['partner_id'].unique().tolist()

In [None]:
# d = r.get('https://api.kivaws.org/v1/partners/322.json?app_id=org.deltanalytics')

In [None]:
# d.headers

In [None]:
# d.json();

In [None]:
# def extract_loan_partners(partner_ids):
#     loan_partner_details=pd.DataFrame()
#     for n in partner_ids:
#         d = r.get('https://api.kivaws.org/v1/partners/'+n+'.json?app_id=org.deltanalytics')
#         data = json.loads(d.text)
#         partners=json_normalize(data['partners'])
#         loan_partner_details=loan_partner_details.append(partners, ignore_index=True)
    
#     return loan_partner_details  

In [None]:
# loan_partner_details=extract_loan_partners(partner_id)

In [None]:
# loan_partner_details.to_csv('~/intro_to_machine_learning/data/loans_partner_details.csv')

## Pulling each loan's lender details

Now that we have selected a subset of loans to focus in on, we want to pull all of the lenders of these loans. We do so here by first creating a list of the loans we are interested in. 

In [None]:
# # loan_ids = df['id'].astype(int).astype(str)
# loan_ids = loan_ids.unique().tolist()
# loan_ids[0:5]

Here we define a function that will extract details of every lender who has contributed to a single loan, and then loop it over the list of loans that we created in the previous step:

In [None]:
# def extract_loan_lenders(loan_ids):
#     loan_lenders_details=pd.DataFrame()
#     for n in loan_ids:
#         d = r.get('https://api.kivaws.org/v1/loans/'+n+'/lenders.json?app_id=org.deltanalytics')
#         data = json.loads(d.text)
#         if len(data['lenders']) != 0:
#             lenders=json_normalize(data['lenders'])
#         else: lenders = pd.DataFrame()
#         lenders['loan_id'] = n
#         loan_lenders_details = loan_lenders_details.append(lenders, ignore_index=True)

#     return loan_lenders_details  

In [None]:
# loan_lenders_details = extract_loan_lenders(loan_ids)

We then write the output to a data frame. Let's take a look at what we've got! 

In [None]:
# loan_lenders_details.head(100)

In [None]:
# loan_lenders_details.to_csv('~/intro_to_machine_learning/data/loans_lenders_details.csv')