Get Kiva data
=====

In [81]:
reset -fs

In [82]:
import os
import json
import logging
import requests
import time
from urllib.request import urlopen, Request

from pandas.io.json import json_normalize
import pandas as pd

In [83]:
# # # Code for understanding KIVA api
# for code in [None]:
#     print(code)
#     url = f'http://api.kivaws.org/v1/loans/search.json?country_code={code}&per_page=5'
#     r = requests.get(url)
#     # r.status_code
#     print(r.json()['loans'][0])
#     break


# # Find number of pages
# url = f'http://api.kivaws.org/v1/loans/search.json?country_code={country_iso_code}'
# r = requests.get(url)
# data = json.loads(r.text)
# data

In [84]:
# Africian contry codes
country_codes_without_data = ["DZ", "AO", "SH", "CV", "CF", "TD", "KM", "DJ","GQ", "ER", "SZ",  "ET",  "GA", "GM", "GN",  "GW",    "LY",  "MU", "YT", "MA",  "NE",  "ST",  "RE", "ST",   "SC",  "SH", "SD","SZ","TN",  ]

country_codes_with_data = {"BJ", "BW", "BF", "BI", "CM", "CG", "CD", "EG", 
                           "GH", "CI", "KE", "LS", "LR", "MG", "MW", "ML", 
                           "MR", "MZ", "NA", "NG", "RW", "SN", "SL", "SO",
                           "ZA", "SS", "TZ", "TG", "UG", "ZM", "ZW"}

In [85]:
loans_complete = pd.DataFrame()
print("Requesting…")

for country_iso_code in country_codes_with_data: 
    print(f"Country: {country_iso_code}")
    n_pages = 10
    for page in range(1, n_pages+1):
        print(f"\t Page: {page}")
        url = f'http://api.kivaws.org/v1/loans/search.json?country_code={country_iso_code}&page={page}'
        r = requests.get(url)
        if r.status_code == 403:
            raise Exception("Too many requests. Slow down 🐢")
        
        data = json.loads(r.text)
        if (data['paging']['total'] == 0) or (not data['loans']):
            break # Stop if there are no loans
            
        loans = json_normalize(data['loans'])
        
        # Take the loan id column from our retrieved loan data
        # Use it to pull additional details about our loans.
        loan_ids = loans['id'].tolist()
        loan_ids_str = ','.join(str(e) for e in loan_ids)

        r = requests.get('https://api.kivaws.org/v1/loans/'+loan_ids_str+'.json')
        ld_data = json.loads(r.text)
        loans_ext = json_normalize(ld_data['loans'])

        # Join on two dataframes
        temp = pd.merge(loans, 
                         loans_ext, 
                         how='inner', 
                         on='id', 
                         sort=False,
                         left_on=None, right_on=None, left_index=False, right_index=False, 
                         suffixes=('_x', '_y'), copy=True, indicator=False, validate=None)
        loans_complete = loans_complete.append(temp)
        
        time.sleep(1.2) # Wait a little while to so we don't overload Kiva servers

Requesting…
Country: BJ
	 Page: 1
	 Page: 2


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


	 Page: 3
	 Page: 4
	 Page: 5
	 Page: 6
	 Page: 7
	 Page: 8
	 Page: 9
	 Page: 10
Country: BW
	 Page: 1
	 Page: 2
Country: BF
	 Page: 1
	 Page: 2
	 Page: 3
	 Page: 4
	 Page: 5
	 Page: 6
	 Page: 7
	 Page: 8
	 Page: 9
	 Page: 10
Country: BI
	 Page: 1
	 Page: 2
	 Page: 3
	 Page: 4
	 Page: 5
	 Page: 6
	 Page: 7
	 Page: 8
	 Page: 9
	 Page: 10
Country: CM
	 Page: 1
	 Page: 2
	 Page: 3
	 Page: 4
	 Page: 5
	 Page: 6
	 Page: 7
	 Page: 8
	 Page: 9
	 Page: 10
Country: CG
	 Page: 1
	 Page: 2
	 Page: 3
	 Page: 4
	 Page: 5
	 Page: 6
	 Page: 7
	 Page: 8
	 Page: 9
	 Page: 10
Country: CD
	 Page: 1
	 Page: 2
	 Page: 3
	 Page: 4
	 Page: 5
	 Page: 6
	 Page: 7
	 Page: 8
	 Page: 9
	 Page: 10
Country: EG
	 Page: 1
	 Page: 2
	 Page: 3
	 Page: 4
	 Page: 5
	 Page: 6
	 Page: 7
	 Page: 8
	 Page: 9
	 Page: 10
Country: GH
	 Page: 1
	 Page: 2
	 Page: 3
	 Page: 4
	 Page: 5
	 Page: 6
	 Page: 7
	 Page: 8
	 Page: 9
	 Page: 10
Country: CI
	 Page: 1
	 Page: 2
	 Page: 3
	 Page: 4
	 Page: 5
	 Page: 6
	 Page: 7
	 Page: 8
	 Pa

In [90]:
# Rename columns
loans_complete.columns = [c.replace('.', '_') for c in loans_complete.columns]

In [91]:
loans_complete.shape

(6019, 82)

In [92]:
loans_complete.tail(n=2)

Unnamed: 0,activity_x,activity_y,basket_amount_x,basket_amount_y,bonus_credit_eligibility_x,bonus_credit_eligibility_y,borrower_count,borrowers,currency_exchange_loss_amount_x,currency_exchange_loss_amount_y,...,use_x,use_y,video_id_x,video_id_y,video_thumbnailImageId_x,video_thumbnailImageId_y,video_title_x,video_title_y,video_youtubeId_x,video_youtubeId_y
18,Grocery Store,Grocery Store,,,True,True,1,"[{'first_name': 'Jacqueline ', 'last_name': ''...",,,...,her to buy goods to sell in her store.,her to buy goods to sell in her store.,,,,,,,,
19,Grocery Store,Grocery Store,,,True,True,1,"[{'first_name': 'Delligent ', 'last_name': '',...",,,...,to buy grocery goods for her business.,to buy grocery goods for her business.,,,,,,,,


In [93]:
loans_complete.columns.tolist()

['activity_x',
 'activity_y',
 'basket_amount_x',
 'basket_amount_y',
 'bonus_credit_eligibility_x',
 'bonus_credit_eligibility_y',
 'borrower_count',
 'borrowers',
 'currency_exchange_loss_amount_x',
 'currency_exchange_loss_amount_y',
 'description_languages_x',
 'description_languages_y',
 'description_texts_en',
 'description_texts_es',
 'description_texts_fr',
 'description_texts_pt',
 'funded_amount_x',
 'funded_amount_y',
 'funded_date',
 'id',
 'image_id_x',
 'image_id_y',
 'image_template_id_x',
 'image_template_id_y',
 'journal_totals_bulkEntries',
 'journal_totals_entries',
 'lender_count_x',
 'lender_count_y',
 'loan_amount_x',
 'loan_amount_y',
 'location_country_code_x',
 'location_country_code_y',
 'location_country_x',
 'location_country_y',
 'location_geo_level_x',
 'location_geo_level_y',
 'location_geo_pairs_x',
 'location_geo_pairs_y',
 'location_geo_type_x',
 'location_geo_type_y',
 'location_town_x',
 'location_town_y',
 'name_x',
 'name_y',
 'partner_id_x',
 'par

In [98]:
# Select and reorder columns
selected_cols = ['id', 
                 'loan_amount_x', 
                 'lender_count_x',
                 'status_x',
                 'funded_date',
                 'funded_amount_x',
                 'terms_repayment_term',
                 'location_country_code_x',
                 'sector_x',  
                 'description_texts_en',
                 'use_x',

                ]
loans_select = loans_complete[selected_cols]

# Rename columns
loans_select = loans_select.rename(columns={'id':                'id_number', 
                                            'sector_x':          'sector',
                                            'status_x':          'status',
                                            'loan_amount_x':     'loan_amount',
                                            'lender_count_x':    'lender_count',
                                            'use_x':             'use',
                                            'funded_amount_x':   'funded_amount',
                                            'loan_amount_x':     'loan_amount',
                                            'terms_repayment_term':    'repayment_term',
                                            'location_country_code_x': 'location_country_code',
                                            'description_texts_en':    'description'
                                           })


loans_select.tail(n=2)

Unnamed: 0,id_number,loan_amount,lender_count,status,funded_date,funded_amount,repayment_term,location_country_code,sector,description,use
18,1568887,200,8,funded,2018-07-18T23:38:44Z,200,14,ZW,Food,Jacqueline is a 23-year-old entrepreneur who l...,her to buy goods to sell in her store.
19,1568890,200,8,funded,2018-07-19T16:54:18Z,200,14,ZW,Food,Delligent is a 23-year-old entrepreneur who li...,to buy grocery goods for her business.


In [99]:
loans_select.columns.tolist()

['id_number',
 'loan_amount',
 'lender_count',
 'status',
 'funded_date',
 'funded_amount',
 'repayment_term',
 'location_country_code',
 'sector',
 'description',
 'use']

-----

Let's save our data!

In [100]:
path = './'
loans_select.to_csv(path+'loans.csv',
                   index=False)

Not used
-----

We will also pull data about Kiva's partners, who are the intermediaries between Kiva and end recipients for the majority of loans made.

In [None]:
# data_path = '~/intro_to_machine_learning/data'
# df=pd.read_csv(data_path+'/loans_details.csv', low_memory=False)
# df.head()

In [None]:
# df=df[df['partner_id'].notnull()]
# df['partner_id']=df['partner_id'].astype(int).astype(str)
# partner_id=df['partner_id'].unique().tolist()

In [None]:
# d = r.get('https://api.kivaws.org/v1/partners/322.json?app_id=org.deltanalytics')

In [None]:
# d.headers

In [None]:
# d.json();

In [None]:
# def extract_loan_partners(partner_ids):
#     loan_partner_details=pd.DataFrame()
#     for n in partner_ids:
#         d = r.get('https://api.kivaws.org/v1/partners/'+n+'.json?app_id=org.deltanalytics')
#         data = json.loads(d.text)
#         partners=json_normalize(data['partners'])
#         loan_partner_details=loan_partner_details.append(partners, ignore_index=True)
    
#     return loan_partner_details  

In [None]:
# loan_partner_details=extract_loan_partners(partner_id)

In [None]:
# loan_partner_details.to_csv('~/intro_to_machine_learning/data/loans_partner_details.csv')

## Pulling each loan's lender details

Now that we have selected a subset of loans to focus in on, we want to pull all of the lenders of these loans. We do so here by first creating a list of the loans we are interested in. 

In [None]:
# # loan_ids = df['id'].astype(int).astype(str)
# loan_ids = loan_ids.unique().tolist()
# loan_ids[0:5]

Here we define a function that will extract details of every lender who has contributed to a single loan, and then loop it over the list of loans that we created in the previous step:

In [None]:
# def extract_loan_lenders(loan_ids):
#     loan_lenders_details=pd.DataFrame()
#     for n in loan_ids:
#         d = r.get('https://api.kivaws.org/v1/loans/'+n+'/lenders.json?app_id=org.deltanalytics')
#         data = json.loads(d.text)
#         if len(data['lenders']) != 0:
#             lenders=json_normalize(data['lenders'])
#         else: lenders = pd.DataFrame()
#         lenders['loan_id'] = n
#         loan_lenders_details = loan_lenders_details.append(lenders, ignore_index=True)

#     return loan_lenders_details  

In [None]:
# loan_lenders_details = extract_loan_lenders(loan_ids)

We then write the output to a data frame. Let's take a look at what we've got! 

In [None]:
# loan_lenders_details.head(100)

In [None]:
# loan_lenders_details.to_csv('~/intro_to_machine_learning/data/loans_lenders_details.csv')