In [None]:
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pylab
import seaborn as sns


plt.style.use('fivethirtyeight')
%matplotlib inline
pylab.rcParams['figure.figsize'] = (15.0, 8.0)

In [None]:
# Read kiva_loans.csv from the ../input directory into a DataFrame.
loans_df = pd.read_csv("../input/kiva_loans.csv")

In [None]:
# Preview the first rows to see which columns are available.
loans_df.head()

## Number of loans by country
- Since Kiva extends services to financially excluded people around the globe, it makes sense that the countries where the most loans are given out are developing nations like the Philippines and Kenya.

In [None]:
# Count loan ids per country and draw the bars from fewest to most loans.
loans_per_country = loans_df.groupby("country")["id"].count()
ax = loans_per_country.sort_values().plot.bar()
ax.set_title("Loan Count by Country");

## Most popular sectors in which loans are taken
- Agriculture, food and retail are the most popular sectors in which loans are taken

In [None]:
# Count loan ids per sector and draw the bars from fewest to most loans.
loans_per_sector = loans_df.groupby("sector")["id"].count()
ax = loans_per_sector.sort_values().plot.bar()
ax.set_title("Loan Count by Sector");

## Distribution of Loan duration
Most loans are short-term loans of less than 24 months (two years).

In [None]:
# Switch the default figure to a square shape. This edits the global default,
# so it also applies to every figure drawn after this cell.
# plt.rcParams is the same settings object pylab re-exports; pylab is discouraged.
plt.rcParams['figure.figsize'] = (8.0, 8.0)
# Histogram of the loan term in months, using 100 bins.
ax = loans_df.term_in_months.plot.hist(bins=100)
ax.set_title("Loan Count by Loan Duration");

## Distribution of number of lenders
- Most loans have between 1 and 150 lenders. A few outliers have far more, with the maximum number of lenders on a single loan being 2986.

In [None]:
# Box plot of the number of lenders per loan.
lender_counts = loans_df["lender_count"]
ax = lender_counts.plot.box()
ax.set_title("Distribution of Number of Lenders per loan");

In [None]:
# Histogram of lender counts with the x axis clipped to 0-500.
# The 1000 bins are still computed over the full data range; only the view is narrowed.
ax = loans_df["lender_count"].plot.hist(bins=1000)
ax.set_xlim(0, 500)
ax.set_title("Distribution of Number of Lenders where number < 500");

In [None]:
# Largest number of lenders on a single loan. Series.max() is vectorised and skips
# missing values, whereas the builtin max() iterates over the values in Python.
loans_df.lender_count.max()

## Gender of borrowers
- Female-only borrowers (single or group) significantly outnumber male-only borrowers and mixed groups.

In [None]:
def process_gender(x):
    """Collapse a borrower-gender listing into a single group label.

    Parameters
    ----------
    x : str or missing value
        Comma-separated genders, one entry per borrower (for example
        ``"female, female, male"``), or a missing value (``None`` / NaN).
        A label previously returned by this function is also accepted.

    Returns
    -------
    str
        ``"MF"`` when both ``male`` and ``female`` appear, ``"F"`` when only
        ``female`` appears, ``"M"`` when only ``male`` appears, and ``"nan"``
        when the value is missing or contains neither gender.
    """
    # Missing values: isinstance (rather than an exact type check) also matches
    # numpy float NaNs, and None is handled instead of failing on .split().
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return "nan"

    # Labels this function already produced pass through unchanged, so applying
    # it twice to the same column does not wipe out the earlier result.
    if x in ("MF", "F", "M", "nan"):
        return x

    genders = {g.strip() for g in x.split(",")}
    has_male = "male" in genders
    has_female = "female" in genders

    if has_male and has_female:
        return "MF"
    if has_female:
        return "F"
    if has_male:
        return "M"
    # Neither gender was listed: report it as unknown rather than returning None.
    return "nan"

In [None]:
# Replace each raw gender listing with its group label; the column is overwritten in place.
loans_df["borrower_genders"] = loans_df["borrower_genders"].apply(process_gender)

In [None]:
# Number of loans in each borrower-gender group, most common group first.
gender_counts = loans_df["borrower_genders"].value_counts()
ax = gender_counts.plot.bar()
ax.set_title("Loan Count by Gender of Borrower");

## Distribution of Loan Amount
- We will consider the funded_amount variable as this is the amount which is disbursed to the borrower by the field agent
- As all amounts are in USD, no currency conversion is required
- Most of the values are below \$2,000, with 8% of all loans lying above this value

In [None]:
# Box plot of the funded amount per loan.
funded_amounts = loans_df["funded_amount"]
ax = funded_amounts.plot.box()
ax.set_title("Distribution of Loan Funded Amount");

In [None]:
# Upper whisker of the funded amount: Q3 + 1.5 * IQR.
funded = loans_df["funded_amount"]
q1 = funded.quantile(0.25)
q3 = funded.quantile(0.75)
IQR = q3 - q1
upper_whisker = q3 + 1.5 * IQR
# Keep the loans funded strictly above the whisker and report how many there are.
loans_above_upper_whisker = loans_df[funded > upper_whisker]
loans_above_upper_whisker.shape

In [None]:
# Share of all loans that lie above the upper whisker
# (a fraction between 0 and 1, not a percentage).
len(loans_above_upper_whisker) / len(loans_df)

### Analysis of loan amount below \$2,000
- The distribution is skewed to the right with higher loan amounts being less common

In [None]:
# Loans funded at or below the upper whisker. The comparison is inclusive so a
# loan funded at exactly the whisker value is not lost between this subset and
# the strictly-above subset (funded_amount > upper_whisker).
loans_below_upper_whisker = loans_df[loans_df.funded_amount <= upper_whisker]

In [None]:
# Histogram of funded amounts for the loans that are not outliers.
ax = loans_below_upper_whisker["funded_amount"].plot.hist()
ax.set_title("Distribution of Loan Funded amount < $2000");

### Analysis of loan amount \$2,000 - \$20,000
- Most of the outliers lie in this range

In [None]:
# Outlier loans (above the upper whisker) funded below 20000.
df = loans_above_upper_whisker[loans_above_upper_whisker.funded_amount < 20000]
ax = df.funded_amount.plot.hist()
# Raw string: "\$" is not a valid Python escape sequence, but the backslash must
# reach Matplotlib so that the dollar signs are drawn literally.
ax.set_title(r"Distribution of Loan Funded Amount between \$2,000 and \$20,000");
# Number of loans (rows) and columns in this range.
df.shape

### Analysis of loan amount \$20,000 to \$60,000
- A few values lie in this range
- Most of the high value loans are disbursed for Agriculture and Retail

In [None]:
# Outlier loans funded between 20000 and 60000. Both bounds are inclusive
# (Series.between default): the neighbouring cells use strict comparisons
# (< 20000 and > 60000), so strict bounds here would silently drop loans funded
# at exactly 20000 or 60000 from every range.
df = loans_above_upper_whisker[loans_above_upper_whisker.funded_amount.between(20000, 60000)]
ax = df.funded_amount.plot.hist()
# Raw string: "\$" is not a valid Python escape sequence, but the backslash must
# reach Matplotlib so that the dollar signs are drawn literally.
ax.set_title(r"Distribution of Loan Funded Amount between \$20,000 and \$60,000");
# Number of loans (rows) and columns in this range.
df.shape

In [None]:
# Sector breakdown of the loans currently held in `df`, fewest to most.
ax = df.sector.value_counts().sort_values().plot.bar()
# Raw string: "\$" is not a valid Python escape sequence, but the backslash must
# reach Matplotlib so that the dollar signs are drawn literally.
ax.set_title(r"Loan Count by Sector for Loan Amount between \$20,000 and \$60,000");

### Loan amount above \$60,000
- There is only a single loan in this range: a \$100,000 loan disbursed for Agriculture in Haiti.


In [None]:
# Show every loan funded above 60000.
loans_df.loc[loans_df["funded_amount"] > 60000]