# Foundations of Computer Science Project - Kiva Crowdfunding

In [1]:
import numpy as np, pandas as pd

### 1. Normalize the loan_lenders table. In the normalized table, each row must have one loan_id and one lender.

In [2]:
lenders = pd.read_csv('additional-kiva-snapshot/loans_lenders.csv', nrows=5)
lenders.head()

Unnamed: 0,loan_id,lenders
0,483693,"muc888, sam4326, camaran3922, lachheb1865, reb..."
1,483738,"muc888, nora3555, williammanashi, barbara5610,..."
2,485000,"muc888, terrystl, richardandsusan8352, sherri4..."
3,486087,"muc888, james5068, rudi5955, daniel9859, don92..."
4,534428,"muc888, niki3008, teresa9174, mike4896, david7..."


In [3]:
#dataframe gets huge after splitting, declaring data types to optimize memory management
lenders = pd.read_csv('additional-kiva-snapshot/loans_lenders.csv', dtype={'loan_id': np.uint32})

loans = pd.Series(lenders['loan_id'])
#splitting each lenders list into separate names
lenders = lenders['lenders'].str.split(',').explode()
lenders = {'loan_id': loans, 'lender': lenders}
lenders = pd.DataFrame(lenders)

lenders.head()

Unnamed: 0,loan_id,lender
0,483693,muc888
0,483693,sam4326
0,483693,camaran3922
0,483693,lachheb1865
0,483693,rebecca3499


### 2. For each loan, add a column duration corresponding to the number of days between the disburse time and the planned expiration time. If any of those two dates is missing, also the duration must be missing.

In [4]:
loans = pd.read_csv('additional-kiva-snapshot/loans.csv', nrows=5)
loans.head()

Unnamed: 0,loan_id,loan_name,original_language,description,description_translated,funded_amount,loan_amount,status,activity_name,sector_name,...,raised_time,lender_term,num_lenders_total,num_journal_entries,num_bulk_entries,tags,borrower_genders,borrower_pictured,repayment_interval,distribution_model
0,657307,Aivy,English,"Aivy, 21 years of age, is single and lives in ...",,125.0,125.0,funded,General Store,Retail,...,2014-01-15 04:48:22.000 +0000,7.0,3,2,1,,female,True,irregular,field_partner
1,657259,Idalia Marizza,Spanish,"Doña Idalia, esta casada, tiene 57 años de eda...","Idalia, 57, is married and lives with her husb...",400.0,400.0,funded,Used Clothing,Clothing,...,2014-02-25 06:42:06.000 +0000,8.0,11,2,1,,female,True,monthly,field_partner
2,658010,Aasia,English,Aasia is a 45-year-old married lady and she ha...,,400.0,400.0,funded,General Store,Retail,...,2014-01-24 23:06:18.000 +0000,14.0,16,2,1,"#Woman Owned Biz, #Supporting Family, user_fav...",female,True,monthly,field_partner
3,659347,Gulmira,Russian,"Гулмире 36 лет, замужем, вместе с супругом вос...",Gulmira is 36 years old and married. She and ...,625.0,625.0,funded,Farming,Agriculture,...,2014-01-22 05:29:28.000 +0000,14.0,21,2,1,user_favorite,female,True,monthly,field_partner
4,656933,Ricky\t,English,Ricky is a farmer who currently cultivates his...,,425.0,425.0,funded,Farming,Agriculture,...,2014-01-14 17:29:27.000 +0000,7.0,15,2,1,"#Animals, #Eco-friendly, #Sustainable Ag",male,True,bullet,field_partner


In [5]:
#big dataset: unused attributes not imported and data types declared in order to save space
loans.columns

Index(['loan_id', 'loan_name', 'original_language', 'description',
       'description_translated', 'funded_amount', 'loan_amount', 'status',
       'activity_name', 'sector_name', 'loan_use', 'country_code',
       'country_name', 'town_name', 'currency_policy',
       'currency_exchange_coverage_rate', 'currency', 'partner_id',
       'posted_time', 'planned_expiration_time', 'disburse_time',
       'raised_time', 'lender_term', 'num_lenders_total',
       'num_journal_entries', 'num_bulk_entries', 'tags', 'borrower_genders',
       'borrower_pictured', 'repayment_interval', 'distribution_model'],
      dtype='object')

In [6]:
loans = pd.read_csv('additional-kiva-snapshot/loans.csv',
                    usecols=['loan_id', 'funded_amount', 'loan_amount', 'status', 'country_code', 'country_name', 
                             'planned_expiration_time', 'disburse_time', 'num_lenders_total', 'repayment_interval'],
                    dtype={'loan_id': np.uint32, 'funded_amount': np.uint32, 'loan_amount': np.uint32,
                           'status': 'category', 'country_code': 'category', 'country_name': 'category',
                          'planned_expiration_time': 'str', 'disburse_time': 'str',
                           'num_lenders_total': np.uint32, 'repayment_interval': 'category'})
loans.head()

Unnamed: 0,loan_id,funded_amount,loan_amount,status,country_code,country_name,planned_expiration_time,disburse_time,num_lenders_total,repayment_interval
0,657307,125,125,funded,PH,Philippines,2014-02-14 03:30:06.000 +0000,2013-12-22 08:00:00.000 +0000,3,irregular
1,657259,400,400,funded,HN,Honduras,2014-03-26 22:25:07.000 +0000,2013-12-20 08:00:00.000 +0000,11,monthly
2,658010,400,400,funded,PK,Pakistan,2014-02-15 21:10:05.000 +0000,2014-01-09 08:00:00.000 +0000,16,monthly
3,659347,625,625,funded,KG,Kyrgyzstan,2014-02-21 03:10:02.000 +0000,2014-01-17 08:00:00.000 +0000,21,monthly
4,656933,425,425,funded,PH,Philippines,2014-02-13 06:10:02.000 +0000,2013-12-17 08:00:00.000 +0000,15,bullet


In [7]:
loans['disburse_time'].isnull().value_counts()

False    1416794
True        2813
Name: disburse_time, dtype: int64

In [8]:
loans['planned_expiration_time'].isnull().value_counts()

False    1047773
True      371834
Name: planned_expiration_time, dtype: int64

In [9]:
loans['duration'] = (loans['planned_expiration_time'].astype('datetime64') - loans['disburse_time'].astype('datetime64'))
loans.head()

Unnamed: 0,loan_id,funded_amount,loan_amount,status,country_code,country_name,planned_expiration_time,disburse_time,num_lenders_total,repayment_interval,duration
0,657307,125,125,funded,PH,Philippines,2014-02-14 03:30:06.000 +0000,2013-12-22 08:00:00.000 +0000,3,irregular,53 days 19:30:06
1,657259,400,400,funded,HN,Honduras,2014-03-26 22:25:07.000 +0000,2013-12-20 08:00:00.000 +0000,11,monthly,96 days 14:25:07
2,658010,400,400,funded,PK,Pakistan,2014-02-15 21:10:05.000 +0000,2014-01-09 08:00:00.000 +0000,16,monthly,37 days 13:10:05
3,659347,625,625,funded,KG,Kyrgyzstan,2014-02-21 03:10:02.000 +0000,2014-01-17 08:00:00.000 +0000,21,monthly,34 days 19:10:02
4,656933,425,425,funded,PH,Philippines,2014-02-13 06:10:02.000 +0000,2013-12-17 08:00:00.000 +0000,15,bullet,57 days 22:10:02


### 3. Find the lenders that have funded at least twice.

In [10]:
lenders_count = lenders.groupby('lender').count()
lenders_count.query('loan_id > 1')

Unnamed: 0_level_0,loan_id
lender,Unnamed: 1_level_1
000,39
00000,39
0002,70
0101craign0101,71
0132575,4
...,...
zyrorl,3
zzaman,11
zzanita,2
zzmcfate,56


### 4. For each country, compute how many loans have involved that country as borrowers.

In [11]:
loans.groupby('country_name')['loan_id'].count()

country_name
Afghanistan     2337
Albania         3075
Armenia        13952
Azerbaijan     10172
Belize           218
               ...  
Canada             1
Botswana           1
Bhutan             2
Mauritania         1
Uruguay            1
Name: loan_id, Length: 96, dtype: int64

### 5. For each country, compute the overall amount of money borrowed.

In [12]:
loans.groupby('country_name')['loan_amount'].sum()

country_name
Afghanistan     1967950
Albania         4307350
Armenia        22950475
Azerbaijan     14784625
Belize           150175
                 ...   
Canada            50000
Botswana           8000
Bhutan            20000
Mauritania        15000
Uruguay            8000
Name: loan_amount, Length: 96, dtype: uint32

### 6. Like the previous point, but expressed as a percentage of the overall amount lent.

In [13]:
 loans.groupby('country_name')['funded_amount'].sum() / loans.groupby('country_name')['loan_amount'].sum()

country_name
Afghanistan    0.998323
Albania        0.926898
Armenia        0.883846
Azerbaijan     0.959872
Belize         1.000000
                 ...   
Canada         1.000000
Botswana       1.000000
Bhutan         0.781250
Mauritania     1.000000
Uruguay        1.000000
Length: 96, dtype: float64

### 7. Like the three previous points, but split for each year (with respect to disburse time).

In [14]:
loans['disburse_year'] = loans['disburse_time'].str.split('-', expand= True)[0]
loans.groupby(['country_name','disburse_year'])['funded_amount'].sum()

country_name  disburse_year
Afghanistan   2005                  NaN
              2006                  NaN
              2007             194975.0
              2008             365375.0
              2009             581825.0
                                 ...   
Uruguay       2014                  NaN
              2015                  NaN
              2016                  NaN
              2017                  NaN
              2018               8000.0
Name: funded_amount, Length: 1344, dtype: float64

### 8. For each lender, compute the overall amount of money lent. For each loan that has more than one lender, you must assume that all lenders contributed the same amount.

In [15]:
lenders = lenders.join(loans[['loan_id','loan_amount','num_lenders_total']].set_index('loan_id'), on='loan_id')
lenders.head()

Unnamed: 0,loan_id,lender,loan_amount,num_lenders_total
0,483693,muc888,1225.0,44.0
0,483693,sam4326,1225.0,44.0
0,483693,camaran3922,1225.0,44.0
0,483693,lachheb1865,1225.0,44.0
0,483693,rebecca3499,1225.0,44.0


In [16]:
lenders['quota']= lenders['loan_amount'] / lenders['num_lenders_total']

In [17]:
lenders.head()

Unnamed: 0,loan_id,lender,loan_amount,num_lenders_total,quota
0,483693,muc888,1225.0,44.0,27.840909
0,483693,sam4326,1225.0,44.0,27.840909
0,483693,camaran3922,1225.0,44.0,27.840909
0,483693,lachheb1865,1225.0,44.0,27.840909
0,483693,rebecca3499,1225.0,44.0,27.840909


In [19]:
lenders.groupby('lender')['quota'].sum()

lender
 000               1485.309656
 00000             1249.947363
 0002              2201.180463
 00mike00            38.461538
 0101craign0101    2424.088932
                      ...     
zzanita              62.500000
zzcyna7269           44.285714
zzinnia              32.758621
zzmcfate           2033.750197
zzrvmf8538          112.820513
Name: quota, Length: 1639026, dtype: float64

### 9. For each country, compute the difference between the overall amount of money lent and the overall amount of money borrowed. Since the country of the lender is often unknown, you can assume that the true distribution among the countries is the same as the one computed from the rows where the country is known.

In [None]:
df3 = pd.read_csv('additional-kiva-snapshot/lenders.csv')

In [None]:
df3.head()

In [None]:
df3.groupby('country_code').size()

In [None]:
df3.shape

In [None]:
df3['country_code'].isnull().sum()

In [None]:
s=df3.country_code.value_counts(normalize= True)

In [None]:
s

In [None]:
isnull = df3.country_code.isnull()
sample = df3.country_code.dropna().sample(isnull.sum(), replace=True).values
df3.loc[isnull, 'country_code'] = sample

In [None]:
df3.country_code.head()

In [None]:
df3.head()

In [None]:
df3.columns

In [None]:
df3.groupby('permanent_name').sum().head()

In [None]:
df3.loc[df3.permanent_name=='muc888']

In [None]:
df9= df8_1.groupby('Finanziatori').quota.sum()
df9.head()

In [None]:
df9= pd.DataFrame(df9)
df9.head()

In [None]:
df9=df9.reset_index()
df9.head()

In [None]:
df9=df9.rename(columns={"Finanziatori": "permanent_name", "quota": "prestito_tot"})

In [None]:
df9.head()

In [None]:
df91= df3[['permanent_name','country_code']]

In [None]:
df91.head()

In [None]:
df_hope= pd.merge(df9,df91)

In [None]:
df_hope.head()

In [None]:
df9end = df_hope.groupby('country_code').prestito_tot.sum()

In [None]:
df9end.head()

In [None]:
df9end1=df2.groupby('country_code')['loan_amount'].sum()

In [None]:
df9end = pd.DataFrame(df9end)
df9end.head()

In [None]:
df9end1= pd.DataFrame(df9end1)
df9end1.head()


In [None]:
df9end=df9end.reset_index()
df9end.head()

In [None]:
df9end1 =df9end1.reset_index()
df9end1.head()

In [None]:
df9END= df9end.join(df9end1[['country_code','loan_amount']].set_index('country_code'), on='country_code')

In [None]:
df9END.head()

In [None]:
df9END=df9END.fillna(0)

In [None]:
df9END['diff'] = df9END['prestito_tot'] - df9END['loan_amount']

In [None]:
df9END.head()

### 10. Which country has the highest ratio between the difference computed at the previous point and the population?

In [None]:
df10 = pd.read_csv('additional-kiva-snapshot/country_stats.csv')

In [None]:
df10.head()

In [None]:
df10_1 = df10[['country_code','population']]

In [None]:
df10_1.head()

In [None]:
df10_2=pd.merge(df10_1,df9END)

In [None]:
df10_2.head()

In [None]:
df10_2['punto10'] = df10_2['diff'] / df10_2['population']

In [None]:
df10_2.head()

In [None]:
df10_2.loc[df10_2['punto10'].idxmax(),['country_code', 'punto10']]

### 11. Which country has the highest ratio between the difference computed at point 9 and the population that is not below the poverty line?

In [None]:
df11_1= df10[['country_code','population','population_below_poverty_line',]]

In [None]:
df11_1.head()

In [None]:
df11_1.population_below_poverty_line.dtype

In [None]:
df11_1['punto11'] = df11_1['population'] * (1- df11_1['population_below_poverty_line'] / 100)

In [None]:
df11_1.head()

In [None]:
df11_2 = pd.merge(df11_1 , df10_2)

In [None]:
df11_2.head()

In [None]:
df11_2['ratio'] = df11_2['diff'] / df11_2['punto11']

In [None]:
df11_2.loc[df11_2['ratio'].idxmax(),['country_code','ratio']]

In [None]:
df11_2.loc[df11_2.country_code=='LU',:]

### 12. For each year, compute the total amount of loans. Each loan that has planned expiration time and disburse time in different years must have its amount distributed proportionally to the number of days in each year. For example, a loan with disburse time December 1st, 2016, planned expiration time January 30th 2018, and amount 5000USD has an amount of 5000USD * 31 / (31+365+30) = 363.85 for 2016, 5000USD * 365 / (31+365+30) = 4284.04 for 2017, and 5000USD * 30 / (31+365+30) = 352.11 for 2018.

In [None]:
def day_calc(date1, date2):
    (begin,end) = (date1,date2) if date1<date2 else (date2,date1)
    date_list = list()
    date_list.append(begin)
    for y in range(begin.year+1, end.year+1):
        date_list.append(pd.Timestamp(year=y, month=1, day=1, tz='utc'))
    date_list.append(end)
    
    res = dict()
    for i in range(len(date_list)-1):
        res[date_list[i].year] = (date_list[i+1] - date_list[i]).days

    return pd.Series(res, name='DateValue')



In [None]:
df12= pd.read_csv('additional-kiva-snapshot/loans.csv')

In [None]:
df12['planned_expiration_time'] = pd.to_datetime(df12['planned_expiration_time'])
df12['disburse_time'] = pd.to_datetime(df12['disburse_time'])
df12['duration'] = (df12['planned_expiration_time'] - df12['disburse_time']).apply(lambda x: x.days)


In [None]:
data_loans = df12.dropna()
df_day_weighted = data_loans.apply(lambda x: day_calc(x['planned_expiration_time'],x['disburse_time']), axis = 1)
df_day_weighted.head()

In [None]:
df_day_weighted.fillna(0, inplace= True)
df_day_weighted = pd.concat([df_day_weighted, data_loans['loan_id']
                 , data_loans['loan_amount']
                 , data_loans['duration'] ]
                , axis = 1, sort = True)

df_day_weighted.head()

In [None]:
final = df_day_weighted[[2011,2012,2013,2014,2015,2016,2017,2018
                        ]].div(df_day_weighted.duration, axis=0).mul(df_day_weighted.loan_amount, axis = 0)

final = pd.concat([df_day_weighted['loan_id'], final],  axis = 1 )
final = final.melt(id_vars=["loan_id"], 
        var_name="Year", 
        value_name="amount_per_year")

final.groupby(['loan_id', 'Year']).sum()

### 13. For each value of repayment_interval, add a new column to the lenders dataframe that contains the total amount of money corresponding to loans in such state

In [None]:
df13 = df2.groupby('country_code')['repayment_interval'].sum()

In [None]:
df2.repayment_interval.head()

In [None]:
df3.columns

In [None]:
df2.columns

In [None]:
def attach_totals_to_lenders():

    # {'country_code' := NaN} è considerato uno stato

    sum_by_country = df2.groupby('country_code', as_index=False).sum()[['country_code', 'funded_amount']]

    to_cross = df3

    return pd.merge(to_cross, sum_by_country, left_on='country_code', right_on='country_code')

In [None]:
out_13 = attach_totals_to_lenders()

display(out_13)

In [None]:
df13 = df2.groupby('country_code', as_index=False).sum()[['country_code', 'funded_amount']]

In [None]:
pd.merge(df13,df3, left_on='country_code', right_on='country_code')

### 14. What is the occupation with the highest average amount of money lent (the average must be computed over all lenders with a given occupation)?

In [None]:
df3.occupation.head(50)

In [None]:
df14_1=df3[['occupation','permanent_name']]

In [None]:
df14_1.head()

In [None]:
df14=df14_1.dropna()

In [None]:
df14.head()

In [None]:
df8_1.head()

In [None]:
df14_3=pd.DataFrame(df8_1.groupby('Finanziatori').quota.sum())

In [None]:
df14_3.head()

In [None]:
df14_3=df14_3.reset_index()

In [None]:
df14_3.head()

In [None]:
df14 = df14.rename(columns={"occupation": "occupation", "permanent_name": "Finanziatori"})

In [None]:
df14.head()

In [None]:
df14_mix= df14.join(df14_3[['Finanziatori','quota']].set_index('Finanziatori'), on='Finanziatori')

In [None]:
df14_mix.head(50)

In [None]:
df14_mix.groupby('occupation')['quota'].sum().idxmax()


In [None]:
df14_mix.groupby('occupation').sum()['quota'].idxmax()

### 15. Cluster the loans according to the year-month of disburse time.

In [None]:
df2[['funded_amount','disburse_time']].head(30)

In [None]:
df2.disburse_time.str.split('-', expand=True)

In [None]:
df2['punto7'] = df2.disburse_time.str.split('-', expand= True)[0]

In [None]:
df2['punto15'] = df2.disburse_time.str.split('-', expand= True)[1]

In [None]:
df2.groupby(['punto7','punto15'])['funded_amount'].sum()

### 16. For each country, compute its overall GNI, by multiplying the per capita GNI with its population.

In [None]:
df10.head()

In [None]:
df10['overallGNI']= df10['population'] * df10['gni']

In [None]:
df10.overallGNI.head(20)

### 17. Find the country with the highest rate of irregular repayment interval.

### 18. Find the country with the highest fraction of loaned amount with irregular repayment interval.