# Sample NLP model

- Important insight: The dataframe 'df_loans2' was probably extracted in January 2018, so there are 3608 loans with status = 'fundRaising'. These loans were posted less than 30 days before the extraction, so we cannot yet tell whether they will be fully funded or expire. It is therefore better to remove this category and work only with the categories 'funded' and 'expired'.
- create a sample with 2 bins based on "status" column:
    - bin 1: funded 
    - bin 2: expired
- A text classifier can be used to determine which words are the most important for each bin.

In [None]:
# https://towardsdatascience.com/text-classification-in-python-dd95d264c802

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

In [2]:
# Load the complete loans table; index_col=None keeps pandas' default integer index.
loans_csv_path = r"C:\Users\peter\Documents\2e master HIR\Thesis Kiva\loans.csv"
df_loans_complete = pd.read_csv(loans_csv_path, index_col=None)

In [3]:
# Show at most 100 characters of each cell when a frame is displayed.
pd.set_option("display.max_colwidth", 100)

In [4]:
# Display the dtype pandas inferred for every column of the raw loans frame.
df_loans_complete.dtypes

loan_id                              int64
loan_name                           object
original_language                   object
description                         object
description_translated              object
funded_amount                      float64
loan_amount                        float64
status                              object
activity_name                       object
sector_name                         object
loan_use                            object
country_code                        object
country_name                        object
town_name                           object
currency_policy                     object
currency_exchange_coverage_rate    float64
currency                            object
partner_id                         float64
posted_time                         object
planned_expiration_time             object
disburse_time                       object
raised_time                         object
lender_term                        float64
num_lenders

- Make two dataframes: one for the funded loans and one for the expired loans
- Create the variable description_ENG by using the translated description where one exists and the original description otherwise

In [5]:
# Columns carried into the funded / expired working frames.
loan_columns = ['loan_id', 'loan_name', 'distribution_model', 'posted_time',
                'description', 'description_translated', 'funded_amount',
                'loan_amount', 'borrower_genders', 'status']

# Select rows and columns in one .loc call and take an explicit copy.
# The previous df[cols][mask] chained indexing produced a slice of a slice,
# so the column assignments made on these frames in later cells could raise
# SettingWithCopyWarning.
loan_status = df_loans_complete['status']
df_funded = df_loans_complete.loc[loan_status == 'funded', loan_columns].copy()
df_expired = df_loans_complete.loc[loan_status == 'expired', loan_columns].copy()

print('Number of loans which are funded:')
print(len(df_funded))
print('Number of loans which are expired:')
print(len(df_expired))

Number of loans which are funded:
1350340
Number of loans which are expired:
59081


In [6]:
# Build the English description: take the translation when present,
# otherwise fall back to the original description.
funded_english_text = df_funded['description_translated'].combine_first(df_funded['description'])
df_funded = df_funded.assign(description_ENG=funded_english_text)

# Report how many rows have no description at all, then drop them.
funded_missing_text = df_funded['description_ENG'].isna()
print('Number of descriptions with NaNs which are removed (funded):')
print(funded_missing_text.sum())
df_funded = df_funded[~funded_missing_text]


Number of descriptions with NaNs which are removed (funded):
36695


In [7]:
# Build the English description: take the translation when present,
# otherwise fall back to the original description.
expired_english_text = df_expired['description_translated'].combine_first(df_expired['description'])
df_expired = df_expired.assign(description_ENG=expired_english_text)

# Report how many rows have no description at all, then drop them.
expired_missing_text = df_expired['description_ENG'].isna()
print('Number of descriptions with NaNs which are removed (expired):')
print(expired_missing_text.sum())
df_expired = df_expired[~expired_missing_text]

Number of descriptions with NaNs which are removed (expired):
12


In [8]:
# Parse posted_time once, then derive the calendar year and month from it.
funded_posted = pd.to_datetime(df_funded['posted_time'])
df_funded['posted_time'] = funded_posted
df_funded['year'] = funded_posted.dt.year
df_funded['month'] = funded_posted.dt.month

In [9]:
# Parse posted_time once, then derive the calendar year and month from it.
expired_posted = pd.to_datetime(df_expired['posted_time'])
df_expired['posted_time'] = expired_posted
df_expired['year'] = expired_posted.dt.year
df_expired['month'] = expired_posted.dt.month

In [10]:
# Columns (and their order) written to both intermediate CSV files.
export_columns = ['loan_id','loan_name','posted_time','description_ENG','distribution_model','month','year','loan_amount','funded_amount','borrower_genders','status']

# index=False: the row index is not written to the files.
df_funded[export_columns].to_csv(r"C:\Users\peter\Documents\2e master HIR\Thesis Kiva\df_funded2.csv", index=False)
df_expired[export_columns].to_csv(r"C:\Users\peter\Documents\2e master HIR\Thesis Kiva\df_expired2.csv", index=False)

In [11]:
#df_funded= pd.read_csv(r"C:\Users\peter\Documents\2e master HIR\Thesis Kiva\df_funded2.csv",index_col=None)
#df_expired= pd.read_csv(r"C:\Users\peter\Documents\2e master HIR\Thesis Kiva\df_expired2.csv",index_col=None)

Check the number of loans that use the direct distribution model in each dataset.

In [12]:
# Count the funded loans per distribution model.
# NOTE(review): only df_funded is counted in this cell; df_expired is not checked here.
df_funded['distribution_model'].value_counts()

field_partner    1299633
direct             14012
Name: distribution_model, dtype: int64

   - Keep only the loans posted from 2012 through 2017 (this removes the years 2011 & 2018) and remove the direct loans
   - Randomly sample 10 000 loans from the expired dataset and 10 000 loans from the funded dataset
   - We did not simply downsample the majority class to the size of the minority class, because the resulting sample would require too much computation time.

In [13]:
def _select_loans_12_17(loans, distribution_model):
    """Return the rows of `loans` whose 'year' lies in 2012-2017 (inclusive)
    and whose 'distribution_model' equals `distribution_model`."""
    in_year_window = loans['year'].between(2012, 2017)
    uses_model = loans['distribution_model'] == distribution_model
    return loans[in_year_window & uses_model]


df_funded_field_12_17 = _select_loans_12_17(df_funded, 'field_partner')
df_expired_field_12_17 = _select_loans_12_17(df_expired, 'field_partner')
df_funded_direct_12_17 = _select_loans_12_17(df_funded, 'direct')
df_expired_direct_12_17 = _select_loans_12_17(df_expired, 'direct')

In [14]:
# Number of loans drawn from each class.
count=10000
# Fixed seed: without random_state every run draws a different sample, so the
# counts below and the exported CSV could not be reproduced after a kernel restart.
sample_seed = 42

df_expired_field_12_17_sample = df_expired_field_12_17.sample(n=count, random_state=sample_seed)
df_funded_field_12_17_sample = df_funded_field_12_17.sample(n=count, random_state=sample_seed)

# Stack the two equally sized samples (expired rows first, then funded rows).
df_loans_complete_under_sampled = pd.concat([df_expired_field_12_17_sample,
                                   df_funded_field_12_17_sample], axis=0)

print('Random under-sampling:')
print(df_loans_complete_under_sampled.status.value_counts())


Random under-sampling:
expired    10000
funded     10000
Name: status, dtype: int64


   - Create the feature gender_reclassified by reducing the list of genders in the original feature borrower_genders to a single gender.
   - For individual loans the original value is kept, whereas for group loans the gender of the group leader is used.
   - The group leader's gender is inferred from the description by comparing the number of female and male pronouns it contains.

In [15]:
import nltk
from nltk.tokenize import word_tokenize

# A loan is 'individual' when borrower_genders holds at most one
# whitespace-separated entry, otherwise it is a 'group' loan.
df_loans_complete_under_sampled['borrower_count'] = df_loans_complete_under_sampled['borrower_genders'].str.split().str.len()
df_loans_complete_under_sampled['loan_type'] = np.where(df_loans_complete_under_sampled['borrower_count']<= 1, 'individual', 'group')

# Positional boolean mask of the group loans; `df` holds exactly those rows,
# in the same order, so results can be written back by position further down.
is_group = (df_loans_complete_under_sampled['loan_type'] == 'group').to_numpy()
df = df_loans_complete_under_sampled[is_group].copy()

# Lowercase and tokenize each description. Mapping over the column (instead of
# a row-wise DataFrame.apply) also works when there are no group loans.
df['description_ENG_Parsed'] = df['description_ENG'].str.lower().map(word_tokenize)
tokenized_list = df['description_ENG_Parsed'].tolist()

female_words = ['she',"she's",'her','hers','herself']
male_words = ['he',"he's",'him','his','himself']

# Count the gendered pronouns per description (set lookup instead of list scans).
female_lookup = set(female_words)
male_lookup = set(male_words)
male_count_list = [sum(token in male_lookup for token in tokens) for tokens in tokenized_list]
female_count_list = [sum(token in female_lookup for token in tokens) for tokens in tokenized_list]

df['male counter'] = male_count_list
df['female counter'] = female_count_list
# More female than male pronouns -> 'female'; everything else, including ties, -> 'male'.
df['gender_reclassified'] = np.where(df['male counter']< df['female counter'], 'female', 'male')

# Write the group results back through the mask instead of an outer merge on
# ['loan_name', 'description_ENG']. Those text keys are not guaranteed unique
# (and loan_name contains NaN), so the merge could duplicate rows; it also
# re-sorted the frame by key and produced suffixed columns when the cell was
# run a second time. Row order and index are now left untouched.
df_loans_complete_under_sampled['male counter'] = np.nan
df_loans_complete_under_sampled.loc[is_group, 'male counter'] = df['male counter'].to_numpy()
df_loans_complete_under_sampled['female counter'] = np.nan
df_loans_complete_under_sampled.loc[is_group, 'female counter'] = df['female counter'].to_numpy()

# Individual loans keep their original borrower_genders value;
# group loans receive the gender inferred from the description.
df_loans_complete_under_sampled['gender_reclassified'] = df_loans_complete_under_sampled['borrower_genders']
df_loans_complete_under_sampled.loc[is_group, 'gender_reclassified'] = df['gender_reclassified'].to_numpy()

In [16]:
# Number of group loans whose male and female pronoun counts are equal (ties).
int((df['male counter'] == df['female counter']).sum())

6

In [17]:
# Number of loans for every (loan_type, gender_reclassified) combination.
df_loans_complete_under_sampled.groupby(['loan_type', 'gender_reclassified']).size()

loan_type   gender_reclassified
group       female                  2184
            male                     670
individual  female                 10263
            male                    6883
dtype: int64

In [18]:
# Overall distribution of the reclassified gender across the sample.
df_loans_complete_under_sampled['gender_reclassified'].value_counts()

female    12447
male       7553
Name: gender_reclassified, dtype: int64

In [19]:
# Missing values per column of the sample.
df_loans_complete_under_sampled.isna().sum()

loan_id                    0
loan_name                 93
posted_time                0
description_ENG            0
distribution_model         0
month                      0
year                       0
loan_amount                0
funded_amount              0
borrower_genders           0
status                     0
borrower_count             0
loan_type                  0
male counter           17146
female counter         17146
gender_reclassified        0
dtype: int64

In [20]:
# Columns (and their order) kept in the sample that is exported.
subset_columns = [
    'loan_id', 'description_ENG', 'status', 'year', 'month',
    'funded_amount', 'loan_amount', 'loan_name',
    'gender_reclassified', 'borrower_count', 'loan_type',
]
df_loans_complete_under_sampled_subset = df_loans_complete_under_sampled[subset_columns]

In [21]:
# Write the sample to disk; index=False leaves the row index out of the file.
sample_csv_path = r"C:\Users\peter\Documents\2e master HIR\Thesis Kiva\sample_tfidf_corrected.csv"
df_loans_complete_under_sampled_subset.to_csv(sample_csv_path, index=False)