# Import data

## Imports:

In [1]:
import os
import numpy as np
import pandas as pd
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation, strip_non_alphanum
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

#!python -m spacy download en_core_web_md
import spacy
from spacy.lang.en import English
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

## Get the data 

 - The data consists of two samples each having 10000 instances. 
 - The first sample are the funded loans (bin 1) and the second sample are the expired loans (bin 2). 
 - The loans are made between 2012 and 2017 and are distributed with a field partner.

In [2]:
combined= pd.read_csv(r"sample_tfidf_corrected.csv",index_col=None)

In [3]:
combined.head(20)

Unnamed: 0,loan_id,description_ENG,status,year,month,funded_amount,loan_amount,loan_name,gender_reclassified,borrower_count,loan_type
0,1382091,Carlos studied up to high school. He did not m...,expired,2017,9,675.0,1000.0,Carlos Geovanny,male,1,individual
1,1101771,"Edwin, 37 years old, lives with his wife and h...",expired,2016,6,100.0,400.0,Edwin Victor,male,1,individual
2,893806,"Ana, age 34, lives with her life partner and t...",expired,2015,5,450.0,800.0,ANA DOLORES,female,1,individual
3,875440,"Featured in the above picture is Peter, who ha...",expired,2015,4,800.0,1250.0,Peter's Group,male,12,group
4,813862,Qurbongul is a resident of the Khuroson distri...,expired,2014,12,1200.0,1950.0,Qurbongul,female,1,individual
5,1396192,Colman is the group leader of his group in Kil...,expired,2017,10,75.0,550.0,Colman's Group,male,7,group
6,999018,Santos is 44 years old and sells car parts. He...,expired,2015,12,425.0,1300.0,SANTOS,male,1,individual
7,1034599,Bright is a 27 year old hardworking married ma...,expired,2016,3,475.0,750.0,Bright,male,1,individual
8,1391541,Ashi is a 36-year-old married woman and the mo...,expired,2017,10,175.0,400.0,Ashi,female,1,individual
9,578104,Omar is a 52-year-old father of five children....,expired,2013,6,825.0,2000.0,Omar,male,1,individual


# Text cleaning

This module performs some preprocessing steps to clean the descriptions. The following steps are used:
   * remove special characters (e.g. \\r\\n, <br //>, ", (.*\), -, ...)
   * lowercase
   * remove HTML code
   * remove accents on words with unidecode
   * remove names of borrowers (currently not used)
   * lemmatization
   * remove stopwords
   * remove digits

In [4]:
combined['description_ENG_Parsed_1'] = combined['description_ENG']

for i in range(len(combined)):
    combined['description_ENG_Parsed_1'][i] = combined['description_ENG'][i].replace("\\r\\n", " ")
    combined['description_ENG_Parsed_1'][i] = combined['description_ENG_Parsed_1'][i].replace("\\n\\n", " ")
    combined['description_ENG_Parsed_1'][i] = combined['description_ENG_Parsed_1'][i].replace("\\n", " ")
    combined['description_ENG_Parsed_1'][i] = combined['description_ENG_Parsed_1'][i].replace("\\r", " ")
    combined['description_ENG_Parsed_1'][i] = combined['description_ENG_Parsed_1'][i].replace("<br>", "  ")
    combined['description_ENG_Parsed_1'][i] = combined['description_ENG_Parsed_1'][i].replace("<br />", "  ")
    combined['description_ENG_Parsed_1'][i] = combined['description_ENG_Parsed_1'][i].replace("<br /><br />", "  ")
    combined['description_ENG_Parsed_1'][i] = combined['description_ENG_Parsed_1'][i].replace("<br><br>", "  ")
    combined['description_ENG_Parsed_1'][i] = combined['description_ENG_Parsed_1'][i].replace("<br/><br/>", "  ")
    combined['description_ENG_Parsed_1'][i] = combined['description_ENG_Parsed_1'][i].replace("-", " ")
    combined['description_ENG_Parsed_1'][i] = combined['description_ENG_Parsed_1'][i].replace("\\t", " ")
    combined['description_ENG_Parsed_1'][i] = combined['description_ENG_Parsed_1'][i].replace("'"," ")
    
    



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas

In [5]:
# Lowercasing the text
combined['description_ENG_Parsed_2'] = combined['description_ENG_Parsed_1']
combined['description_ENG_Parsed_2'] = combined['description_ENG_Parsed_2'].str.lower()


In [6]:
# special characters
combined['description_ENG_Parsed_2'] = combined['description_ENG_Parsed_2'].str.replace('"', '')
combined['description_ENG_Parsed_2'] = combined['description_ENG_Parsed_2'].str.replace('“', '')
combined['description_ENG_Parsed_2'] = combined['description_ENG_Parsed_2'].str.replace('”', '')
combined['description_ENG_Parsed_2'] = combined['description_ENG_Parsed_2'].str.replace(r'\(.*\)', "")

In [7]:
# HTML code
from bs4 import BeautifulSoup
combined['description_ENG_Parsed_2'] = [BeautifulSoup(text, 'html.parser').get_text() for text in combined['description_ENG_Parsed_2'] ]


In [8]:
#!pip install Unidecode
# Remove apostrophe and accents
from unidecode import unidecode
combined['description_ENG_Parsed_2'] = combined['description_ENG_Parsed_2'].str.replace("'s", "")
combined['description_ENG_Parsed_2'] = combined['description_ENG_Parsed_2'].apply(unidecode)

In [9]:
# Punctuation
punctuation_signs = list("?:!.,;")
for punct_sign in punctuation_signs:
    combined['description_ENG_Parsed_2'] = combined['description_ENG_Parsed_2'].str.replace(punct_sign, '')

In [10]:
# Remove names of borrowers from descriptions

#remove NaNs
df = combined.copy()
df = df.dropna(subset=['loan_name'])
df = df[~df['loan_name'].str.contains("Group")]
df = df[~df['loan_name'].str.contains("group")]
#lowercase
df['loan_name_processed'] = df['loan_name'].str.lower()
#remove words which have less than 3 characters
df['loan_name_processed'] = df['loan_name_processed'].str.replace(r'\b(\w{1,2})\b','')
#set unicode
df['loan_name_processed'] = df['loan_name_processed'].apply(unidecode)
#remove ' and backslash
df['loan_name_processed'] = df['loan_name_processed'].str.replace("\\", "")
df['loan_name_processed'] = df['loan_name_processed'].str.replace("'", "")
df['loan_name_processed'] = df['loan_name_processed'].str.replace("–", "")
df['loan_name_processed'] = df['loan_name_processed'].str.replace(r'\(.*\)', "")
df['loan_name_processed'] = df['loan_name_processed'].str.replace(".", "")



#tokenize
import nltk
from nltk.tokenize import word_tokenize
df['loan_name_processed_2'] = df.apply(lambda row: nltk.word_tokenize(row['loan_name_processed']), axis=1)
#remove words which have less than 3 characters
df['loan_name_processed_2'] = df['loan_name_processed_2'].replace(r'\b(\w{1,2})\b', '')


def remove_words_from_text_body(row):
    # Seperate the words to remove by the space between them
    words_to_remove = row['loan_name_processed'].split(" ")

    # Get the text_body as a starting template
    text_body = row['description_ENG_Parsed_2']

    # For each word that we want to remove, replace it with "" (blank)
    for word in words_to_remove:
        text_body = text_body.replace(word, "")

    return text_body

df['description_ENG_Parsed_3'] = df.apply(remove_words_from_text_body, axis=1)
combined['description_ENG_Parsed_3'] = df['description_ENG_Parsed_3'].copy()
combined['description_ENG_Parsed_3']=combined['description_ENG_Parsed_3'].combine_first(combined['description_ENG_Parsed_2'])


In [11]:
# lemmatization

from nltk.stem import WordNetLemmatizer

# Saving the lemmatizer into an object
wordnet_lemmatizer = WordNetLemmatizer()

nrows = len(combined)
lemmatized_text_list = []

for row in range(0, nrows):
    
    # Create an empty list containing lemmatized words
    lemmatized_list = []
    
    # Save the text and its words into an object
    text = combined.loc[row]['description_ENG_Parsed_2']
    text_words = text.split(" ")

    # Iterate through every word to lemmatize
    for word in text_words:
        lemmatized_list.append(wordnet_lemmatizer.lemmatize(word, pos="v"))
        
    # Join the list
    lemmatized_text = " ".join(lemmatized_list)
    
    # Append to the list containing the texts
    lemmatized_text_list.append(lemmatized_text)
    
combined['description_ENG_Parsed_4'] = lemmatized_text_list

In [13]:
# Loading the stop words in english
from nltk.corpus import stopwords
stop_words = list(stopwords.words('english'))
newStopWords = ['years','year','to','be','and','a','are','the','in','of','have','for','with','also',
               'business','loan','children','family','buy','work','sell','live','old','marry',
                'income','request','help','use','future','earn','like','support','make','purchase','need',
               'school','farm','save','husband','provide','able','hard','hop','improve','want','expand',
                'one','store','increase','would','dream','house','pay','group','good','time','profit',
                'home','enough','order','kiva','run','grow','products','food','customers','raise','life',
                'fee','build','get','better','expense','new','market','well','day','start','small','clothe',
                'him','community','village','continue','take','items','partner','stock','general','age',
                'meet','plan','area','apply','capital','go','supply','education','farmer','shop','since',
                'financial','operate','quality','ask','access','child','amount','household','sales','repay',
                'service','send','thank','challenge']
noise_words= ['rd','th','le','p','f','et','u','rs','fe','ue','u','e','f','m','b','st','nd','ly','s','al','la','ger'
              ,'am','pm','etc','et','ms','en','de','el','la','lan','are']
stop_words.extend(noise_words)
#stop_words.extend(newStopWords)

In [14]:
deleteStopWords = ['he','him','his','himself','she',"she's",'her','hers','herself']
for i in deleteStopWords:
    if i in stop_words:
        stop_words.remove(i)

In [15]:
# transforming numerical terms to digits

combined['description_ENG_Parsed_7'] = combined['description_ENG_Parsed_4'].copy()
from text_to_num import alpha2digit
for i in range(len(combined)):
    combined['description_ENG_Parsed_7'][i] = alpha2digit(combined['description_ENG_Parsed_7'][i], "en")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [16]:
# remove digits and other things
combined['description_ENG_Parsed_8'] = combined['description_ENG_Parsed_7'].str.replace('\d+', '', regex=True)
combined['description_ENG_Parsed_8'] = combined['description_ENG_Parsed_8'].str.replace("'",'', regex=True)
combined['description_ENG_Parsed_8'] = combined['description_ENG_Parsed_8'].str.replace('[','', regex=True)
combined['description_ENG_Parsed_8'] = combined['description_ENG_Parsed_8'].str.replace('[','', regex=True)
combined['description_ENG_Parsed_8'] = combined['description_ENG_Parsed_8'].str.replace(']','', regex=True)
combined['description_ENG_Parsed_8'] = combined['description_ENG_Parsed_8'].str.replace('\\',' ', regex=True)
combined['description_ENG_Parsed_8'] = combined['description_ENG_Parsed_8'].str.replace('\\',' ', regex=True)
combined['description_ENG_Parsed_8'] = combined['description_ENG_Parsed_8'].map(lambda x: x.lstrip('\\ue').rstrip('aAbBcC'))
combined['description_ENG_Parsed_8'] = combined['description_ENG_Parsed_8'].str.replace('\\','')
combined['description_ENG_Parsed_8'] = combined['description_ENG_Parsed_8'].map(lambda x: x.lstrip(r"\ub\ub").rstrip('aAbBcC'))
combined['description_ENG_Parsed_8'] = combined['description_ENG_Parsed_8'].str.replace('*','', regex=True)
combined['description_ENG_Parsed_8'] = combined['description_ENG_Parsed_8'].str.replace('+','', regex=True)
combined['description_ENG_Parsed_8'] = combined['description_ENG_Parsed_8'].str.replace('(','', regex=True)
combined['description_ENG_Parsed_8'] = combined['description_ENG_Parsed_8'].str.replace(')','', regex=True)


In [17]:
# removing stopwords
combined['description_ENG_Parsed_9'] = combined['description_ENG_Parsed_8'].copy()
for stop_word in stop_words:

    regex_stopword = r"\b" + stop_word + r"\b"
    combined['description_ENG_Parsed_9'] = combined['description_ENG_Parsed_9'].str.replace(regex_stopword, '')

## Group similar words with word2vec 

### Unigrams

In [18]:
from gensim.models import Word2Vec
from gensim.models import word2vec
from sklearn.manifold import TSNE
import multiprocessing

In [19]:
cores = multiprocessing.cpu_count()

In [20]:
cores

4

In [38]:
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

In [39]:
sentences = [row.split() for row in combined['description_ENG_Parsed_9']]

In [40]:
# averageLen(sentences) is 72.6771

In [41]:
w2v_model.build_vocab(sentences)
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=w2v_model.iter)
w2v_model.init_sims(replace=True)

  


In [42]:
wordlist = list(w2v_model.wv.vocab)

In [43]:
len(wordlist)

3674

In [44]:
type(wordlist)

list

In [57]:
wordlist3= list()

In [58]:
for string in wordlist:
    if (string != ""):
        wordlist3.append(string)

In [55]:
len(wordlist3)

3674

In [48]:
def check(word, list):
    if word in list:
        print("The word is in the list!")
    else:
        print("The word is not in the list!")

In [49]:
check('le',wordlist3)

The word is not in the list!


In [68]:
len(combined[combined['description_ENG_Parsed_9'].str.contains(" philippines ")])

1878

In [77]:
for i in wordlist3:
    if len(combined[combined['description_ENG_Parsed_9'].str.contains(str("")+i+str(" "))]) != False:
        if combined['description_ENG_Parsed_9'].str.contains(str("")+i+str(" ")).value_counts()[True]>2000:
            for j in wordlist3:
                if len(combined[combined['description_ENG_Parsed_9'].str.contains(str("")+j+str(" "))]) != False:
                    if combined['description_ENG_Parsed_9'].str.contains(str("")+j+str(" ")).value_counts()[True]>2000:
                        if i!=j:
                            if w2v_model.wv.similarity(i,j)>0.90:
                                print('output',' ',i,j)
                                print(w2v_model.wv.similarity(i,j))

output   continue able
0.9060978
output   old age
0.9363674
output   old marry
0.96767807
output   buy purchase
0.93079257
output   quality better
0.9366164
output   like would
0.9463899
output   purchase buy
0.93079257
output   able continue
0.9060978
output   age old
0.9363674
output   age marry
0.9356429
output   age mother
0.9408483
output   request php
0.9086565
output   marry old
0.96767807
output   marry age
0.9356429
output   expand future
0.9023411
output   mother age
0.9408483
output   would like
0.9463899
output   future expand
0.9023411
output   better quality
0.9366164
output   enough money
0.93115366
output   enough save
0.9056247
output   money enough
0.93115366
output   money save
0.9516916
output   save enough
0.9056247
output   save money
0.9516916
output   php request
0.9086565
output   fit art
0.9268811
output   fit arm
0.90133077
output   fit ease
0.9120198
output   art fit
0.9268811
output   arm fit
0.90133077
output   arm prove
0.9261149
output   arm row
0.933193

In [61]:
w2v_model.wv.similarity('she','her')

0.8893805

In [None]:
combined['description_ENG_Parsed_8'] = combined['description_ENG_Parsed_8'].str.replace('his','he')
combined['description_ENG_Parsed_8'] = combined['description_ENG_Parsed_8'].str.replace('him','he')
combined['description_ENG_Parsed_8'] = combined['description_ENG_Parsed_8'].str.replace('children','child')
combined['description_ENG_Parsed_8'] = combined['description_ENG_Parsed_8'].str.replace('job','work')
combined['description_ENG_Parsed_8'] = combined['description_ENG_Parsed_8'].str.replace('his','he')
combined['description_ENG_Parsed_8'] = combined['description_ENG_Parsed_8'].str.replace('purchase','buy')
combined['description_ENG_Parsed_8'] = combined['description_ENG_Parsed_8'].str.replace('farm','farmer')
combined['description_ENG_Parsed_8'] = combined['description_ENG_Parsed_8'].str.replace('dream','goal')
combined['description_ENG_Parsed_8'] = combined['description_ENG_Parsed_8'].str.replace('provide','give')
combined['description_ENG_Parsed_8'] = combined['description_ENG_Parsed_8'].str.replace('better','improve')
combined['description_ENG_Parsed_8'] = combined['description_ENG_Parsed_8'].str.replace('her','she')
combined['description_ENG_Parsed_8'] = combined['description_ENG_Parsed_8'].str.replace('request','ask')
combined['description_ENG_Parsed_8'] = combined['description_ENG_Parsed_8'].str.replace('grateful','thank')
combined['description_ENG_Parsed_8'] = combined['description_ENG_Parsed_8'].str.replace('learn','experience')
combined['description_ENG_Parsed_8'] = combined['description_ENG_Parsed_8'].str.replace('ask','request')
combined['description_ENG_Parsed_8'] = combined['description_ENG_Parsed_8'].str.replace('house','home')


In [75]:
w2v_model.wv.similarity('','')

0.6220141

In [None]:
combined['description_ENG_Parsed_8'] = combined['description_ENG_Parsed_8'].str.replace('php','philippines')

In [None]:
combined['description_ENG_Parsed_8'].str.contains('he').any()

# TF-IDF model

## Create TF-IDF features

In [None]:
status_codes = {
    'funded': 1,
    'expired': 2
}

In [None]:
# Category mapping
combined['status_Code'] = combined['status']
combined = combined.replace({'status_Code':status_codes})

In [None]:
combined['status_Code'].value_counts()

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(combined['description_ENG_Parsed_8'], 
                                                    combined['status_Code'], 
                                                    test_size=0.20, 
                                                    random_state=8)

In [None]:
# Parameter election
ngram_range = (1,2)
min_df = 10
max_df = 0.7
#1. = 100%
max_features = None

In [None]:
#TF-IDF 

tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True,
                        strip_accents='unicode')

features_train = tfidf.fit_transform(X_train).toarray()
features_train2 = tfidf.fit_transform(X_train)
labels_train = y_train
print(features_train.shape)

features_test = tfidf.transform(X_test).toarray()
features_test2 = tfidf.transform(X_test)
labels_test = y_test
print(features_test.shape)

In [None]:
dictionary = dict(zip(tfidf.get_feature_names(), tfidf.idf_))

## 2D-plot of TF-IDF features

In [None]:
import pickle
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE

In [None]:
features = np.concatenate((features_train,features_test), axis=0)
labels = np.concatenate((labels_train,labels_test), axis=0)

In [None]:
def plot_dim_red(model, features, labels, n_components=2):
    
    # Creation of the model
    if (model == 'PCA'):
        mod = PCA(n_components=n_components)
        title = "PCA decomposition of top 5000 features"  # for the plot
        
    elif (model == 'TSNE'):
        mod = TSNE(n_components=2)
        title = "t-SNE decomposition" 

    else:
        return "Error"
    
    # Fit and transform the features
    principal_components = mod.fit_transform(features)
    
    # Put them into a dataframe
    df_features = pd.DataFrame(data=principal_components,
                     columns=['PC1', 'PC2'])
    
    # Now we have to paste each row's label and its meaning
    # Convert labels array to df
    df_labels = pd.DataFrame(data=labels,
                             columns=['label'])
    
    df_full = pd.concat([df_features, df_labels], axis=1)
    df_full['label'] = df_full['label'].astype(str)

    # Get labels name
    category_names = {
        "1": 'Funded loans',
        "2": 'Expired loans'
    }

    # And map labels
    df_full['label_name'] = df_full['label']
    df_full = df_full.replace({'label_name':category_names})

    # Plot
    plt.figure(figsize=(10,10))
    sns.scatterplot(x='PC1',
                    y='PC2',
                    hue="label_name", 
                    data=df_full,
                    palette=["red","blue"],
                    alpha=.7).set_title(title);

In [None]:
plot_dim_red("PCA", 
             features=features, 
             labels=labels,
             n_components=2)

## Top n TF-IDF scores for each bin

In [None]:
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df

In [None]:
def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that on average are most important amongst documents in rows
        indentified by indices in grp_ids. '''
    if grp_ids:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

In [None]:
def top_feats_by_class(Xtr, y, features, min_tfidf=0.1, top_n=25):
    ''' Return a list of dfs, where each df holds top_n features and their mean tfidf value
        calculated across documents with the same class label. '''
    dfs = []
    labels = np.unique(y)
    for label in labels:
        ids = np.where(y==label)
        feats_df = top_mean_feats(Xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n)
        feats_df.label = label
        dfs.append(feats_df)
    return dfs

In [None]:
def plot_tfidf_classfeats_h(dfs):
    ''' Plot the data frames returned by the function plot_tfidf_classfeats(). '''
    fig = plt.figure(figsize=(12, 9), facecolor="w")
    x = np.arange(len(dfs[0]))
    for i, df in enumerate(dfs):
        ax = fig.add_subplot(1, len(dfs), i+1)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.set_frame_on(False)
        ax.get_xaxis().tick_bottom()
        ax.get_yaxis().tick_left()
        ax.set_xlabel("Mean Tf-Idf Score", labelpad=16, fontsize=14)
        ax.set_title("label = " + str(df.label), fontsize=16)
        ax.ticklabel_format(axis='x', style='sci', scilimits=(-2,2))
        ax.barh(x, df.tfidf, align='center', color='#3F5D7D')
        ax.set_yticks(x)
        ax.set_ylim([-1, x[-1]+1])
        yticks = ax.set_yticklabels(df.feature)
        plt.subplots_adjust(bottom=0.09, right=0.97, left=0.15, top=0.95, wspace=0.52)
    plt.show()

In [None]:
x = top_feats_by_class(features_train2, labels_train, tfidf.get_feature_names(), min_tfidf=0.1, top_n=25)

# Logistic regression + TF-IDF

In this module we fit a logistic regression model on the tf-idf features. By adjusting the C parameter we obtain a test set accuracy of 76%. Next, we construct a confusion matrix and plot the most important words for each bin. Finally, we have a look at some misclassified descriptions.

In [None]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=2, solver='sag')
logreg = logreg.fit(features_train, labels_train)
y_pred = logreg.predict(features_test)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report

print('accuracy %s' % accuracy_score(y_pred, labels_test))
print(classification_report(labels_test, y_pred,target_names=['bin 1','bin 2']))

## Confusion matrix + ROC curve

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
aux_df = combined[['status', 'status_Code']].drop_duplicates().sort_values('status_Code')
conf_matrix = confusion_matrix(labels_test, y_pred)
plt.figure(figsize=(6,6))
ax = sns.heatmap(conf_matrix, 
            annot=True,
            xticklabels=aux_df['status'].values, 
            yticklabels=aux_df['status'].values,
            cmap="Blues", fmt='g')
ax.set_ylim([0,2])
plt.ylabel('Predicted')
plt.xlabel('Actual')
plt.title('Confusion matrix')
plt.show()

In [None]:
# ROC curve

import sklearn.metrics as metrics
# calculate the fpr and tpr for all thresholds of the classification
probs = logreg.predict_proba(features_test)
preds = probs[:,0]
y_true = labels_test.values
fpr, tpr, threshold = metrics.roc_curve(y_true, preds, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)

# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

## Most important words for each bin

    * Five most important words to classify funded loans: {she; widow; toilet; machine; tuition}
    * Five most important words to classify expired loans: {his; he; clothe; store; products}


In [None]:
def get_most_important_features(vectorizer, model, n=5):
    index_to_word = {v:k for k,v in vectorizer.vocabulary_.items()}
    
    # loop for each class
    classes ={}
    for class_index in range(model.coef_.shape[0]):
        word_importances = [(el, index_to_word[i]) for i,el in enumerate(model.coef_[class_index])]
        sorted_coeff = sorted(word_importances, key = lambda x : x[0], reverse=True)
        tops = sorted(sorted_coeff[:n], key = lambda x : x[0])
        bottom = sorted_coeff[-n:]
        classes[class_index] = {
            'tops':tops,
            'bottom':bottom
        }
    return classes

importance = get_most_important_features(tfidf, logreg, 20)

In [None]:
def plot_important_words(top_scores, top_words, bottom_scores, bottom_words, name):
    y_pos = np.arange(len(top_words))
    top_pairs = [(a,b) for a,b in zip(top_words, top_scores)]
    top_pairs = sorted(top_pairs, key=lambda x: x[1])
    
    bottom_pairs = [(a,b) for a,b in zip(bottom_words, bottom_scores)]
    bottom_pairs = sorted(bottom_pairs, key=lambda x: x[1], reverse=True)
    
    top_words = [a[0] for a in top_pairs]
    top_scores = [a[1] for a in top_pairs]
    
    bottom_words = [a[0] for a in bottom_pairs]
    bottom_scores = [a[1] for a in bottom_pairs]
    
    fig = plt.figure(figsize=(10, 10))  

    plt.subplot(121)
    plt.barh(y_pos,bottom_scores, align='center', alpha=0.5)
    plt.title('Funded loans', fontsize=20)
    plt.yticks(y_pos, bottom_words, fontsize=14)
    plt.suptitle('Key words', fontsize=16)
    plt.xlabel('Importance', fontsize=20)
    
    plt.subplot(122)
    plt.barh(y_pos,top_scores, align='center', alpha=0.5)
    plt.title('Expired loans', fontsize=20)
    plt.yticks(y_pos, top_words, fontsize=14)
    plt.suptitle(name, fontsize=16)
    plt.xlabel('Importance', fontsize=20)
    
    plt.subplots_adjust(wspace=0.8)
    plt.show()

top_scores = [a[0] for a in importance[0]['tops']]
top_words = [a[1] for a in importance[0]['tops']]
bottom_scores = [a[0] for a in importance[0]['bottom']]
bottom_words = [a[1] for a in importance[0]['bottom']]

plot_important_words(top_scores, top_words, bottom_scores, bottom_words, "Most important words")

## Misclassified descriptions

In [None]:
status_names = {
    1: 'funded',
    2: 'expired'
}

# Indexes of the test set
index_X_test = X_test.index

# We get them from the original df
df_test = combined.loc[index_X_test]

# Add the predictions
df_test['Prediction'] = y_pred

# Clean columns
df_test = df_test[['description_ENG_Parsed_8','description_ENG', 'status', 'status_Code', 'Prediction','funded_amount','loan_amount']]

# Decode
df_test['status_Predicted'] = df_test['Prediction']
df_test = df_test.replace({'status_Predicted':status_names})

# Clean columns again
df_test = df_test[['description_ENG_Parsed_8','description_ENG', 'status', 'status_Predicted','funded_amount','loan_amount']]

condition = (df_test['status'] != df_test['status_Predicted'])

df_misclassified = df_test[condition]

def output_article(row_article):
    print('Actual status: %s' %(row_article['status']))
    print('Predicted status: %s' %(row_article['status_Predicted']))
    print('-------------------------------------------')
    print('Text: ')
    print('%s' %(row_article['description_ENG_Parsed_8']))
    print('%s' %(row_article['description_ENG']))
    
import random
random.seed(8)
list_samples = random.sample(list(df_misclassified.index), 3)

output_article(df_misclassified.loc[list_samples[0]])
output_article(df_misclassified.loc[list_samples[1]])
output_article(df_misclassified.loc[list_samples[2]])

# Grid search

## Randomized Search Cross Validation

In [None]:
from pprint import pprint
# C
C = [float(x) for x in np.linspace(start = 1, stop = 3, num = 10)]

# solver
solver = ['newton-cg', 'sag', 'saga', 'lbfgs']

# class_weight
class_weight = ['balanced', None]

# penalty
penalty = ['l2']

# Create the random grid
random_grid = {'C': C,
               'solver': solver,
               'class_weight': class_weight,
               'penalty': penalty}

pprint(random_grid)

In [None]:
# First create the base model to tune
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(random_state=8)

# Definition of the random search
from sklearn.model_selection import RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=logreg,
                                   param_distributions=random_grid,
                                   n_iter=5,
                                   scoring='accuracy',
                                   cv=3, 
                                   verbose=1, 
                                   random_state=8)

# Fit the random search model
random_search.fit(features_train, labels_train)

In [None]:
print("The best hyperparameters from Random Search are:")
print(random_search.best_params_)
print("")
print("The mean accuracy of a model with these hyperparameters is:")
print(random_search.best_score_)

##  Grid Search Cross Validation

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
C = [float(x) for x in np.linspace(start = 1.8, stop = 2.2, num = 5)]
solver = ['sag']
class_weight = ['None']
penalty = ['l2']

param_grid = {'C': C,
               'solver': solver,
               'class_weight': class_weight,
               'penalty': penalty}

# Create a base model
lrc = LogisticRegression(random_state=8)

# Manually create the splits in CV in order to be able to fix a random_state (GridSearchCV doesn't have that argument)
cv_sets = ShuffleSplit(n_splits = 3, test_size = .33, random_state = 8)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=lrc, 
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=cv_sets,
                           verbose=1)

# Fit the grid search to the data
grid_search.fit(features_train, labels_train)

In [None]:
print("The best hyperparameters from Grid Search are:")
print(grid_search.best_params_)
print("")
print("The mean accuracy of a model with these hyperparameters is:")
print(grid_search.best_score_)

In [None]:
best_lrc = grid_search.best_estimator_
lrc_pred = best_lrc.predict(features_test)

# Training accuracy
print("The training accuracy is: ")
print(accuracy_score(labels_train, best_lrc.predict(features_train)))

# Test accuracy
print("The test accuracy is: ")
print(accuracy_score(labels_test, best_lrc.predict(features_test)))

from sklearn.metrics import classification_report
# Classification report
print("Classification report")
print(classification_report(labels_test,lrc_pred))

# Other classifiers

In [None]:
from sklearn import model_selection, naive_bayes
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC

In [None]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB(alpha=0.1)
Naive.fit(features_train,labels_train)# predict the labels on validation dataset
predictions_NB = Naive.predict(features_test)# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, labels_test)*100)

In [None]:
# Classifier - Algorithm - SVC
# fit the training dataset on the classifier
SVC = LinearSVC(random_state=0, tol=1e-5, C=0.7, loss='hinge')
SVC.fit(features_train,labels_train)# predict the labels on validation dataset
predictions_SVC = SVC.predict(features_test)# Use accuracy_score function to get the accuracy
print("SVC Accuracy Score -> ",accuracy_score(predictions_SVC, labels_test)*100)

# Check gender bias

The results of the logistic regression model suggest that a loan will be fully funded when the words 'her' and 'she' are mentioned while a loan will be expired when the words 'he' and 'his' are mentioned. This should be investigated.

In [None]:
combined['gender_reclassified'].value_counts()

In [None]:
combined.groupby(['gender_reclassified', 'status']).size()

In [None]:
# Table of target_variable vs gender
target_gender = pd.crosstab(index=combined["gender_reclassified"], 
                            columns=combined["status"],
                             margins=True)   # Include row and column totals
target_gender.columns = ["expired","funded","coltotal"]
target_gender.index= ["female","male","rowtotal"]
target_gender


In [None]:
target_gender_proportions=target_gender.div(target_gender["coltotal"],
                   axis=0)
target_gender_proportions

In [None]:
target_gender_proportions=target_gender_proportions.loc[['female', 'male'], :'expired']
target_gender_proportions.plot(kind="bar", 
                 figsize=(8,8),
                 stacked=False)