In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import string
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

Reading data

In [2]:
loan = pd.read_csv('loan.csv')

  interactivity=interactivity, compiler=compiler, result=result)


Checking data and removing nulls

In [3]:
loan = loan[loan['desc'].notnull()]

In [4]:
loan.shape

(126067, 145)

In [5]:
loan['loan_status'].unique()

array(['Charged Off', 'Fully Paid', 'Current', 'Late (16-30 days)',
       'Late (31-120 days)', 'In Grace Period',
       'Does not meet the credit policy. Status:Fully Paid',
       'Does not meet the credit policy. Status:Charged Off'],
      dtype=object)

Creating dataframe with required columns

In [6]:
l_c=['desc','loan_status']
loan=loan[l_c]
loan.shape

(126067, 2)

Assigning loan Statuses to create a binary division of good and bad loans

In [7]:
loan['loan_status'] = loan['loan_status'].map({'Charged Off': 1, 'Fully Paid': 1,'Current':np.NaN,'Does not meet the credit policy. Status:Fully Paid':0,'Does not meet the credit policy. Status:Charged Off':0,'In Grace Period':0,'Late (16-30 days)':0,'Late (31-120 days)':0})
loan.shape

(126067, 2)

Prepared data for NLP

In [8]:
loan.dropna(axis=0,inplace=True)
loan.shape

(125774, 2)

In [9]:
loan['loan_status'].unique()

array([1., 0.])

Tokenizing

In [10]:
loan['desc_tok']=loan.apply(lambda row: nltk.word_tokenize(row['desc']), axis=1)

In [11]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anant\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Removing stop words and punctuations

In [12]:
useless_words = nltk.corpus.stopwords.words('english')+list(string.punctuation)

In [13]:
loan['tok']=loan.apply(lambda row:[word for word in row['desc_tok'] if not word in useless_words], axis=1)

Stemming

In [14]:
ps = PorterStemmer()
loan['stem']=loan.apply(lambda row:[ps.stem(word) for word in row['tok']], axis=1)

In [15]:
ps = PorterStemmer()
loan['last_out']=loan.apply(lambda row:[''.join(word) for word in row['stem'] if word not in string.punctuation],axis=1)

Creating final dataframe for NLP

In [16]:
loan['out'] = loan.last_out.map(lambda x: ' '.join(x))

In [17]:
loan=loan[['loan_status','out']]

Utilizing CountVectorizer

In [18]:
count_vect = CountVectorizer()  
counts = count_vect.fit_transform(loan['out'])

Creating Naive Bayes Model to predict status of loan from description

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

X_train, X_test, y_train, y_test = train_test_split(counts, loan['loan_status'], test_size=0.25, random_state=0)

from sklearn import metrics

model = MultinomialNB().fit(X_train, y_train)
predicted=model.predict(X_test)
predicted_train=model.predict(X_train)

print("Accuracy_rate_train:",metrics.accuracy_score(y_train, predicted_train))

print("Accuracy_rate_test:",metrics.accuracy_score(y_test, predicted))

Accuracy_rate_train: 0.9668504187427117
Accuracy_rate_test: 0.9627909935122758
