In [126]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Sentinel output: this line is only reached if every import above succeeded.
print("Setup Complete")

In [127]:
# Seed NumPy's global random generator up front; this does not interact with
# reading the CSV below.
np.random.seed(0)

# Read the Prosper loan dataset from its CSV file into the working DataFrame.
proposerloan_filepath = "../input/prosper-loan/prosperLoanData.csv"
df = pd.read_csv(proposerloan_filepath)
print("Loding data Complete")

In [128]:
# List every column label of the freshly loaded frame.
df.columns

In [129]:
# Preview the first rows. Leaving the frame as the cell's last expression lets
# the notebook render it as a table instead of wrapped plain text.
df.head()

In [130]:
# Column dtypes and non-null counts. `DataFrame.info()` writes its report
# itself and returns None, so wrapping it in print() only adds a stray "None".
df.info()

In [131]:
# Descriptive statistics of the DataFrame, as reported by `DataFrame.describe()`.
df.describe()

# Simplifying the loan status variable: this is the prediction target

In [132]:
# Frequency of each distinct value currently stored in the LoanStatus column.
df['LoanStatus'].value_counts()

In [133]:
# Collapse LoanStatus into a binary target: 'Current', 'Completed' and
# 'Cancelled' loans become 1; every other status becomes 0.
# NOTE: the column is overwritten in place, so running this cell a second time
# would map every row to 0 (the values are no longer the original strings).
df['LoanStatus'] = np.where(
    df['LoanStatus'].isin(['Current', 'Completed', 'Cancelled']), 1, 0
)

In [134]:
# Count plot of the binary loan status; each bar is labelled with its share of
# all rows in the frame.
plt.figure(figsize=(5, 7))
ax = sns.countplot(x='LoanStatus', data=df, palette='BuPu')
ax.set_xlabel('Loan Status')
ax.set_ylabel('Occurence')

total = float(len(df))
for bar in ax.patches:
    bar_height = bar.get_height()
    right_edge = bar.get_x() + bar.get_width()
    # Anchor the label at the bar's top-right corner, right-aligned.
    ax.annotate(f'{100 * bar_height / total:.1f}%', (right_edge, bar_height), ha='right')
plt.show()

In [135]:
# One dedicated LabelEncoder per categorical column, so each encoder keeps its
# own fitted classes. LabelEncoder is imported with the other dependencies in
# the import cell at the top of the notebook.
le_Occupation = LabelEncoder()
le_IncomeRange = LabelEncoder()
le_BorrowerState = LabelEncoder()
le_CreditGrade = LabelEncoder()

In [136]:
# Replace each categorical column with the integer codes produced by fitting
# its dedicated encoder on that same column.
for column_name, encoder in (
    ('Occupation', le_Occupation),
    ('IncomeRange', le_IncomeRange),
    ('BorrowerState', le_BorrowerState),
    ('CreditGrade', le_CreditGrade),
):
    df[column_name] = encoder.fit_transform(df[column_name])

In [137]:
# EmploymentStatus: 1 only for the exact label 'Employed', 0 for any other value.
df['EmploymentStatus'] = df['EmploymentStatus'].eq('Employed').astype(int)
# IsBorrowerHomeowner: 1 where the flag equals True, 0 otherwise.
df['IsBorrowerHomeowner'] = df['IsBorrowerHomeowner'].eq(True).astype(int)


# Filling the NaN values

In [138]:
# Replace every NaN still present in the frame with 0; the same fill value is
# used for all columns.
df=df.fillna(0)

# Extracting X and Y datasets

In [139]:
#extracting x and y from dataframe
cells=['ListingNumber','LoanStatus','ListingKey','ListingCreationDate','ClosedDate'
,'ProsperRating (Alpha)','GroupKey','DateCreditPulled','FirstRecordedCreditLine','LoanKey','LoanOriginationDate'
,'LoanOriginationQuarter','MemberKey' ]
x=df.drop(columns=cells, axis=1)
y=df['LoanStatus']
print(y.value_counts(normalize=True)*100)
print(y.value_counts(normalize=True)*100)
x.head(10)

# Splitting the data to train and test

In [140]:
# Hold out 20% of the rows as the test set; stratifying on y keeps the class
# proportions aligned between the two parts.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)
# Shapes of the full / train / test sets: features first, then target.
print(x.shape, x_train.shape, x_test.shape)
print(y.shape, y_train.shape, y_test.shape)
# Class percentages inside each part of the split.
for target_part in (y_train, y_test):
    print(target_part.value_counts(normalize=True) * 100)

# Deleting the dependent features with a correlation higher than 0.95

In [141]:
# Pairwise correlation matrix of the training features.
# NOTE(review): `cor` is not referenced again in the visible cells — confirm
# whether it is still needed.
cor = x_train.corr()
def correlation(dataset, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    A column is selected when the absolute value of its correlation with at
    least one column positioned before it in ``dataset`` is strictly greater
    than ``threshold``.

    Parameters
    ----------
    dataset : DataFrame whose ``corr()`` matrix is inspected.
    threshold : cut-off compared against the absolute correlation coefficient.

    Returns
    -------
    set
        Names of the columns that satisfy the condition above.
    """
    corr_matrix = dataset.corr()
    size = len(corr_matrix.columns)
    # Strict lower-triangle mask: entry (i, j) is True only when j < i, so each
    # column is compared exclusively with the columns that precede it.
    earlier_column = np.tril(np.ones((size, size), dtype=bool), k=-1)
    exceeds = (np.abs(corr_matrix.values) > threshold) & earlier_column
    flagged_rows = exceeds.any(axis=1)
    return {name for name, flagged in zip(corr_matrix.columns, flagged_rows) if flagged}

In [142]:
# Find the training features whose absolute correlation with an earlier feature
# exceeds 0.95, then remove those same columns from both train and test sets.
corr_features = correlation(x_train, 0.95)
x_train, x_test = (
    part.drop(columns=corr_features, axis=1) for part in (x_train, x_test)
)
n = len(set(corr_features))
print(n, ' droped features :', '\n', corr_features)


In [143]:
# Column labels of the full feature matrix `x`.
# NOTE(review): `x` was built before the correlated columns were dropped from
# x_train / x_test, so this list still includes them — confirm whether
# `x_train.columns` was intended.
x.columns

# Fitting the model

In [144]:
# Decision tree classifier (the estimator is `tree.DecisionTreeClassifier`,
# not a logistic regression).
# random_state is pinned so the fitted tree does not depend on the state of
# NumPy's global generator, i.e. on which cells happened to run before this one.
model = tree.DecisionTreeClassifier(random_state=0)
model.fit(x_train,y_train)

In [145]:
# Cross-validated scores of the classifier on the training split only; no `cv`
# or `scoring` is passed, so scikit-learn's defaults apply.
cross_val_score(model,x_train, y_train)

# Performing prediction using the test dataset

In [147]:
# Predict on the held-out test set inside this cell, so `y_pred` does not have
# to be left over from an earlier kernel session (no other visible cell defines it).
# The metric functions come from the import cell at the top of the notebook.
y_pred = model.predict(x_test)
print('model score : ',model.score(x_test,y_test))
print('\n\n',confusion_matrix(y_test,y_pred))
print('\n\n',classification_report(y_test,y_pred))
print('\n\n',accuracy_score(y_test, y_pred))

# Displaying the confusion matrix

In [148]:
# Heat-map of the test-set confusion matrix, titled with the test accuracy.
# Predictions are computed here so the cell does not rely on a `y_pred`
# variable existing in the kernel.
con_matrix = confusion_matrix(y_test, model.predict(x_test))
test_score = model.score(x_test,y_test)
plt.figure(figsize=(8,8))
# fmt="d": the matrix holds integer counts, so they are annotated without decimals.
sns.heatmap(con_matrix, annot=True, fmt="d", linewidths=.5, square = True, cmap = 'BuGn_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(test_score)
plt.title(all_sample_title, size = 15);