# Predictions on whether a loan applicant might default

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides pandas chained-assignment and library
# deprecation warnings raised by later cells - consider narrowing the filter.
warnings.filterwarnings("ignore")

## Exploring the dataset

In [None]:
import pandas as pd

# Read the labelled training set and the unlabelled test set from the
# notebook's working directory, in that order.
train_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ("lc_trainingset.csv", "lc_testset.csv")
)

# Rich display of the training frame as the cell output.
train_df

In [None]:
# (rows, columns) of the training frame, then of the test frame, one per line.
print(train_df.shape, test_df.shape, sep="\n")

## Explore loan_status in train_df

In [None]:
# Count how many loans carry each distinct loan_status value.
train_df['loan_status'].value_counts()

In [None]:
import plotly.express as px

# Bar chart of the number of loans per loan_status value.
status_counts = train_df['loan_status'].value_counts()
fig = px.bar(status_counts, width=800, height=500)
fig.show()

## Relabelling the Loan Status

In [None]:
def change_loan_status(loan_status):
    """Map a loan status to a binary label.

    Parameters
    ----------
    loan_status : str or int
        Either the raw status text, or a 0/1 label previously produced by
        this function.

    Returns
    -------
    int
        0 when the status is 'Fully Paid' or 'Current', 1 for every other
        status. A value that is already 0 or 1 is returned unchanged.
    """
    # Pass already-encoded labels through. Without this, re-running the
    # relabelling cell would send every 0 into the fall-through branch and
    # silently turn the whole column into 1.
    if loan_status in (0, 1):
        return int(loan_status)
    if loan_status in ['Fully Paid', 'Current']:
        return 0
    return 1

# Replace the loan_status column with the label returned by
# change_loan_status for each row.
relabelled_status = train_df['loan_status'].map(change_loan_status)
train_df['loan_status'] = relabelled_status
train_df.head(20)

In [None]:
# Distribution of loan_status after the relabelling step above in the notebook.
train_df['loan_status'].value_counts()

In [None]:
# NOTE(review): `threshold` is not referenced anywhere else in the visible
# notebook - confirm whether it is still needed.
threshold = 0.3

# Fill gaps with the most frequent value of each column.
for column in ('mort_acc', 'pub_rec_bankruptcies', 'emp_length'):
    most_frequent = train_df[column].mode()[0]
    train_df[column] = train_df[column].fillna(most_frequent)

# Fill gaps in revol_util with the column median.
revol_util_median = train_df['revol_util'].median()
train_df['revol_util'] = train_df['revol_util'].fillna(revol_util_median)

# Remaining missing values per column.
print(train_df.isnull().sum())

# Exploratory Data Analysis

In [None]:
import plotly.express as px

# Pairwise scatter plots of the selected columns, coloured by loan_status.
pair_plot_columns = ['loan_amnt', 'term', 'int_rate', 'revol_util', 'home_ownership', 'emp_length']
fig = px.scatter_matrix(train_df, dimensions=pair_plot_columns, color="loan_status")

# Hide the diagonal panels and style the markers in one pass.
marker_style = dict(
    size=5,
    colorscale='Bluered',
    line=dict(width=0.5, color='rgb(230,230,230)'),
)
fig.update_traces(diagonal_visible=False, marker=marker_style)

fig.update_layout(
    title="Scatterplot Matrix for LendingClub Dataset",
    dragmode='select',
    width=1000,
    height=1000,
)

fig.show()

# Feature Engineering

In [None]:
import re
from sklearn.preprocessing import LabelEncoder

# NOTE: this cell rewrites train_df (and test_df['loan_amnt']) in place and is
# meant to run once per kernel; emp_length stops being a string column at the
# end, so a second run fails on `.str`.

# --- Outlier trimming -------------------------------------------------------
# Keep only rows whose loan_amnt lies strictly between the 5th and 95th
# percentiles.
upper_lim = train_df['loan_amnt'].quantile(.95)
lower_lim = train_df['loan_amnt'].quantile(.05)

# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the original DataFrame.
train_df = train_df[(train_df['loan_amnt'] < upper_lim) & (train_df['loan_amnt'] > lower_lim)].copy()

# --- Min-max scaling of loan_amnt -------------------------------------------
loan_amnt_min = train_df['loan_amnt'].min()
loan_amnt_range = train_df['loan_amnt'].max() - loan_amnt_min
train_df['loan_amnt'] = (train_df['loan_amnt'] - loan_amnt_min) / loan_amnt_range

# The model is fitted on the scaled column, so the test set must go through
# the same transformation with the *training* min/range. It is done here
# because those two values are only available at this point.
test_df['loan_amnt'] = (test_df['loan_amnt'] - loan_amnt_min) / loan_amnt_range

# --- Label encoding ---------------------------------------------------------
label_encoder = LabelEncoder()
# Columns to replace with integer codes (the encoder is re-fitted per column).
for column in ('grade', 'initial_list_status', 'application_type'):
    train_df[column] = label_encoder.fit_transform(train_df[column])

# --- Postal code: the five digits at the end of the address -----------------
train_df['postal_code'] = train_df['address'].str.extract(r'(\d{5})$', expand=False).astype(int)

# --- One-hot encoding -------------------------------------------------------
# errors='ignore' keeps the drop from raising KeyError when one of the three
# categories does not occur in the (already filtered) data.
ownership_OH_encoding = pd.get_dummies(train_df['home_ownership']).drop(
    columns=['NONE', 'ANY', 'OTHER'], errors='ignore')
train_df = pd.concat([train_df, ownership_OH_encoding], axis=1)

verification_OH_encoding = pd.get_dummies(train_df['verification_status'])
train_df = pd.concat([train_df, verification_OH_encoding], axis=1)

# --- Employment length: first run of digits in the text, as an integer ------
pattern = r"(\d+)"
train_df['emp_length'] = train_df['emp_length'].str.extract(pattern, expand=False).astype(int)
train_df.head(10)

# Model Building and Model Evaluation

In [None]:
from xgboost import XGBClassifier

# Columns fed to the model.
features = ['postal_code', 'int_rate', 'loan_amnt','installment', 'annual_inc', 'MORTGAGE', 'OWN', 'RENT', 'emp_length', 'pub_rec_bankruptcies', 'open_acc', 'dti', 'revol_bal', 'Not Verified', 'grade', 'initial_list_status', 'application_type']
X = train_df[features]
y = train_df['loan_status']

# Hold out 30% of the rows for evaluation; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=5)


# Instantiate the model object
model = XGBClassifier(learning_rate=0.1, random_state=5)

# Fit the model with the training data
model.fit(X_train, y_train)

# predict the target on the test dataset
y_predict = model.predict(X_test)
print('\nPrediction on test data', y_predict)

# Accuracy Score on test dataset
accuracy_test = metrics.accuracy_score(y_test, y_predict)
print('\nAccuracy_score on test dataset : ', accuracy_test)

# ROC AUC on the same hold-out split. The notebook ultimately exports
# predicted probabilities, so a threshold-free score of those probabilities
# is reported next to the thresholded accuracy.
y_proba = model.predict_proba(X_test)[:, 1]
roc_auc_test = metrics.roc_auc_score(y_test, y_proba)
print('\nROC AUC on test dataset : ', roc_auc_test)

# Generate and Export Predictions from your Final Model

## Re-fit your final model on lc_trainingset.csv

In [None]:
# Final feature matrix and target: the `features` list defined in the
# model-building cell holds exactly the columns selected for the final model.
X = train_df[features]
y = train_df['loan_status']

# Sanity check: both lengths should be equal.
print(len(X))
print(len(y))

# Same hyper-parameters as before, now fitted on every available training row.
model = XGBClassifier(learning_rate=0.1, random_state=5)
model.fit(X, y)

## Generate predictions for the test_df

In [None]:
# Replicate the feature engineering you did to the train_df

label_encoder = LabelEncoder()
# Columns to replace with integer codes (the encoder is re-fitted per column).
# NOTE(review): the encoder is fitted on test_df here, so the codes line up
# with the training codes only if both files contain the same set of
# categories in each column - verify.
for column in ('grade', 'initial_list_status', 'application_type'):
    test_df[column] = label_encoder.fit_transform(test_df[column])

# errors='ignore' keeps the drop from raising KeyError when the test file
# happens not to contain one of these three categories.
test_ownership_OH_encoding = pd.get_dummies(test_df['home_ownership']).drop(
    columns=['NONE', 'ANY', 'OTHER'], errors='ignore')
test_df = pd.concat([test_df, test_ownership_OH_encoding], axis=1)
verify_ownership_OH_encoding = pd.get_dummies(test_df['verification_status'])
test_df = pd.concat([test_df, verify_ownership_OH_encoding], axis=1)

# Fill gaps with the most frequent value (median for revol_util).
# NOTE(review): these fill values are computed from test_df itself.
for column in ('mort_acc', 'pub_rec_bankruptcies', 'emp_length'):
    test_df[column] = test_df[column].fillna(test_df[column].mode()[0])
test_df['revol_util'] = test_df['revol_util'].fillna(test_df['revol_util'].median())

# Postal code: the five digits at the end of the address.
test_df['postal_code'] = test_df['address'].str.extract(r'(\d{5})$', expand=False).astype(int)

# Employment length: first run of digits in the text, as an integer.
pattern = r"(\d+)"
test_df['emp_length'] = test_df['emp_length'].str.extract(pattern, expand=False).astype(int)

print(test_df.shape)

In [None]:
# Select the same columns the model was trained on (the `features` list from
# the model-building cell), then score them with the final model.
kaggle_x = test_df[features]
probabilities = model.predict_proba(kaggle_x)
probabilities

In [None]:
# Keep only column index 1 of the predict_proba output
# (presumably the probability of loan_status == 1 - confirm against model.classes_).
kaggle_preds = probabilities[:,1]  # Extract probabilities from the rightmost column
len(kaggle_preds)                  # Output should be 78237

Use the <code>.to_csv()</code> method to write your predictions to a CSV file, which is the format required for your Kaggle submission.

In [None]:
# Two-column submission table: the test row identifier and its predicted value.
submission_columns = {'Id': test_df['id'], 'Predicted': kaggle_preds}
predictions_df = pd.DataFrame(submission_columns)

# Write without the DataFrame index so the file contains only Id and Predicted.
predictions_df.to_csv('my_predictions.csv', index=False)