In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score

In [2]:
# Load the bank personal-loan table from a CSV given by a relative path.
csv_path = 'Bank_Personal_Loan_Modelling.csv'
df = pd.read_csv(csv_path)

In [3]:
# Random forest with library defaults; only the seed is pinned so fits are repeatable.
forest_seed = 42
ran_forest_classifier = RandomForestClassifier(random_state=forest_seed)

In [4]:
# The label column, followed by the feature columns: everything in the frame
# except the row identifier and the label itself.
model_target = "PersonalLoan"
model_features = df.columns.drop(["ID", model_target])

In [5]:
# Hold out 20% of the rows for evaluation; the seed makes the split repeatable.
split_frames = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
train_data, test_data = split_frames

In [6]:
# Count the training rows per target class to see how imbalanced the split is.
train_targets = train_data[model_target]
n_class_0 = sum(train_targets == 0)
n_class_1 = sum(train_targets == 1)
print('Class 0 samples in the training set:', n_class_0)
print('Class 1 samples in the training set:', n_class_1)

Class 0 samples in the training set: 3625
Class 1 samples in the training set: 375


In [7]:
# Balance the training set by oversampling class 1 (with replacement) until it
# has as many rows as class 0. Only `train_data` is resampled here; `test_data`
# is not touched by this cell.
td_target_0 = train_data[train_data[model_target] == 0]
td_target_1 = train_data[train_data[model_target] == 1]
upsampled_td_target_1 = td_target_1.sample(n=len(td_target_0), replace=True, random_state=42)
train_data = pd.concat([td_target_0, upsampled_td_target_1])
# Seed the shuffle as well: every other stochastic step in this notebook is
# pinned to 42, and an unseeded shuffle made the training row order (and thus
# potentially the fitted model) differ from run to run.
train_data = shuffle(train_data, random_state=42)
print('Class 0 samples in the training set:', sum(train_data[model_target] == 0))
print('Class 1 samples in the training set:', sum(train_data[model_target] == 1))

Class 0 samples in the training set: 3625
Class 1 samples in the training set: 3625


In [8]:
# Split the training frame into the feature matrix and the label vector.
X_train, y_train = train_data[model_features], train_data[model_target]

In [9]:
# Same feature/label separation for the held-out rows.
y_test = test_data[model_target]
X_test = test_data[model_features]

In [10]:
# Train the forest on the training features and labels; `fit` returns the
# estimator, which the notebook displays as the cell output.
ran_forest_classifier.fit(X=X_train, y=y_train)

RandomForestClassifier(random_state=42)

In [11]:
# Predicted class for every held-out row.
test_predictions = ran_forest_classifier.predict(X=X_test)

In [12]:
# Compute the evaluation artefacts first, then print them in a fixed order:
# confusion matrix, per-class report, overall accuracy.
test_confusion = confusion_matrix(y_test, test_predictions)
test_report = classification_report(y_test, test_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

print('Model performance on the test set:')
print(test_confusion)
print(test_report)
print("Test accuracy:", test_accuracy)

Model performance on the test set:
[[892   3]
 [  5 100]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       895
           1       0.97      0.95      0.96       105

    accuracy                           0.99      1000
   macro avg       0.98      0.97      0.98      1000
weighted avg       0.99      0.99      0.99      1000

Test accuracy: 0.992


In [13]:
import pickle

# Persist the fitted classifier to disk. The context manager closes the file
# even if pickling raises, which the manual open()/close() pair did not.
with open('model.pkl', 'wb') as file_pickle:
    pickle.dump(ran_forest_classifier, file_pickle)

In [14]:
# Reload the classifier from the pickle file. The context manager closes the
# handle, which was previously left open for the life of the kernel.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('model.pkl', 'rb') as new_model_file:
    new_model = pickle.load(new_model_file)

In [15]:
# Human-readable names for the two predicted classes (0 and 1).
label = dict(enumerate(('negative', 'positive')))

# First row of the dataset, restricted to the feature columns.
feature_frame = df[model_features]
customer_data = feature_frame.head(1)

In [16]:
# Predict the class of the first row in `customer_data` and print its label.
predicted_class = new_model.predict(customer_data)[0]
print('Prediction: %s' % label[predicted_class])

Prediction: negative


In [17]:
# Largest class probability for `customer_data`, shown as a percentage.
class_probabilities = new_model.predict_proba(customer_data)
top_percentage = np.max(class_probabilities * 100)
print('Probability: %.2f%%' % top_percentage)

Probability: 100.00%


In [18]:
# Take the feature columns of the first ten rows, then preview the rows from
# index label 9 onward.
feature_frame = df[model_features]
customer_data = feature_frame.head(10)
customer_data.loc[9:]

Unnamed: 0,Age,Experience,Income,ZIPCode,Family,CCAvg,Education,Mortgage,SecuritiesAccount,CDAccount,Online,CreditCard
9,34,9,180,93023,1,8.9,3,0,0,0,0,0


In [19]:
# Narrow `customer_data` to the rows from index label 9 onward (`.loc` slices by
# label, inclusive). On the ten-row frame built in the previous cell this keeps
# the single row labelled 9, which the bare expression below then displays.
customer_data = customer_data.loc[9:]
customer_data

Unnamed: 0,Age,Experience,Income,ZIPCode,Family,CCAvg,Education,Mortgage,SecuritiesAccount,CDAccount,Online,CreditCard
9,34,9,180,93023,1,8.9,3,0,0,0,0,0


In [20]:
# Report the predicted label and the largest class probability (as a
# percentage) for the selected customer.
predicted_class = new_model.predict(customer_data)[0]
class_probabilities = new_model.predict_proba(customer_data)
print('Prediction: %s' % label[predicted_class])
print('Probability: %.2f%%' % np.max(class_probabilities * 100))

Prediction: positive
Probability: 100.00%


In [21]:
# What-if scenario: override Income and CCAvg for the selected customer.
# `customer_data` was obtained by slicing another frame, so assigning columns
# on it directly triggers pandas' SettingWithCopyWarning. `assign` returns a
# new DataFrame with the two columns replaced and the column order unchanged.
customer_data = customer_data.assign(Income=100, CCAvg=7)

In [22]:
# Re-score the modified customer: predicted label first, then the largest
# class probability as a percentage.
scenario_class = new_model.predict(customer_data)[0]
print('Prediction: %s' % label[scenario_class])

scenario_probabilities = new_model.predict_proba(customer_data)
print('Probability: %.2f%%' % np.max(scenario_probabilities * 100))

Prediction: positive
Probability: 55.00%
