# Project 1: Bank Credit
### adrianty & sondrewo

### The implementation of the functions for `name_banker.py` can be found in the source code folder.

We started our model development by inspecting the different columns in the data set, identifying both numerical and discrete  features. As is known, the Naive Bayes classifier supports catergorical features natively and can be adjusted to use numerical ones as well. Thus, we formulated the following hypothesis:

H<sub>0</sub> : The Multinomial Naive Bayes classifier will provide a high accuracy score.

We then attempted to falsify this hypothesis by testing out different models: Logistic regression, KNN and the Baussian and Bernoillu NB.

In [17]:
from sklearn.naive_bayes import MultinomialNB 
from sklearn.naive_bayes import BernoulliNB 
from sklearn.naive_bayes import GaussianNB 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import pandas
import matplotlib.pyplot as plt

In [21]:
PATH = "./data/credit/D_valid.csv"
features = ['checking account balance', 'duration', 'credit history',
            'purpose', 'amount', 'savings', 'employment', 'installment',
            'marital status', 'other debtors', 'residence time',
            'property', 'age', 'other installments', 'housing', 'credits',
            'job', 'persons', 'phone', 'foreign']
target = 'repaid'

df = pandas.read_csv(PATH, sep=' ',
                     names=features+[target])

In [22]:
numerical_features = ['duration', 'age', 'residence time', 'installment', 'amount', 'persons', 'credits']
quantitative_features = list(filter(lambda x: x not in numerical_features, features))
X = pandas.get_dummies(df, columns=quantitative_features, drop_first=True)
encoded_features = list(filter(lambda x: x != target, X.columns))

In [23]:
def test_decision_maker(X_test, y_test, interest_rate, decision_maker):
    n_test_examples = len(X_test)
    utility = 0

    ## Example test function - this is only an unbiased test if the data has not been seen in training
    total_amount = 0
    total_utility = 0
    decision_maker.set_interest_rate(interest_rate)
    for t in range(n_test_examples):
        action = decision_maker.get_best_action(X_test.iloc[t])
        good_loan = y_test.iloc[t] # assume the labels are correct
        duration = X_test['duration'].iloc[t]
        amount = X_test['amount'].iloc[t]
        # If we don't grant the loan then nothing happens
        if (action==1):
            if (good_loan != 1):
                utility -= amount
            else:    
                utility += amount*(pow(1 + interest_rate, duration) - 1)
        total_utility += utility
        total_amount += amount
    return utility, total_utility/total_amount

In [38]:
import name_banker
from sklearn.model_selection import train_test_split

models = {"KNN": KNeighborsClassifier(n_neighbors=5), "BernoulliNB": BernoulliNB(), "MultinomialNB": MultinomialNB(), "Log.regression": LogisticRegression(max_iter=1500)}

interest_rate = 0.017
results = {}

n_tests = 100

### Do a number of preliminary tests by splitting the data in parts
def run_test():
    for name, model in models.items():
        print(name)
        decision_maker = name_banker.NameBanker(model)
        utility = 0
        investment_return = 0
        for iter in range(n_tests):
            X_train, X_test, y_train, y_test = train_test_split(X[encoded_features], X[target], test_size=0.2)
            decision_maker.set_interest_rate(interest_rate)
            decision_maker.fit(X_train, y_train)
            Ui, Ri = test_decision_maker(X_test, y_test, interest_rate, decision_maker)
            utility += Ui
            investment_return += Ri
        results[name] = [utility, investment_return]

### Run experiments with n_tests = 100, for all the models:

In [39]:
run_test()

KNN
BernoulliNB
MultinomialNB
Log.regression


In [52]:
pandas.DataFrame(results.items(), columns=["Model", "Total Utility, Avg Investment Return"])

Unnamed: 0,Model,"Total Utility, Avg Investment Return"
0,KNN,"[85413668.0349148, 1860.763931366687]"
1,BernoulliNB,"[194758796.3967455, 4966.488676620424]"
2,MultinomialNB,"[535516088.42545235, 12890.98462595138]"
3,Log.regression,"[140809454.4726863, 3353.135428685509]"


Based on these results, we then chose to keep our hypothesis H<sub>0</sub> and continue the development using the Multinomial NB model.