In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import pgmpy.models as models
import networkx as nx
from pgmpy.inference import VariableElimination

  from .autonotebook import tqdm as notebook_tqdm


<h2>FAIKR module 3 PROJECT</h2>
<p>The dataset considered is about the earnings and the expenses of 1000 people, with informations about their Credit Score, the objective of this project is to try to define a Bayesian Network that can describe the causal links between the expenses, the earnings and the other factors considered in this analisys.</p>
<p>the dataset considered is available at <a href="https://www.kaggle.com/datasets/conorsully1/credit-score">this link</a> on kaggle.com.</p>

In [2]:
partial_url='./credit_score_dataset'
df=pd.read_csv(partial_url+'/credit_score.csv')
df=df.drop('CUST_ID', axis=1)
mcl=40000 #mean cost of life in the U.S.A.
df['LIFESTYLE']=(df['INCOME']+df['SAVINGS']-df['T_EXPENDITURE_12'])/mcl
display(df.columns)

Index(['INCOME', 'SAVINGS', 'DEBT', 'R_SAVINGS_INCOME', 'R_DEBT_INCOME',
       'R_DEBT_SAVINGS', 'T_CLOTHING_12', 'T_CLOTHING_6', 'R_CLOTHING',
       'R_CLOTHING_INCOME', 'R_CLOTHING_SAVINGS', 'R_CLOTHING_DEBT',
       'T_EDUCATION_12', 'T_EDUCATION_6', 'R_EDUCATION', 'R_EDUCATION_INCOME',
       'R_EDUCATION_SAVINGS', 'R_EDUCATION_DEBT', 'T_ENTERTAINMENT_12',
       'T_ENTERTAINMENT_6', 'R_ENTERTAINMENT', 'R_ENTERTAINMENT_INCOME',
       'R_ENTERTAINMENT_SAVINGS', 'R_ENTERTAINMENT_DEBT', 'T_FINES_12',
       'T_FINES_6', 'R_FINES', 'R_FINES_INCOME', 'R_FINES_SAVINGS',
       'R_FINES_DEBT', 'T_GAMBLING_12', 'T_GAMBLING_6', 'R_GAMBLING',
       'R_GAMBLING_INCOME', 'R_GAMBLING_SAVINGS', 'R_GAMBLING_DEBT',
       'T_GROCERIES_12', 'T_GROCERIES_6', 'R_GROCERIES', 'R_GROCERIES_INCOME',
       'R_GROCERIES_SAVINGS', 'R_GROCERIES_DEBT', 'T_HEALTH_12', 'T_HEALTH_6',
       'R_HEALTH', 'R_HEALTH_INCOME', 'R_HEALTH_SAVINGS', 'R_HEALTH_DEBT',
       'T_HOUSING_12', 'T_HOUSING_6', 'R_HOUSING',

In [3]:
#Deleting non useful columns
transactions=['GROCERIES', 'CLOTHING', 'HOUSING', 'EDUCATION', 'HEALTH', 'TRAVEL', 'ENTERTAINMENT', 'GAMBLING', 'UTILITIES', 'TAX', 'FINES', 'EXPENDITURE']
for el in transactions:
    label='T_'+el+'_6'
    ratio='R_'+el
    df=df.drop(label, axis=1)
    df=df.drop(ratio, axis=1)
df=df.drop(['R_SAVINGS_INCOME', 'R_DEBT_INCOME', 'R_DEBT_SAVINGS', 'R_CLOTHING_INCOME', 'R_CLOTHING_SAVINGS', 'R_CLOTHING_DEBT', 
         'R_EDUCATION_INCOME', 'R_EDUCATION_SAVINGS', 'R_EDUCATION_DEBT', 'R_ENTERTAINMENT_INCOME', 'R_ENTERTAINMENT_SAVINGS',
         'R_ENTERTAINMENT_DEBT', 'R_FINES_INCOME', 'R_FINES_SAVINGS', 'R_FINES_DEBT', 'R_GROCERIES_INCOME', 'R_GROCERIES_SAVINGS',
         'R_GROCERIES_DEBT', 'R_HEALTH_INCOME', 'R_HEALTH_SAVINGS', 'R_HEALTH_DEBT', 'R_HOUSING_INCOME', 'R_HOUSING_SAVINGS',
         'R_HOUSING_DEBT', 'R_TAX_INCOME', 'R_TAX_SAVINGS', 'R_TAX_DEBT', 'R_TRAVEL_INCOME', 'R_TRAVEL_SAVINGS', 'R_TRAVEL_DEBT',
         'R_UTILITIES_INCOME', 'R_UTILITIES_SAVINGS', 'R_UTILITIES_DEBT', 'T_GAMBLING_12', 'R_GAMBLING_INCOME', 'R_GAMBLING_SAVINGS',
         'R_GAMBLING_DEBT', 'T_TAX_12', 'T_FINES_12', 'CAT_DEBT','R_EXPENDITURE_INCOME', 'R_EXPENDITURE_SAVINGS',
         'R_EXPENDITURE_DEBT','T_UTILITIES_12','T_EXPENDITURE_12'], axis=1)

<p>After dropping the columns considered not relevant in the network is necessary to map the values discretizing for every column.<br>We will consider some data as reference to define the ranges, trying to mantain a fair ammount of elements for range to reduce errors due to few informations.<p>

In [4]:
def map_credit_score(value):
    if value<580:
        return 'Poor'
    if value<670:
        return 'Fair'
    if value<740:
        return 'Good'
    else:
        return 'Very Good'
def map_savings(value):
    if value==0:
        return 'Zero'
    if value<300000:
        return 'Very Low'
    if value<600000:
        return 'Low'
    if value<1200000:
        return 'High'
    else:
        return 'Very High'
def map_income(value):
    if value==0:
        return 'Zero'
    if value<40000:
        return 'Very Low'
    if value<100000:
        return 'Medium'
    if value<200000:
        return 'High'
    else:
        return 'Very High'
def map_debt(value):
    if value==0:
        return 'Zero'
    if value<50000:
        return 'Very Low'
    if value<200000:
        return 'Medium'
    if value<750000:
        return 'High'
    else:
        return 'Very High'    
def map_groceries(value):
    if value<4000:
        return 'Very Low'
    if value<12000:
        return 'Medium'
    if value<20000:
        return 'High'
    else:
        return 'Very High'
def map_clothing(value):
    if value<1000:
        return 'Low'
    if value<5000:
        return 'Medium'
    if value<10000:
        return 'High'
    else:
        return 'Very High'
def map_education(value):
    if value==0:
        return 'Zero'
    if value<2000:
        return 'Low'
    if value<6000:
        return 'Medium'
    if value<12000:
        return 'High'
    else:
        return 'Very High'
def map_entertainment(value):
    if value==0:
        return 'Zero'
    if value<3000:
        return 'Low'
    if value<6000:
        return 'Medium'
    if value<18000:
        return 'High'
    else:
        return 'Very High'
def map_health(value):
    if value==0:
        return 'Zero'
    if value<3000:
        return 'Low'
    if value<6000:
        return 'Medium'
    if value<12000:
        return 'High'
    else:
        return 'Very High'
def map_housing(value):
    if value==0:
        return 'Zero'
    if value<3000:
        return 'Low'
    if value<6000:
        return 'Medium'
    if value<18000:
        return 'High'
    else:
        return 'Very High'
def map_travel(value):
    if value==0:
        return 'Zero'
    if value<10000:
        return 'Low'
    if value<20000:
        return 'Medium'
    if value<40000:
        return 'High'
    else:
        return 'Very High'
def map_lifestyle(value):
    if value==0:
        return 'Zero'
    if value<0:
        return 'Abysmal'
    if value<1:
        return 'Very Risky'
    if value<5:
        return 'Risky'
    if value<10:
        return 'Careful'
    else:
        return 'Very Careful'

<h1>Columns values</h1>

<h3>Credit Score</h3>
<ul>
<li>'Poor' -> Credit Score from 0 to 580</li>
<li>'Fair' -> Credit Score from 581 to 670</li>
<li>'Good' -> Credit Score from 671 to 740</li>
<li>'Very Good' -> Credit Score from 741</li>
</ul>
<h3>Savings</h3>
<ul>
<li>'Zero' -> No Savings</li>
<li>'Very Low' -> less than 300000$ in savings</li>
<li>'Low' -> savings betwwen 300001$ and 600000$</li>
<li>'High' -> savings between 600001$ and 1200000$</li>
<li>'Very High' -> more than 1200000$ in savings</li>
</ul>
<h3>Income</h3>
<ul>
<li>'Zero' -> No Income</li>
<li>'Very Low' -> Income less than 40000$</li>
<li>'Medium' -> Income between 40001$ and 100000$</li> 
<li>'High' -> Income between 100001$ and 200000$</li>
<li>'Very High' -> Income more than 200000$</li>
</ul>
<h3>Debt</h3>
<ul>
<li>'Zero' -> No Debt</li>
<li>'Very Low' -> Debt less than 50000$</li>
<li>'Medium' -> Debt between 50001$ and 200000$</li> 
<li>'High' -> Debt between 200001$ and 750000$</li>
<li>'Very High' -> Debt more than 750000$</li>
</ul>
<h3>Groceries</h3>
<ul>
<li>'Very Low' -> Expenses related to groceries less than 4000$</li>
<li>'Medium' -> Expenses related to groceries between 4001$ and 12000$</li> 
<li>'High' -> Expenses related to groceries between 12001$ and 20000$</li>
<li>'Very High' -> Expenses related to groceries more than 20000$</li>
</ul>
<h3>Clothing</h3>
<ul>
<li>'Low' -> Expenses related to clothing less than 1000$</li>
<li>'Medium' -> Expenses related to clothing between 1001$ and 5000$</li> 
<li>'High' -> Expenses related to clothing between 5001$ and 10000$</li>
<li>'Very High' -> Expenses related to clothing more than 10000$</li>
</ul>
<h3>Education</h3>
<ul>
<li>'Zero' -> No expenses related to education</li>
<li>'Very Low' -> Expenses related to education less than 2000$</li>
<li>'Medium' -> Expenses related to education between 2001$ and 6000$</li> 
<li>'High' -> Expenses related to education between 6001$ and 12000$</li>
<li>'Very High' -> Expenses related to education more than 12000$</li>
</ul>
<h3>Entertainment</h3>
<ul>
<li>'Zero' -> No expenses related to entertainment</li>
<li>'Very Low' -> Expenses related to entertainment less than 3000$</li>
<li>'Medium' -> Expenses related to entertainment between 3001$ and 6000$</li> 
<li>'High' -> Expenses related to entertainment between 6001$ and 18000$</li>
<li>'Very High' -> Expenses related to entertainment more than 18000$</li>
</ul>
<h3>Health</h3>
<ul>
<li>'Zero' -> No expenses related to health</li>
<li>'Very Low' -> Expenses related to health less than 3000$</li>
<li>'Medium' -> Expenses related to health between 3001$ and 6000$</li> 
<li>'High' -> Expenses related to health between 6001$ and 12000$</li>
<li>'Very High' -> Expenses related to health more than 12000$</li>
</ul>
<h3>Housing</h3>
<ul>
<li>'Zero' -> No expenses related to housing</li>
<li>'Very Low' -> Expenses related to housing less than 3000$</li>
<li>'Medium' -> Expenses related to housing between 3001$ and 6000$</li> 
<li>'High' -> Expenses related to housing between 6001$ and 18000$</li>
<li>'Very High' -> Expenses related to housing more than 18000$</li>
</ul>
<h3>Travel</h3>
<ul>
<li>'Zero' -> No expenses related to travelling</li>
<li>'Very Low' -> Expenses related to travelling less than 10000$</li>
<li>'Medium' -> Expenses related to travelling between 10001$ and 20000$</li> 
<li>'High' -> Expenses related to travelling between 20001$ and 40000$</li>
<li>'Very High' -> Expenses related to travelling more than 40000$</li>
</ul>
<h3>Lifestyle</h3>
<ul>
<li>'Zero' -> The indiviudal has no money after the expenses</li>
<li>'Abysmal' -> The individual is spending more than what they have</li>
<li>'Very Risky' -> The individual has very little money after subtracting the expenses</li> 
<li>'Risky' -> The individual has little money after subtracting the expenses</li>
<li>'Careful' -> The individual has a good margin subtracting the expenses</li>
<li>'Very Careful' -> The individual has a very good margin subtracting the expenses</li>
</ul>

<p>The other features are not modified from the kaggle dataset<p>

<h3>Gambling</h3>
<ul>
<li>'None' -> No gambling expenses</li>
<li>'Low' -> Few gambling expenses</li>
<li>'High' -> A lot of gambling expenses</li>
</ul> 
<h3>Credit_Card</h3>
<ul>
<li>1 if the customer has a credit card, 0 otherwise</li>
</ul>
<h3>Mortgage</h3>
<ul>
<li>1 if the customer has a mortgage, 0 otherwise</li>
</ul>
<h3>Sav_accounts</h3>
<ul>
<li>1 if the customer has a savings account, 0 otherwise</li>
</ul>
<h3>Dependents</h3>
<ul>
<li>1 if the customer has dependents, 0 otherwise</li>
</ul>

In [5]:
df['SAVINGS']=df['SAVINGS'].map(map_savings)
df['CREDIT_SCORE']=df['CREDIT_SCORE'].map(map_credit_score)
df['INCOME']=df['INCOME'].map(map_income)
df['DEBT']=df['DEBT'].map(map_debt)
df['T_GROCERIES_12']=df['T_GROCERIES_12'].map(map_groceries)
df['T_CLOTHING_12']=df['T_CLOTHING_12'].map(map_clothing)
df['T_EDUCATION_12']=df['T_EDUCATION_12'].map(map_education)
df['T_ENTERTAINMENT_12']=df['T_ENTERTAINMENT_12'].map(map_entertainment)
df['T_HEALTH_12']=df['T_HEALTH_12'].map(map_housing)
df['T_HOUSING_12']=df['T_HOUSING_12'].map(map_housing)
df['T_TRAVEL_12']=df['T_TRAVEL_12'].map(map_travel)
df['LIFESTYLE']=df['LIFESTYLE'].map(map_lifestyle)

In [6]:
df.rename(columns={'T_CLOTHING_12':'CLOTHING', 'T_EDUCATION_12':'EDUCATION', 'T_ENTERTAINMENT_12':'ENTERTAINMENT', 'T_GROCERIES_12':'GROCERIES', 'T_HEALTH_12':'HEALTH',
           'T_HOUSING_12':'HOUSING', 'T_TRAVEL_12':'TRAVEL', 'CAT_GAMBLING':'GAMBLING', 'CAT_CREDIT_CARD':'CREDIT_CARD', 'CAT_MORTGAGE': 'MORTGAGE',
           'CAT_SAVINGS_ACCOUNT':'SAV_ACCOUNT', 'CAT_DEPENDENTS':'DEPENDENTS'}, inplace=True)

In [7]:
import numpy as np

edges=[('INCOME','SAVINGS'),('SAV_ACCOUNT','SAVINGS'),
                        ('SAVINGS','TRAVEL'),('SAVINGS','HOUSING'),('SAVINGS','HEALTH'),('SAVINGS','EDUCATION'),
                        ('SAVINGS','GROCERIES'),('SAVINGS','CLOTHING'),('SAVINGS','ENTERTAINMENT'),
                        ('INCOME','TRAVEL'),('INCOME','HOUSING'),('INCOME','HEALTH'),('INCOME','EDUCATION'),
                        ('INCOME','GROCERIES'),('INCOME','CLOTHING'),('INCOME','ENTERTAINMENT'),
                        ('DEBT','TRAVEL'),('DEBT','HOUSING'),('DEBT','HEALTH'),('DEBT','EDUCATION'),
                        ('DEBT','GROCERIES'),('DEBT','CLOTHING'),('DEBT','ENTERTAINMENT'),
                        ('DEPENDENTS','EDUCATION'),('DEPENDENTS','GROCERIES'),('DEPENDENTS','CLOTHING'),
                        ('TRAVEL','LIFESTYLE'),('HOUSING','LIFESTYLE'),('HEALTH','LIFESTYLE'),('EDUCATION','LIFESTYLE'),
                        ('GROCERIES','LIFESTYLE'),('CLOTHING','LIFESTYLE'),('ENTERTAINMENT','LIFESTYLE'),
                        ('CREDIT_CARD','CREDIT_SCORE'),('LIFESTYLE','CREDIT_SCORE'),('DEBT','CREDIT_SCORE'),('GAMBLING','CREDIT_SCORE'),
                        ('MORTGAGE','CREDIT_SCORE'),('DEPENDENTS','CREDIT_SCORE'),
                        ('CREDIT_CARD','DEFAULT'),('LIFESTYLE','DEFAULT'),('DEBT','DEFAULT'),('GAMBLING','DEFAULT'),
                        ('MORTGAGE','DEFAULT'),('DEPENDENTS','DEFAULT')]

network=models.BayesianNetwork(edges)

<h3>Network Structure</h3>
<p>The Network is structured as follows:<br>
Income, Debt and Savings account are not influenced by anything in the network, instead Savings is influenced by the income a person earn and if the person has or not a Savings account. The other variables not influenced by anything are the gambling predisposition, owning a Credit Card, the Mortgage and having depentents.<br>
Instead the expenses are all influenced by the money a person can use and the debt they have, and the expenes influence the lifestyle of a person. The lifestyle, togheter with some of the non influenced factors determine the Credit Score and the Default of the costumer.</p>

In [8]:
network.fit(df)

from pgmpy.factors.discrete.CPD import TabularCPD
from contextlib import redirect_stdout

def print_full(cpd):
    backup = TabularCPD._truncate_strtable
    TabularCPD._truncate_strtable = lambda self, x: x
    print(cpd)
    TabularCPD._truncate_strtable = backup

with open('CPDs.txt', 'w') as f:
    with redirect_stdout(f):
         for cpd in network.get_cpds():
            print(f'CPT of {cpd.variable}')
            print_full(cpd)

<p>The CPTs are generated in a file to prevent a non comprehensible output for the notebook, this strategy will be used for the tabular descriptions utilized in the following queries</p>

In [9]:
#Defining functions to execute queries with Variable Elimination
def varElim(target, evidence, mode='Query'):
    infer = VariableElimination(network)
    if mode=='Query':
        prob = infer.query([target], evidence=evidence, show_progress=False)
    elif mode=='Map':
        prob = infer.map_query([target], evidence=evidence, show_progress=False)
    else:
        return 'Error'
    print('Probability of {} given {}'.format(target, evidence))
    print(prob)
    print()

<h3>Query 1</h3>
<p>In the first query we want to understand the probability of the output variables considering the disposable money of a costumer.<br>The tabular results are stored in the respective file.</p>

In [10]:
#Inferring Credit Score and Default considering high savings values and high income values
with open('Query1.txt', 'w') as f:
    with redirect_stdout(f):
        varElim('DEFAULT', {'SAVINGS':'High', 'INCOME':'High'})
        varElim('CREDIT_SCORE', {'SAVINGS':'High', 'INCOME':'High'})

        print('\n-----------------------------------------\n')

        varElim('DEFAULT', {'SAVINGS':'Very High', 'INCOME':'Very High'})
        varElim('CREDIT_SCORE', {'SAVINGS':'Very High', 'INCOME':'Very High'})

<p>Looking at the results we can observe that the probability of DEFAULT being 1 are much less in the case of very high income and savings, respecting common belief, meaning in a really rational way that the more money a person can dispose of the more probable is for that person to repay their debt.<br> Observing instead the Credit Score tables we see a really similar distribution, we can observe a little difference, quite obvious, in the 'Poor' and 'Fair' distributions, where the customers with more disposable money have an higher probability of having a 'Fair' Credit Score.</p>

<h3>Query 2</h3>
<p>In this query we want to determine the most probable values of the variables Income, Savings and Debt to obtain a positive result (the costumer did not declare default and has a optimal credit score).<br> the results are shown below.</p>

In [11]:
#Inferring Savings, Debt and Income considering a very good Credit Score and not Default values
varElim('SAVINGS', {'CREDIT_SCORE':'Very Good', 'DEFAULT':0}, mode='Map')
print()
varElim('INCOME', {'CREDIT_SCORE':'Very Good', 'DEFAULT':0}, mode='Map')
print()
varElim('DEBT', {'CREDIT_SCORE':'Very Good', 'DEFAULT':0}, mode='Map')

Probability of SAVINGS given {'CREDIT_SCORE': 'Very Good', 'DEFAULT': 0}
{'SAVINGS': 'Very Low'}


Probability of INCOME given {'CREDIT_SCORE': 'Very Good', 'DEFAULT': 0}
{'INCOME': 'Very Low'}


Probability of DEBT given {'CREDIT_SCORE': 'Very Good', 'DEFAULT': 0}
{'DEBT': 'Very High'}



<p>In this case the results reguarding the debt can be considered alligned with what we can expect because typically a lot of debt is associated with lots of different kinds of credit a person have, for example a good credit score considers martgages and other factors that usually cause debt to begin with.<br>The results reguarding savings and income are peculiar, that could be caused by a skewing of the data, in fact the data are not well balanced and this could influence the probabilities obtained by the fitting of the model.<p>

<h3>Query 3</h3>
<p>In this query we want to analize the effects of gambling when the lifestyle of a person is extremely expensive, showing the difference in the probability distribution of the credit score changing the value of the variable.<br>The tabular results are stored in the respective file.</p>

In [12]:
#Inferring Credit Score considering a very risky lifestyle but no gambling or the same lifestyle with a low or high ammount of money spent in gambling
with open('Query3.txt', 'w') as f:
    with redirect_stdout(f):
        varElim('CREDIT_SCORE', {'LIFESTYLE':'Very Risky', 'GAMBLING':'High'})
        print()
        varElim('CREDIT_SCORE', {'LIFESTYLE':'Very Risky', 'GAMBLING':'Low'})
        print()
        varElim('CREDIT_SCORE', {'LIFESTYLE':'Very Risky', 'GAMBLING':'No'})

<p>The results of this query show that in case of extreme lifestyle gambling seems to be considered positively. The explanation could be that, considering that the total expenses are extremely high with respect to the disposable money, in such cases gambling could cause a win, and an increase of disposable money, not possible in other cases. Anyway from the probability distribution we can see that the better assignment is the 'Low' value, meaning that a great expense in gambling is penalized if compared to a moderate one.</p>

<h3>Query 4</h3>
<p>In the fourth query we want to analize the changing in the credit score probability distribution considering if a costumer has a mortgage and/or dependents.<br>The tabular results are stored in the respective file.</p>

In [13]:
#Inferring Credit Score considering if the person has a mortgage and dependents
with open('Query4.txt', 'w') as f:
    with redirect_stdout(f):
        varElim('CREDIT_SCORE', {'MORTGAGE':0, 'DEPENDENTS':0})
        print()
        varElim('CREDIT_SCORE', {'MORTGAGE':1, 'DEPENDENTS':1})
        print()
        varElim('CREDIT_SCORE', {'MORTGAGE':0, 'DEPENDENTS':1})
        print()
        varElim('CREDIT_SCORE', {'MORTGAGE':1, 'DEPENDENTS':0})

<p>These results show that having assets is considered very positively for the Credit Score, in fact it is quite evident that the worst situation is the one with neither the mortgage or dependents.</p>

<h3>Query 5</h3>
<p>In the fifth query we observe the most probable assignements for the expenses considered the average american income.<br>The results are shown below.</p>

In [14]:
#Some other interesting inferences
#Inferences considering the mean american income (60.000$)
print('Considering the mean america income (60.000$):') 
varElim('GROCERIES', {'INCOME':'Medium'}, mode='Map') 
print()
varElim('HEALTH', {'INCOME':'Medium'}, mode='Map')
print() 
varElim('HOUSING', {'INCOME':'Medium'}, mode='Map') 
print()
varElim('TRAVEL', {'INCOME':'Medium'}, mode='Map')
print()
varElim('CLOTHING', {'INCOME':'Medium'}, mode='Map')
print()
varElim('EDUCATION', {'INCOME':'Medium'}, mode='Map')
print()
varElim('ENTERTAINMENT', {'INCOME':'Medium'}, mode='Map')

Considering the mean america income (60.000$):
Probability of GROCERIES given {'INCOME': 'Medium'}
{'GROCERIES': 'Medium'}


Probability of HEALTH given {'INCOME': 'Medium'}
{'HEALTH': 'Low'}


Probability of HOUSING given {'INCOME': 'Medium'}
{'HOUSING': 'Zero'}


Probability of TRAVEL given {'INCOME': 'Medium'}
{'TRAVEL': 'High'}


Probability of CLOTHING given {'INCOME': 'Medium'}
{'CLOTHING': 'Medium'}


Probability of EDUCATION given {'INCOME': 'Medium'}
{'EDUCATION': 'Zero'}


Probability of ENTERTAINMENT given {'INCOME': 'Medium'}
{'ENTERTAINMENT': 'High'}



<h3>Query 6</h3>
<p>In the sixth query we want to observe how having a savings account influences the default of a costumer.<br>The tabular results are shown below becuase the output is small.</p>

In [15]:
#Inferring default considering if the person has a savings account or not
varElim('DEFAULT', {'SAV_ACCOUNT':1})
varElim('DEFAULT', {'SAV_ACCOUNT':0})

Probability of DEFAULT given {'SAV_ACCOUNT': 1}
+------------+----------------+
| DEFAULT    |   phi(DEFAULT) |
| DEFAULT(0) |         0.5980 |
+------------+----------------+
| DEFAULT(1) |         0.4020 |
+------------+----------------+

Probability of DEFAULT given {'SAV_ACCOUNT': 0}
+------------+----------------+
| DEFAULT    |   phi(DEFAULT) |
| DEFAULT(0) |         0.5741 |
+------------+----------------+
| DEFAULT(1) |         0.4259 |
+------------+----------------+



<h3>Query 7</h3>
<p>In the last query we want to observe how the CREDIT_CARD value influences the default  and credit score of a costumer.<br>The tabular results are stored in the respective file.</p>

In [16]:
#Inferring Default and Credit Score considering if a person has a credit card
with open('Query7.txt', 'w') as f:
    with redirect_stdout(f):
        varElim('CREDIT_SCORE', {'CREDIT_CARD':0})
        print()
        varElim('CREDIT_SCORE', {'CREDIT_CARD':1})

<p>The last query offers a really interesting observation: we can actually notice that, as we expected, the Credit Score is highly inlfuenced by the CREDIT_CARD variable.<br>The Credit Card value 1 offers in fact a distribution where the probabilities of the high and low values ('Very High', 'High' and 'Poor') increse. That is due to the use of the credit cards, in fact having a credit card and using it properly will increase your Credit Score, but maxing out the credit limit or having similar behaviour will instead decrease your final Credit Score.</p>

<h3>Conclusions</h3>
<p>Looking at the results of the queries we can observe that the network is giving results quite alligned with the expectations, despite some results are not what we would expect. The network could be made better considering more evenly distributed data (in some cases where present only 2 elements, generating a non accurate fitting) and discussing the model with an expert, to refine the network and his edges and to have a precise insight of the causal links between variables. Also this network wants to define causal links between elements that influence the credit score, but these are not the only ones that determine it and expanding the network to consider more elements could improve the effectiveness significantly.</p>