In [86]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from sklearn import mixture
import numpy as np
from scipy.stats import norm

In [87]:
data = pd.read_csv('lifeInsurance.txt', header = None, sep= '\s+')
data.columns = ['Gender', 'Age', 'MaritalStatus', 'NumberOfChildren', 'PhysicalStatus', 'ChronicDiseases', 'MonthlySalary', 'Decision']

# Extracting the features and target variable
X = data.drop(columns=["Decision"])
y = data['Decision']

# Separate the discrete and continuous features
discrete_features = ['Gender', 'MaritalStatus', 'NumberOfChildren', 'PhysicalStatus', 'ChronicDiseases']
continuous_features = ['Age', 'MonthlySalary']

# teste.txt example
'''data = pd.read_csv('teste.txt', header = None, sep= '\s+')
data.columns = ['age', 'income', 'student', 'credit', 'Decision']

# Extracting the features and target variable
X = data.drop(columns=["Decision"])
y = data['Decision']

# Separate the discrete and continuous features
discrete_features = ['age', 'income', 'student', 'credit']
continuous_features = []'''

'data = pd.read_csv(\'teste.txt\', header = None, sep= \'\\s+\')\ndata.columns = [\'age\', \'income\', \'student\', \'credit\', \'Decision\']\n\n# Extracting the features and target variable\nX = data.drop(columns=["Decision"])\ny = data[\'Decision\']\n\n# Separate the discrete and continuous features\ndiscrete_features = [\'age\', \'income\', \'student\', \'credit\']\ncontinuous_features = []'

In [88]:
# Prior probabilities
def prior_probabilities(y):
    total = y.shape[0]
    probabilities = {}

    for value in y.unique():
        probabilities[value] = y[y == value].shape[0] / total

    return probabilities

prior_prob = prior_probabilities(y)
print('Prior probabilities:')
for p, value in prior_prob.items():
    print(f'  P({p}) = {value:.4f} %')

Prior probabilities:
  P(0.0) = 0.6783 %
  P(1.0) = 0.3217 %


In [89]:
def likelihood_discrete(prob_discrete, X, y, instance):
    likelihoods = {}

    for decision in y.unique():
        likelihoods[decision] = 1

        for feature in discrete_features:
            value = instance[feature]
            likelihoods[decision] *= prob_discrete[feature][value][decision]

    return likelihoods

# Conditional probabilities for discrete features
def conditional_probabilities_discrete(X, y):
    probabilities = {}

    for feature in discrete_features:
        probabilities[feature] = {}

        for value in X[feature].unique():
            probabilities[feature][value] = {}

            for decision in y.unique():
                subset = X[(X[feature] == value) & (y == decision)]

                num_samples = subset.shape[0]
                num_decision_samples = y[y == decision].shape[0]

                probabilities[feature][value][decision] = num_samples / num_decision_samples if num_decision_samples > 0 else 0

    return probabilities

prob_discrete = conditional_probabilities_discrete(X, y)

print('Conditional probabilities for discrete features:')

for feature, values in prob_discrete.items():
    print(f"  Feature: {feature}")
    for value, decisions in values.items():
        print(f"    Value: {value}")
        for decision, prob in decisions.items():
            print(f"      Decision: {decision}, Probability: {prob:.4f}")
    print()

Conditional probabilities for discrete features:
  Feature: Gender
    Value: 0.0
      Decision: 0.0, Probability: 0.8747
      Decision: 1.0, Probability: 0.8238
    Value: 1.0
      Decision: 0.0, Probability: 0.1253
      Decision: 1.0, Probability: 0.1762

  Feature: MaritalStatus
    Value: 1.0
      Decision: 0.0, Probability: 0.4865
      Decision: 1.0, Probability: 0.6425
    Value: 0.0
      Decision: 0.0, Probability: 0.5135
      Decision: 1.0, Probability: 0.3575

  Feature: NumberOfChildren
    Value: 1.0
      Decision: 0.0, Probability: 0.2776
      Decision: 1.0, Probability: 0.2021
    Value: 3.0
      Decision: 0.0, Probability: 0.1990
      Decision: 1.0, Probability: 0.3679
    Value: 2.0
      Decision: 0.0, Probability: 0.2236
      Decision: 1.0, Probability: 0.2798
    Value: 0.0
      Decision: 0.0, Probability: 0.2998
      Decision: 1.0, Probability: 0.1503

  Feature: PhysicalStatus
    Value: 1.0
      Decision: 0.0, Probability: 0.5356
      Decision: 1.0

In [90]:
# Conditional probabilities for continuous features
def conditional_probabilities_continuous(prob_continuous, X, y, instance):
    probabilities = {}

    for feature in continuous_features:
        probabilities[feature] = {}

        for decision in y.unique():
            mean, std = prob_continuous[feature][decision]
            value = instance[feature]

            # Calculate the probability using the normal distribution
            prob = norm.pdf(value, loc=mean, scale=std)
            probabilities[feature][decision] = prob

    return probabilities

def mean_std_continuous(X, y):
    probabilities = {}

    for feature in continuous_features:
        probabilities[feature] = {}

        for decision in y.unique():
            subset = X[y == decision][feature]
            mean = subset.mean()
            std = subset.std()

            probabilities[feature][decision] = (mean, std)

    return probabilities

print('Mean and standard deviation for continuous features:')
prob_continuous = mean_std_continuous(X, y)
for feature, decisions in prob_continuous.items():
    print(f"  Feature: {feature}")
    for decision, (mean, variance) in decisions.items():
        print(f"    Decision: {decision}, Mean: {mean:.4f}, Variance: {variance:.4f}")
    print()

Mean and standard deviation for continuous features:
  Feature: Age
    Decision: 0.0, Mean: 57.8256, Variance: 20.8255
    Decision: 1.0, Mean: 70.8497, Variance: 20.9822

  Feature: MonthlySalary
    Decision: 0.0, Mean: 2752.8421, Variance: 686.7423
    Decision: 1.0, Mean: 2284.9327, Variance: 584.9082



In [91]:
instance = {
    'Gender': 0,
    'Age': 50,
    'MaritalStatus': 1,
    'NumberOfChildren': 2,
    'PhysicalStatus': 1,
    'ChronicDiseases': 0,
    'MonthlySalary': 3000
}
'''instance = {
    'age': 0,
    'income': 1,
    'student': 1,
    'credit': 0
}'''

likelihoods_discrete = likelihood_discrete(prob_discrete, X, y, instance)
print('Likelihoods for the instance:')
for decision, prob in likelihoods_discrete.items():
    print(f'  P(Decision={decision} | Instance) = {prob:.4f}')
print()

probabilities = conditional_probabilities_continuous(prob_continuous, X, y, instance)
print('Conditional probabilities for continuous features:')
for feature, decisions in probabilities.items():
    print(f"  Feature: {feature}")
    for decision, prob in decisions.items():
        print(f"    Decision: {decision}, Probability: {prob:.4f}")
    print()

Likelihoods for the instance:
  P(Decision=0.0 | Instance) = 0.0319
  P(Decision=1.0 | Instance) = 0.0154

Conditional probabilities for continuous features:
  Feature: Age
    Decision: 0.0, Probability: 0.0179
    Decision: 1.0, Probability: 0.0116

  Feature: MonthlySalary
    Decision: 0.0, Probability: 0.0005
    Decision: 1.0, Probability: 0.0003



In [None]:
# TODO
# fazer multiplicação das probabilidades continuas
# multiplicar com as discretas
# e multiplicar com a probabilidade a priori