# Library and Data Import

In [103]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import defaultdict

# Fetch the dataset
car_dataset = fetch_ucirepo(id=19)

# Data unpacking
X = car_dataset.data.features
y = car_dataset.data.targets

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2)

# variable information 
print(car_dataset.variables) 

       name     role         type demographic  \
0    buying  Feature  Categorical        None   
1     maint  Feature  Categorical        None   
2     doors  Feature  Categorical        None   
3   persons  Feature  Categorical        None   
4  lug_boot  Feature  Categorical        None   
5    safety  Feature  Categorical        None   
6     class   Target  Categorical        None   

                                         description units missing_values  
0                                       buying price  None             no  
1                           price of the maintenance  None             no  
2                                    number of doors  None             no  
3              capacity in terms of persons to carry  None             no  
4                           the size of luggage boot  None             no  
5                        estimated safety of the car  None             no  
6  evaulation level (unacceptable, acceptable, go...  None             no  

In [2]:
car_dataset.metadata

{'uci_id': 19,
 'name': 'Car Evaluation',
 'repository_url': 'https://archive.ics.uci.edu/dataset/19/car+evaluation',
 'data_url': 'https://archive.ics.uci.edu/static/public/19/data.csv',
 'abstract': 'Derived from simple hierarchical decision model, this database may be useful for testing constructive induction and structure discovery methods.',
 'area': 'Other',
 'tasks': ['Classification'],
 'characteristics': ['Multivariate'],
 'num_instances': 1728,
 'num_features': 6,
 'feature_types': ['Categorical'],
 'demographics': [],
 'target_col': ['class'],
 'index_col': None,
 'has_missing_values': 'no',
 'missing_values_symbol': None,
 'year_of_dataset_creation': 1988,
 'last_updated': 'Thu Aug 10 2023',
 'dataset_doi': '10.24432/C5JP48',
 'creators': ['Marko Bohanec'],
 'intro_paper': {'ID': 249,
  'type': 'NATIVE',
  'title': 'Knowledge acquisition and explanation for multi-attribute decision making',
  'authors': 'M. Bohanec, V. Rajkovič',
  'venue': '8th Intl Workshop on Expert Syst

## Data Exploration

In [3]:
print(type(X))
print(type(y))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [4]:
print(f"{X.describe()} \n")

print(X.sample(20))

       buying  maint doors persons lug_boot safety
count    1728   1728  1728    1728     1728   1728
unique      4      4     4       3        3      3
top     vhigh  vhigh     2       2    small    low
freq      432    432   432     576      576    576 

     buying  maint  doors persons lug_boot safety
1010    med   high      3       4    small   high
1188    med    low      2       2    small    low
1175    med    med  5more       4      med   high
1421    low   high      2       4      big   high
539    high  vhigh  5more    more      big   high
230   vhigh    med      2       4      med   high
815    high    low      4       2      med   high
1369    low  vhigh      4    more    small    med
1180    med    med  5more    more    small    med
1395    low  vhigh  5more    more    small    low
27    vhigh  vhigh      3       2    small    low
1416    low   high      2       4      med    low
324   vhigh    low      2       2    small    low
1148    med    med      4       4      med 

In [5]:
df = pd.concat([X,y],axis=1)
df.sample(10)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
62,vhigh,vhigh,4,2,big,high,unacc
1140,med,med,4,2,big,low,unacc
677,high,med,3,2,small,high,unacc
894,med,vhigh,3,2,med,low,unacc
528,high,vhigh,5more,4,big,low,unacc
59,vhigh,vhigh,4,2,med,high,unacc
91,vhigh,vhigh,5more,4,small,med,unacc
192,vhigh,high,5more,2,med,low,unacc
318,vhigh,med,5more,more,med,low,unacc
604,high,high,4,4,small,med,unacc


In [6]:
y['class'].unique()

array(['unacc', 'acc', 'vgood', 'good'], dtype=object)

In [65]:
df_train = pd.concat([X_train,y_train],axis=1)
df_train.shape

(1382, 7)

# Naive Bayes algorithm

In a Naives Bayes classifier, one can learn the probability that an outcome happens given the conditions of the feature. Taken the tennis example, when given that the day is sunny (S), temperature is cool (C), humidity is high (H), and wind is strong (St), the desired output is the probability that players come to the court. Using Naive Bayes assumption, one can calculate this probability as 

$$p(Play\_Tennis = Y|Sunny,Cool,High,Strong) = p(Y)\times p(S|Y)\times p(C|Y)\times p(H|Y)\times p(St|Y)$$

The prior probability of Play_Tennis = Y is calculated as the number of yes occurences over the total number of samples. The conditional probabilities (such as $p(S|Y)$) is determined from the given dataset as the number of occurences where Play_Tennis is Yes and Outlook is Sunny over the total number of occurences where Play_Tennis is Yes. 
$$p(Outlook = Sunny|Play\_Tennis=Yes) = \frac{\text{\# S and Y}}{\text{\# Y}}$$


In the car evaluation example, there are three outcomes - unacceptable, acceptable, good, and vgood. The categories of the features were explored in the previous step and reported here
* buying: The buying price has 4 categories - low, med, high, and vhigh
* maint: The maintenance price has the similar 4 categories
* doors: The number of doors has 4 categories - 2, 3, 4, and 5more
* person: The number of passenger capacity has 3 categories - 2, 4, and more
* lug_boot: The size of luggage boot has 3 categories - small, med, and big
* safety: The estimated car safety has 3 categories - low, med, and high


## Functions for determining prior and conditional probabilities 

In [None]:
def count_occurrences(df, conditions):
    
    """
    Counts the number of occurrences in a DataFrame that match given conditions.

    Parameters
    - df: DataFrame to count occurrences from.
    - conditions: Dictionary specifying conditions (e.g., {'column_name': 'value'}).

    Returns:
    - int: The count of occurrences matching the conditions.
    """
    # Start with the entire DataFrame and filter down based on conditions
    filtered_df = df
    for column, value in conditions.items():
        filtered_df = filtered_df[filtered_df[column] == value]
    return len(filtered_df)

print(f"Number of occurences where: \n"
      f"- Buying price is VHIGH: {count_occurrences(df,{'buying':'vhigh'})}\n"
      f"- Class is VGOOD: {count_occurrences(df,{'class':'vgood'})}\n"
      f"- Buying price is VHIGH and class is GOOD: {count_occurrences(df, {'buying': 'vhigh','class':'good'})} \n"
      f"- Buying price is HIGH and class is UNACC: {count_occurrences(df, {'buying': 'vhigh','class':'unacc'})} \n"
      f"- Safety is HIGH and class is UNACC: {count_occurrences(df, {'safety': 'vhigh','class':'unacc'})} \n"
      f"- Doors is 5MORE and class is GOOD: {count_occurrences(df, {'doors': '5more','class':'good'})} \n"
      f"- Maintenance price is HIGH and class is GOOD: {count_occurrences(df, {'maint': 'high','class':'good'})}")

Number of occurences where: 
- Buying price is VHIGH: 432
- Class is VGOOD: 65
- Buying price is VHIGH and class is GOOD: 0 
- Buying price is HIGH and class is UNACC: 360 
- Safety is HIGH and class is UNACC: 0 
- Doors is 5MORE and class is GOOD: 18 
- Maintenance price is HIGH and class is GOOD: 0


In [77]:
df.columns

Index(['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class'], dtype='object')

In [None]:
def find_prior_prob(df, condition):
    ''' 
    This function finds the prior probability P(a=A) by determining the number of samples in the entire dataset and the number of occurences of A

    Parameters:
    - df: DataFrame to count occurrences from.
    - conditions: Dictionary specifying the condition or the value of event A (e.g. {'buying'}:{'high'})

    Returns:
    - float: The prior probability that even A happens 
    '''
    num_instances = len(df)
    num_cond = count_occurrences(df, condition)
    
    prior_prob = num_cond / num_instances

    return prior_prob

def find_prior_probs(df, label_column):
    '''
    This function finds the prior probability of each class in the dataframe and report them in a dictionary

    Parameters:
    - df (pd dataframe): DataFrame from which the probability is determined
    - label_colum (str): name of the class/label column

    Returns:
    - class_prior (dict): the dictionary containing class (key) and their corresponding prior probabilities (value)
    '''
    classes = df[label_column].unique()                 # List of the classes
    num_instances = len(df)
    class_prior_probs = {}

    for class_value in classes:
        class_count = df[df[label_column]==class_value].shape[0]
        class_prior_probs[str(class_value)] = class_count / num_instances

    return class_prior_probs

def find_cond_prob(df, prior_condition, condition):
    ''' 
    This function finds the conditional probability P(a=A|b=B) by first finding the number of B occurences in the data, then find the occurences of A given that B occured
    
    Parameters:
    - df: DataFrame to count occurrences from.
    - prior_condition: Dictionary specifying the condition or the value of event B (e.g. {'class'}:{'good'})
    - conditions: Dictionary specifying the condition or the value of event A (e.g. {'buying'}:{'high'})


    Returns:
    - float: The conditional probability P(A|B)
    '''
    # Filter down the dataframe to only include the occurences that satisfy the prior condition
    for column, value in prior_condition.items():
        filtered_df = df[df[column] ==  value]
    num_instances = len(filtered_df)

    # Count the number of occurences that satify the condition, given that the prior condition is already satisfied
    # (In other words, count the occurences that satisfy the condition from the filtered dataframe)
    num_cond = count_occurrences(filtered_df,condition)
    
    cond_prob = num_cond/num_instances

    return cond_prob 

def find_cond_probs(df, label_column):
    classes = df[label_column].unique()
    class_prior_probs = find_prior_probs(df,label_column)

    feature_cond_probs = defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
    '''This is a nested dictionary of the following structure
        - Level 1 - Class Values: This level has the class values and the next dictionary level as the key-value pair
            - Level 2 - Feature Names: This level of dictionary has the feature name and the next dictionary level as the key-value pair
                - Level 3 - Feature Values: This level of dictionary has the feature value and the conditional probabilities P(feature == feature value | class == class value) as the key-value pair 
    '''

    # Considering the conditional probability feature by feature
    for feature in df.columns:
        # Skip this step if the feature is the label column of the dataframe
        if feature == label_column:
            pass
        
        # Iterate over each class c of the dataframe to find the conditional probabilities of each feature P(feature = value | class)
        for c in classes: 
            df_subset = df[df[label_column]==c]                         # Subset of the dataframe where class has the value c
            feature_count = df_subset[feature].value_counts()           # Get a series with the number of instances of each feature value in the subset

            # Iterate over all the feature values and calculate the conditional probabilities of each value 
            for value, count in feature_count.items():                  
                feature_cond_probs[c][feature][value] = count / len(df_subset)

    return feature_cond_probs

## Evaluating specific conditional probabilities
This section is to test the find_prior_prob() and find_cond_prob() functions, which determine the prior and condition probabilities given a single specific condition

In [124]:
print(f"The prior probability of the class LOW: {find_prior_prob(df_train, {'class':'unacc'})}")

print(f"The probability that buying price is LOW, given that the class is VGOOD is: {find_cond_prob(df_train, {'class':'vgood'}, {'buying':'low'})} \n"
      f"The probability that the class is VGOOD, given that buying price is LOW is: {find_cond_prob(df_train, {'buying':'low'}, {'class':'vgood'})}")

print(f" => From this finding, one can conclude that if the car is VGOOD, the buying price of the car is most likely LOW.\n" 
      f"    However, having a low price does not guarantee that the car is classified as VGOOD since there are other deterrent criteria such as high maintenance price or low safety\n")

print(f"The conditional probability that buying price is LOW, given that the class is VGOOD is: {find_cond_prob(df_train, {'class':'vgood'}, {'buying':'low'})} \n"
      f"The conditional probability that buying price is MED, given that the class is VGOOD is: {find_cond_prob(df_train, {'class':'vgood'}, {'buying':'med'})} \n"
      f"The conditional probability that buying price is HIGH, given that the class is VGOOD is: {find_cond_prob(df_train, {'class':'vgood'}, {'buying':'high'})} \n"
      f"The conditional probability that buying price is VHIGH, given that the class is VGOOD is: {find_cond_prob(df_train, {'class':'vgood'}, {'buying':'vhigh'})}")

print(" => From this finding, one can conclude that if the car is VGOOD, the buying price of the car can only be LOW or MED and never HIGH or VHIGH \n")

print(f"The conditional probability that buying price is LOW, given that the class is UNACC is: {find_cond_prob(df_train, {'class':'unacc'}, {'buying':'low'})} \n"
      f"The conditional probability that buying price is MED, given that the class is UNACC is: {find_cond_prob(df_train, {'class':'unacc'}, {'buying':'med'})} \n"
      f"The conditional probability that buying price is HIGH, given that the class is UNACC is: {find_cond_prob(df_train, {'class':'unacc'}, {'buying':'high'})} \n"
      f"The conditional probability that buying price is VHIGH, given that the class is UNACC is: {find_cond_prob(df_train, {'class':'unacc'}, {'buying':'vhigh'})}")

print(" => From this finding, one can conclude that if the car is UNACC, the buying price can be of any value but more likely HIGH or VHIGH.")

The prior probability of the class LOW: 0.6931982633863966
The probability that buying price is LOW, given that the class is VGOOD is: 0.5849056603773585 
The probability that the class is VGOOD, given that buying price is LOW is: 0.09281437125748503
 => From this finding, one can conclude that if the car is VGOOD, the buying price of the car is most likely LOW.
    However, having a low price does not guarantee that the car is classified as VGOOD since there are other deterrent criteria such as high maintenance price or low safety

The conditional probability that buying price is LOW, given that the class is VGOOD is: 0.5849056603773585 
The conditional probability that buying price is MED, given that the class is VGOOD is: 0.41509433962264153 
The conditional probability that buying price is HIGH, given that the class is VGOOD is: 0.0 
The conditional probability that buying price is VHIGH, given that the class is VGOOD is: 0.0
 => From this finding, one can conclude that if the car 

## Determine the conditional probabilities of all cases
This section shows the result of the functions find_prior_probs() and find_cond_probs(), which determine the prior and conditional probabilities for all classes and all probability 
$$P(\text{feature values}|\text{class values})$$

In [120]:
result = find_prior_probs(df_train,'class')

for key, value in result.items():
    print(f"The prior probability of class = {key} is {value*100:.3f}%")

The prior probability of class = good is 4.197%
The prior probability of class = unacc is 69.320%
The prior probability of class = acc is 22.648%
The prior probability of class = vgood is 3.835%


In [121]:
find_cond_probs(df_train,label_column='class')

defaultdict(<function __main__.find_cond_probs.<locals>.<lambda>()>,
            {'good': defaultdict(<function __main__.find_cond_probs.<locals>.<lambda>.<locals>.<lambda>()>,
                         {'buying': defaultdict(float,
                                      {'low': 0.6379310344827587,
                                       'med': 0.3620689655172414}),
                          'maint': defaultdict(float,
                                      {'low': 0.7068965517241379,
                                       'med': 0.29310344827586204}),
                          'doors': defaultdict(float,
                                      {'5more': 0.29310344827586204,
                                       '4': 0.25862068965517243,
                                       '2': 0.2413793103448276,
                                       '3': 0.20689655172413793}),
                          'persons': defaultdict(float,
                                      {'more': 0.5, '4': 0.5}),
      

In [None]:
def describe_outcome(df, conditions):
    
    """
    Counts the number of occurrences in a DataFrame that match given conditions.

    Parameters:
    - df: DataFrame to count occurrences from.
    - conditions: Keyword arguments specifying conditions (e.g., column_name='value').

    Returns:
    - int: The count of occurrences matching the conditions.
    """
    # Start with the entire DataFrame and filter down based on conditions
    filtered_df = df
    for column, value in conditions.items():
        filtered_df = filtered_df[filtered_df[column] == value]
    return filtered_df.describe()

describe_outcome(df, {'class':'vgood'})
describe_outcome(df, {'buying':'med','maint':'low','doors':'5more','persons':'more','lug_boot':'med','safety':'high'})