In [1]:
import random
from collections import defaultdict
import pprint

In [2]:
def parse_data(file_name: str) -> list[list]:
    """Read a comma-separated data file into a shuffled list of rows.

    Args:
        file_name: Path of the text file to read. Each non-blank line is
            one observation whose values are separated by commas.

    Returns:
        A list of rows, each a list of the string values found on one line.
        The rows are shuffled in place with ``random.shuffle`` before being
        returned, so their order differs from the file order.
    """
    data = []
    # The context manager closes the handle even if reading fails part-way;
    # previously the file was opened and never closed.
    with open(file_name, "r") as file:
        for line in file:
            stripped = line.rstrip()
            # A blank (or whitespace-only) line would otherwise be turned
            # into the bogus single-value row [''].
            if not stripped:
                continue
            data.append(stripped.split(","))
    random.shuffle(data)
    return data

You can use the `create_folds` function below to create 10 folds for 5x2 cross validation.

In [3]:
def create_folds(xs: list, n: int) -> list[list[list]]:
    """Split ``xs`` into ``n`` contiguous folds of near-equal size.

    Every fold gets ``len(xs) // n`` elements and the first ``len(xs) % n``
    folds get one extra element, so fold sizes differ by at most one.

    Args:
        xs: The sequence to split; it is sliced, not copied element-wise.
        n: The number of folds to produce.

    Returns:
        A list of ``n`` slices of ``xs`` that together cover it in order.
    """
    base_size, extras = divmod(len(xs), n)
    folds = []
    start = 0
    for index in range(n):
        # The leading `extras` folds absorb the remainder, one element each.
        stop = start + base_size + (1 if index < extras else 0)
        folds.append(xs[start:stop])
        start = stop
    return folds

In [4]:
# Small in-memory fixture used by the assertion cells below: 10 observations
# of 23 single-character strings each. The functions in this file treat
# element 0 of a row as the label (here 'p' or 'e') and elements 1..22 as
# feature values.
test_data = [
    ['p', 'x', 's', 'n', 't', 'p', 'f', 'c', 'n', 'k', 'e', 'e', 's', 's', 'w', 'w', 'p', 'w', 'o', 'p', 'k', 's', 'u'],
    ['e', 'x', 's', 'y', 't', 'a', 'f', 'c', 'b', 'k', 'e', 'c', 's', 's', 'w', 'w', 'p', 'w', 'o', 'p', 'n', 'n', 'g'],
    ['e', 'b', 's', 'w', 't', 'l', 'f', 'c', 'b', 'n', 'e', 'c', 's', 's', 'w', 'w', 'p', 'w', 'o', 'p', 'n', 'n', 'm'],
    ['p', 'x', 'y', 'w', 't', 'p', 'f', 'c', 'n', 'n', 'e', 'e', 's', 's', 'w', 'w', 'p', 'w', 'o', 'p', 'k', 's', 'u'],
    ['e', 'x', 's', 'g', 'f', 'n', 'f', 'w', 'b', 'k', 't', 'e', 's', 's', 'w', 'w', 'p', 'w', 'o', 'e', 'n', 'a', 'g'],
    ['e', 'x', 'y', 'y', 't', 'a', 'f', 'c', 'b', 'n', 'e', 'c', 's', 's', 'w', 'w', 'p', 'w', 'o', 'p', 'k', 'n', 'g'],
    ['e', 'b', 's', 'w', 't', 'a', 'f', 'c', 'b', 'g', 'e', 'c', 's', 's', 'w', 'w', 'p', 'w', 'o', 'p', 'k', 'n', 'm'],
    ['e', 'b', 'y', 'w', 't', 'l', 'f', 'c', 'b', 'n', 'e', 'c', 's', 's', 'w', 'w', 'p', 'w', 'o', 'p', 'n', 's', 'm'],
    ['p', 'x', 'y', 'w', 't', 'p', 'f', 'c', 'n', 'p', 'e', 'e', 's', 's', 'w', 'w', 'p', 'w', 'o', 'p', 'k', 'v', 'g'],
    ['e', 'b', 's', 'y', 't', 'a', 'f', 'c', 'b', 'g', 'e', 'c', 's', 's', 'w', 'w', 'p', 'w', 'o', 'p', 'k', 's', 'm']
]

## initialize_dictionary_with_defaultdict

This function initializes a dictionary to hold feature counts using nested `defaultdict` structures for each feature index. It sets up each feature index to map to a dictionary, facilitating easy updates for feature value counts.

### Args:
* **dictionary** : A dictionary intended to store feature counts, where each feature index maps to a nested `defaultdict`.
* **num_features** : An integer representing the number of features in each observation.

### Returns:
* **dictionary** : The input dictionary, modified to include each feature index as a key, mapped to a nested `defaultdict` that will store feature value counts for each label.


In [5]:
def initialize_dictionary_with_defaultdict(dictionary, num_features):
    """Give every feature index 1..num_features a fresh ``defaultdict(dict)``.

    Args:
        dictionary: The mapping to populate; it is modified in place and any
            existing entry under one of the feature indices is replaced.
        num_features: Number of feature indices to create, starting at 1.

    Returns:
        The same ``dictionary`` object that was passed in.
    """
    # Each index gets its own defaultdict instance, so counts recorded for
    # one feature never leak into another.
    dictionary.update(
        (feature_index, defaultdict(dict))
        for feature_index in range(1, num_features + 1)
    )
    return dictionary

In [6]:
# Sanity checks for initialize_dictionary_with_defaultdict: each feature
# index 1..3 must map to a defaultdict, and asking for 5 features must
# create exactly 5 entries.
feature_dict = initialize_dictionary_with_defaultdict({}, 3)
assert all(isinstance(feature_dict[i], defaultdict) for i in range(1, 4))
assert len(initialize_dictionary_with_defaultdict({}, 5)) == 5

## count_feature_occurrences

This function counts the occurrences of each feature value given a label across all observations in the training data. It updates the feature counts dictionary with these values, which are used for conditional probability calculations in a Naive Bayes Classifier.

### Args:
* **training_data** : A list of lists where each inner list represents an observation. The first element in each row is the label, followed by feature values.
* **feature_counts** : A dictionary of dictionaries where each feature index maps to a dictionary of feature values and their counts for each label. This dictionary is updated in place.
* **num_features** : An integer representing the number of features in each observation.

### Returns:
* **feature_counts** : A dictionary of dictionaries updated with the counts of each feature value for each label, where each feature index maps to a dictionary of feature values and their respective label counts.


In [7]:
def count_feature_occurrences(training_data, feature_counts, num_features):
    """Add one count per (feature index, feature value, label) seen in the data.

    Args:
        training_data: Rows whose element 0 is the label and whose elements
            1..num_features are feature values.
        feature_counts: Nested mapping ``index -> value -> label -> count``.
            It is updated in place; looking up ``feature_counts[index][value]``
            must yield a dict (e.g. because the inner mapping is a
            ``defaultdict(dict)``).
        num_features: Number of feature columns to count in each row.

    Returns:
        The same ``feature_counts`` object, after updating.
    """
    feature_indices = range(1, num_features + 1)
    for observation in training_data:
        label = observation[0]
        for index in feature_indices:
            per_label = feature_counts[index][observation[index]]
            # Start from any existing count (including a smoothing seed).
            per_label[label] = per_label.get(label, 0) + 1
    return feature_counts

In [8]:
# Checks for count_feature_occurrences against the test_data fixture.
# Element 0 of each row is the label, hence the "- 1".
num_features = len(test_data[0]) - 1
feature_counts = defaultdict(lambda: defaultdict(dict))
result_counts = count_feature_occurrences(test_data, feature_counts, num_features)
# All three 'p' rows have 'x' at index 1; five 'e' rows have 's' at index 2.
assert result_counts[1]['x']['p'] == 3
assert result_counts[2]['s']['e']== 5
# 'z' never occurs at index 2, so no count exists for it.
assert result_counts[2].get('z', {}).get('p', 0) == 0

## get_unique_values

This function extracts the unique values for a specified feature index across all observations in the training data. It is useful for setting up data structures or applying smoothing for a Naive Bayes Classifier.

### Args:
* **training_data** : A list of lists where each inner list represents an observation.
* **feature_index** : An integer specifying the index of the feature for which unique values are needed.

### Returns:
* **unique_values** : A set containing all unique values for the specified feature index in the training data.


In [9]:
def get_unique_values(training_data, feature_index):
    """Collect the distinct values found at one column of the data.

    Args:
        training_data: Rows (indexable sequences) to scan.
        feature_index: Position within each row whose values are collected.

    Returns:
        A set of the values found at ``feature_index`` across all rows.
    """
    return {observation[feature_index] for observation in training_data}

In [10]:
# Checks for get_unique_values: duplicates at the requested index collapse
# into a single set member.
assert get_unique_values([['A', 1], ['B', 2], ['A', 3]], 1) == {1, 2, 3}
assert get_unique_values([['A', 1], ['B', 1], ['A', 2]], 1) == {1, 2}
assert get_unique_values([['A', 1], ['B', 1], ['A', 1]], 1) == {1}

## apply_smoothing

This function applies Laplace smoothing to the feature counts, ensuring that each feature value has an initial count for each label, even if it does not appear in the training data. This helps avoid zero probabilities in the Naive Bayes calculations.

### Args:
* **training_data** : A list of lists where each inner list represents an observation. The first element in each row is the label, followed by feature values.
* **feature_counts** : A dictionary of dictionaries where each feature index maps to a dictionary of feature values and their counts for each label.
* **unique_labels** : A set of unique labels present in the training data.
* **num_features** : An integer representing the number of features in each observation.

### Returns:
* This function does not return a value. It modifies `feature_counts` in place by adding a count of 1 for each feature value and label combination that does not already exist.


In [11]:
def apply_smoothing(training_data, feature_counts, unique_labels, num_features):
    """Seed a count of 1 for every missing (index, value, label) combination.

    For each feature index 1..num_features and each value that occurs at
    that index in ``training_data``, every label in ``unique_labels`` that
    has no count yet is given the count 1. Existing counts are left alone.

    Args:
        training_data: Rows whose elements 1..num_features are feature values.
        feature_counts: Nested mapping ``index -> value -> label -> count``,
            modified in place.
        unique_labels: The labels that must have a count for every value.
        num_features: Number of feature columns to process.

    Returns:
        None.
    """
    for feature_index in range(1, num_features + 1):
        for value in get_unique_values(training_data, feature_index):
            for label in unique_labels:
                # setdefault only writes when the label has no count yet.
                feature_counts[feature_index][value].setdefault(label, 1)

In [12]:
# Checks for apply_smoothing on an initially empty counts structure: every
# (feature value, label) combination seen in test_data must be seeded with 1.
num_features = len(test_data[0]) - 1
feature_counts = defaultdict(lambda: defaultdict(dict))
unique_labels = {'p', 'e'}
apply_smoothing(test_data, feature_counts, unique_labels, num_features)
assert feature_counts[1]['x']['p'] == 1
assert feature_counts[2]['s']['e'] == 1 and feature_counts[2]['s']['p'] == 1
# Index the count directly: the previous `.get('e', 1)` supplied 1 as the
# fallback, so the assertion passed even if smoothing had never run.
assert feature_counts[3]['y']['e'] == 1

## get_unique_labels

This function extracts the unique labels from the training data, which is useful for initializing data structures or applying smoothing in a Naive Bayes Classifier.

### Args:
* **training_data** : A list of lists where each inner list represents an observation. The first element in each row is the label.

### Returns:
* **unique_labels** : A set containing all unique labels in the training data.


In [13]:
def get_unique_labels(training_data):
    """Collect the distinct labels, i.e. the first element of every row.

    Args:
        training_data: Rows whose element 0 is the label.

    Returns:
        A set containing each label that occurs in ``training_data``.
    """
    return set(observation[0] for observation in training_data)

In [14]:
# Checks for get_unique_labels: only element 0 of each row contributes, and
# repeated labels are reported once.
assert get_unique_labels([['A', 1], ['B', 2], ['A', 3]]) == {'A', 'B'} 
assert get_unique_labels([['A', 1], ['A', 2]]) == {'A'}
assert len(get_unique_labels([['A'], ['B'], ['B']])) == 2

## initialize_feature_counts

This function initializes the feature counts dictionary for each feature in the training data. It optionally applies Laplace smoothing to handle unseen feature values for each label.

### Args:
* **training_data** : A list of lists where each inner list represents an observation. The first element in each row is the label, followed by feature values.
* **num_features** : An integer representing the number of features in each observation.
* **smoothing** : A boolean flag indicating whether to apply Laplace smoothing to the feature counts.

### Returns:
* **feature_counts** : A dictionary of dictionaries where each feature index maps to a dictionary of feature values, which are initialized for each label.


In [15]:
def initialize_feature_counts(training_data, num_features, smoothing):
    """Build the empty (or smoothing-seeded) feature-count structure.

    Args:
        training_data: Rows whose element 0 is the label, followed by
            feature values.
        num_features: Number of feature indices (1..num_features) to create.
        smoothing: When truthy, every (value, label) combination found in
            ``training_data`` is seeded with a count of 1.

    Returns:
        A ``defaultdict`` mapping each feature index to a
        ``defaultdict(dict)`` of ``value -> label -> count``.
    """
    feature_counts = initialize_dictionary_with_defaultdict(
        defaultdict(dict), num_features
    )
    if not smoothing:
        return feature_counts
    apply_smoothing(
        training_data,
        feature_counts,
        get_unique_labels(training_data),
        num_features,
    )
    return feature_counts

In [16]:
# Checks for initialize_feature_counts. `len(test_data[0]) - 1` is the number
# of feature columns, since element 0 of each row is the label.
assert isinstance(initialize_feature_counts(test_data, len(test_data[0]) - 1, smoothing=True), defaultdict)
feature_counts = initialize_feature_counts(test_data, len(test_data[0]) - 1, smoothing=True)
# One entry per feature index, each of them a dict (a defaultdict is a dict).
assert len(feature_counts) == len(test_data[0]) - 1
assert all(isinstance(feature_counts[i], dict) for i in range(1, len(test_data[0])))

## count_label_occurrences

This function counts the occurrences of each label in the training data, helping to determine the prior probability of each label for a Naive Bayes Classifier.

### Args:
* **training_data** : A list of lists where each inner list represents an observation. The first element in each row is the label.

### Returns:
* **label_counts** : A dictionary where each key is a label, and the corresponding value is the count of that label in the training data.


In [17]:
def count_label_occurrences(training_data):
    """Tally how many rows carry each label.

    Args:
        training_data: Rows whose element 0 is the label.

    Returns:
        A ``defaultdict(int)`` mapping each label to its number of rows.
    """
    label_counts = defaultdict(int)
    for label in (observation[0] for observation in training_data):
        label_counts[label] = label_counts[label] + 1
    return label_counts

In [18]:
# Checks for count_label_occurrences: test_data holds three 'p' rows, only
# the labels 'p' and 'e' occur, and 'x' (a feature value) is never a label.
assert count_label_occurrences(test_data)['p'] == 3
assert set(count_label_occurrences(test_data).keys()) == {'p', 'e'}
assert count_label_occurrences(test_data).get('x', 0) == 0

## count_occurrences

This function counts the occurrences of labels and feature values within the training data, which are essential for calculating probabilities in the Naive Bayes Classifier. It initializes feature counts and counts occurrences for each feature value given a label.

### Args:
* **training_data** : A list of lists where each inner list represents an observation. The first element in each row is the label, followed by feature values.
* **num_features** : An integer representing the number of features in each observation.
* **smoothing** : A boolean flag indicating whether to apply Laplace smoothing when initializing feature counts.

### Returns:
* **label_counts** : A dictionary containing the count of each label in the training data.
* **feature_counts** : A dictionary of dictionaries where each feature index maps to a dictionary of feature values and their counts for each label.


In [19]:
def count_occurrences(training_data, num_features, smoothing):
    """Count label frequencies and per-label feature-value frequencies.

    Args:
        training_data: Rows whose element 0 is the label, followed by
            feature values.
        num_features: Number of feature columns in each row.
        smoothing: Passed to ``initialize_feature_counts``; when truthy the
            feature counts start from 1 instead of 0.

    Returns:
        A ``(label_counts, feature_counts)`` tuple.
    """
    label_counts = count_label_occurrences(training_data)
    # Seed the structure first, then add the observed counts on top of it.
    seeded_counts = initialize_feature_counts(training_data, num_features, smoothing)
    return label_counts, count_feature_occurrences(
        training_data, seeded_counts, num_features
    )

## train

This function trains a Naive Bayes Classifier (NBC) on a given dataset, calculating probabilities for each label and feature value. It supports optional Laplace smoothing to handle unseen feature values.

### Args:
* **training_data** : A list of lists where each inner list represents an observation. The first element in each row is the label, and the subsequent elements are feature values.
* **smoothing** : A boolean flag indicating whether to apply Laplace smoothing. Default is `True`.

### Returns:
* **classifier** : A dictionary representing the trained Naive Bayes Classifier. For each label, it contains:
  - **'probability'** : The prior probability of the label.
  - **'features'** : A dictionary where each feature index maps to a dictionary of feature values and their respective conditional probabilities given the label.


In [20]:
def train(training_data, smoothing=True):
    """Train a Naive Bayes classifier from labeled observations.

    Args:
        training_data: Non-empty list of rows. Element 0 of each row is the
            label; the remaining elements are feature values. The number of
            features is taken from the first row.
        smoothing: When truthy, counts are add-one smoothed: each count
            starts at 1 and the denominator is the label count plus the
            number of distinct values of that feature.

    Returns:
        A dict mapping each label to
        ``{'probability': prior, 'features': {index: {value: probability}}}``.

    Raises:
        IndexError: If ``training_data`` is empty.
    """
    num_features = len(training_data[0]) - 1
    label_counts, feature_counts = count_occurrences(training_data, num_features, smoothing)
    total_samples = sum(label_counts.values())
    # Count used when a (value, label) combination was never recorded.
    missing_count = 1 if smoothing else 0
    classifier = {}
    for label, label_total in label_counts.items():
        features = {}
        for feature_index in range(1, num_features + 1):
            feature_values = feature_counts[feature_index]
            # The denominator depends only on the label and the feature, so
            # it is computed once here instead of once per feature value.
            denominator = (label_total + len(feature_values)) if smoothing else label_total
            features[feature_index] = {
                feature_value: per_label.get(label, missing_count) / denominator
                for feature_value, per_label in feature_values.items()
            }
        classifier[label] = {
            'probability': label_total / total_samples,
            'features': features,
        }
    return classifier

## remove_rows_with_missing_values

This function removes rows containing missing values (represented by `"?"`) from the dataset.

### Args:
* **data** : A list of lists, where each inner list represents a row of data. Each row may contain one or more values, with `"?"` representing a missing value.

### Returns:
* **cleaned_data** : A list of lists containing only rows without any missing values.


In [21]:
def remove_rows_with_missing_values(data):
    """Return only the rows that contain no ``"?"`` element.

    Args:
        data: Rows to filter. A row is dropped when one of its elements
            equals ``"?"``.

    Returns:
        A new list holding the retained rows in their original order.
    """
    return [row for row in data if "?" not in row]

In [22]:
# Load the data file (rows come back shuffled) and drop every row that
# contains a "?" value.
data = parse_data("agaricus-lepiota-1.data")
cleaned_data = remove_rows_with_missing_values(data)

In [23]:
# Train on all cleaned rows; smoothing keeps its default of True.
bayes_classifier = train(cleaned_data)

In [24]:
# Pretty-print the nested classifier dictionary for inspection.
pprint.pprint(bayes_classifier)

{'e': {'features': {1: {'b': 0.0746994848311391,
                        'c': 0.00028620492272467084,
                        'f': 0.41814539210074414,
                        'k': 0.006010303377218088,
                        's': 0.009444762449914138,
                        'x': 0.49141385231825985},
                    2: {'f': 0.40578465063001146,
                        'g': 0.000286368843069874,
                        's': 0.20418098510882016,
                        'y': 0.3897479954180985},
                    3: {'b': 0.0002860411899313501,
                        'c': 0.009439359267734555,
                        'e': 0.16504576659038903,
                        'g': 0.2542906178489702,
                        'n': 0.2931922196796339,
                        'p': 0.002574370709382151,
                        'w': 0.1604691075514874,
                        'y': 0.1147025171624714},
                    4: {'f': 0.266189111747851, 't': 0.733810888252149},
                    

## classify

This function classifies a set of observations using a pre-trained Naive Bayes Classifier (NBC). For each observation, it calculates the probability score for each possible label based on the features and selects the label with the highest score. If the labeled flag is set to `True`, it includes the actual label in the results for comparison.

### Args:
* **nbc** : A dictionary representing the trained Naive Bayes Classifier. Each key is a label, and each value is a dictionary containing:
  - **'probability'** : The prior probability of the label.
  - **'features'** : A dictionary where each feature index maps to a dictionary of feature values and their respective probabilities.
* **observations** : A list of lists, where each inner list represents an observation. The first element of each observation is the actual label (if labeled).
* **labeled** : A boolean flag indicating whether the actual label is included in the results. Default is `True`.

### Returns:
* **results** : A list of predictions. If labeled is `True`, each element is a tuple containing the predicted label and the actual label; otherwise, each element is the predicted label alone.


In [25]:
def classify(nbc, observations, labeled=True):
    """Predict a label for each observation with a trained classifier.

    For every label the score is the label's prior multiplied by the stored
    probability of each feature value; a value with no stored probability
    contributes a factor of 1.0. The label with the highest score wins, and
    on a tie the label that comes first in ``nbc`` is chosen.

    Args:
        nbc: Mapping ``label -> {'probability': prior, 'features':
            {index: {value: probability}}}``.
        observations: Rows whose elements from index 1 onward are feature
            values; element 0 is reported as the actual label when
            ``labeled`` is truthy.
        labeled: Whether to pair each prediction with ``observation[0]``.

    Returns:
        A list of ``(predicted, actual)`` tuples when ``labeled`` is truthy,
        otherwise a list of predicted labels.
    """
    results = []
    for observation in observations:
        label_scores = {}
        for label, model in nbc.items():
            score = model['probability']
            for index, value in enumerate(observation[1:], start=1):
                score *= model['features'][index].get(value, 1.0)
            label_scores[label] = score
        best_label = max(label_scores, key=label_scores.get)
        results.append((best_label, observation[0]) if labeled else best_label)
    return results

## evaluate

This function calculates the error rate of a model's predictions by comparing the predicted labels to the actual labels in the labeled dataset.

### Args:
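* **predicted_labels** : A list with one entry per observation, where each entry is a sequence whose first element is the predicted label (for example, the `(predicted, actual)` tuples returned by `classify` with `labeled=True`).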
* **labeled_data** : A list of lists representing the labeled data, where the first element of each inner list is the actual label.

### Returns:
* **error_rate** : A float representing the proportion of incorrectly predicted labels, calculated as the number of errors divided by the total number of observations.


In [26]:
def evaluate(predicted_labels, labeled_data):
    """Return the fraction of observations whose prediction is wrong.

    Args:
        predicted_labels: One prediction per observation. Each entry is
            either a tuple/list whose first element is the predicted label
            (the ``labeled=True`` output of ``classify``) or the bare
            predicted label (the ``labeled=False`` output).
        labeled_data: Observations whose element 0 is the actual label.

    Returns:
        The number of mismatches divided by ``len(labeled_data)``.

    Raises:
        ZeroDivisionError: If ``labeled_data`` is empty.
        IndexError: If there are fewer predictions than observations.
    """
    n = len(labeled_data)
    errors = 0
    for i in range(n):
        prediction = predicted_labels[i]
        # Only unwrap real containers. Applying [0] to a bare string label
        # such as "edible" would compare just its first character and count
        # every multi-character label as an error.
        if isinstance(prediction, (tuple, list)):
            prediction = prediction[0]
        if prediction != labeled_data[i][0]:
            errors += 1
    return errors / n

In [27]:
# Classify the same rows the classifier was trained on, so this is the
# error rate on the training data, not a held-out estimate.
results = classify(bayes_classifier,cleaned_data,labeled=True)
# evaluate takes (predicted_labels, labeled_data). The arguments used to be
# passed in the opposite order, which only produced the right number because
# the mismatch comparison is symmetric.
error_rate = evaluate(results, cleaned_data)

In [28]:
# Show the error rate computed in the previous cell.
print(error_rate)

0.02303330970942594


## train_and_evaluate

This function trains a model using the provided training function, then classifies the test data using the provided classification function, and finally evaluates the model's performance using the provided evaluation function. It returns the error rate on the test data.

### Args:
* **train_fn** : A function used to train the model, taking the training data as input.
* **classify_fn** : A function used to classify observations, taking the model and the test data as inputs.
* **evaluate_fn** : A function used to evaluate the model's performance, taking the predicted labels and labeled test data as inputs.
* **train_fold** : A list of lists representing the training data.
* **test_fold** : A list of lists representing the test data. The first element of each observation is the actual label.

### Returns: 
* **test_error** : The error rate of the model on the test data.


In [29]:
def train_and_evaluate(train_fn, classify_fn, evaluate_fn, train_fold, test_fold):
    """Train on one fold, classify another, and return the evaluation result.

    Args:
        train_fn: Called as ``train_fn(train_fold)``; returns the model.
        classify_fn: Called as ``classify_fn(model, test_fold, labeled=True)``;
            returns the predictions.
        evaluate_fn: Called as ``evaluate_fn(predictions, test_fold)``;
            returns the error measure.
        train_fold: Data handed to ``train_fn``.
        test_fold: Data handed to ``classify_fn`` and ``evaluate_fn``.

    Returns:
        Whatever ``evaluate_fn`` returns for the test fold.
    """
    predictions = classify_fn(train_fn(train_fold), test_fold, labeled=True)
    return evaluate_fn(predictions, test_fold)

## calculate_and_print_mean

This function calculates the mean test error rate across multiple folds and prints the result with four decimal places of precision.

### Args:
* **total_test_error_rate** : The sum of test error rates from all folds.
* **num_folds** : The number of folds used in the evaluation.

### Returns:
* **None** : This function does not return any value. It prints the mean test error rate to the console.


In [30]:
def calculate_and_print_mean(total_test_error_rate, num_folds):
    """Print the mean error rate with four decimal places.

    Args:
        total_test_error_rate: Sum of the individual error rates.
        num_folds: Number of error rates that were summed.

    Returns:
        None; the result is written to standard output as ``Mean = x.xxxx``.
    """
    print(f"Mean = {total_test_error_rate / num_folds:.4f}")

## cross_validate

This function performs cross-validation on the provided dataset by splitting the data into 10 folds and grouping them into 5 consecutive pairs. For each pair it trains on one fold and tests on the other, then swaps the two roles, printing the error rate of each of the 10 runs. It also calculates and prints the mean test error rate across all 10 runs.

### Args:
* **data** : List of lists, where each inner list represents an observation in the dataset. The first element of each observation is the label.
* **train_fn** : A function used to train the model, taking the training data as input.
* **classify_fn** : A function used to classify observations, taking the model and the test data as inputs.
* **evaluate_fn** : A function used to evaluate the model's performance, taking the predicted labels and labeled test data as inputs.

### Returns:
* **None** : This function does not return any value. It prints the error rates for each pair of folds and the mean test error rate across all folds.


In [31]:
def cross_validate(data, train_fn, classify_fn, evaluate_fn):
    """Run paired-fold cross validation and print each error rate and the mean.

    The data is split into 10 folds that are taken in consecutive pairs
    (1-2, 3-4, ...). Within a pair, a model is trained on the first fold and
    tested on the second, then trained on the second and tested on the first.

    Args:
        data: Rows whose element 0 is the label.
        train_fn: Training function forwarded to ``train_and_evaluate``.
        classify_fn: Classification function forwarded likewise.
        evaluate_fn: Evaluation function forwarded likewise.

    Returns:
        None; the 10 error rates and their mean are printed.
    """
    folds = create_folds(data, 10)
    total_test_error_rate = 0
    print("Train   Test")
    for pair_index, (first_fold, second_fold) in enumerate(zip(folds[0::2], folds[1::2])):
        # 1-based fold numbers, used only for the printed report.
        first_number = 2 * pair_index + 1
        second_number = first_number + 1
        forward_error = train_and_evaluate(train_fn, classify_fn, evaluate_fn, first_fold, second_fold)
        print(f"Fold {first_number} -> Fold {second_number} error rate: {forward_error:.4f}")
        backward_error = train_and_evaluate(train_fn, classify_fn, evaluate_fn, second_fold, first_fold)
        print(f"Fold {second_number} -> Fold {first_number} error rate: {backward_error:.4f}")
        total_test_error_rate += forward_error + backward_error
    calculate_and_print_mean(total_test_error_rate, 10)

In [32]:
# Re-read the data file (parse_data shuffles the rows again), drop rows that
# contain "?", and run the paired-fold cross validation on what remains.
data = parse_data("agaricus-lepiota-1.data")
training_data = remove_rows_with_missing_values(data)
cross_validate(training_data, train, classify, evaluate)

Train   Test
Fold 1 -> Fold 2 error rate: 0.0885
Fold 2 -> Fold 1 error rate: 0.0531
Fold 3 -> Fold 4 error rate: 0.0743
Fold 4 -> Fold 3 error rate: 0.0496
Fold 5 -> Fold 6 error rate: 0.0532
Fold 6 -> Fold 5 error rate: 0.0355
Fold 7 -> Fold 8 error rate: 0.0691
Fold 8 -> Fold 7 error rate: 0.0674
Fold 9 -> Fold 10 error rate: 0.0479
Fold 10 -> Fold 9 error rate: 0.0674
Mean = 0.0606
