In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from google.colab import drive

drive.mount('/content/drive')

from sklearn.preprocessing import LabelEncoder

Mounted at /content/drive


## Document information
* The bbc.docs file will give us the information about all the articles
* The bbc.terms file will get the list of terms
* The bbc.mtx file to obtain the term frequencies for each article
*Example would be row 812(2, 528, 5.0) which indicates term 2 "sale occures 5 times in article 528 (entertainment.018).
* If you check the terms file the second term is sale, and if you check the docs file the 528th article is entertainment.018.

# Finding the input X

* The input X will be a matrix where the rows represent each article and the columns will represent each term.
* Each i,j in the matrix will be the frequency/binary of the term within the article


In [2]:
def process_input_data():
  file_path = "/content/drive/My Drive/CSCC11/A2/bbc/bbc.mtx"
  df = pd.read_csv(file_path)

  with open(file_path) as f:
      lines = [line.rstrip() for line in f]

  lines_without_first_two_rows = lines[2:]

  split_values = [line.split() for line in lines_without_first_two_rows]

  split_values = [[int(term_id), int(article_id), float(frequency)] for term_id, article_id, frequency in split_values]

  # 9635 terms and 2225 articles
  unique_terms = set(term_id for term_id, _, _ in split_values)
  unique_articles = set(article_id for _, article_id, _ in split_values)

  article_term_matrix = np.zeros((len(unique_articles), len(unique_terms)))

  df_matrix = pd.DataFrame(article_term_matrix, index=sorted(unique_articles), columns=sorted(unique_terms))

  for term_id, article_id, frequency in split_values:
      df_matrix.at[article_id, term_id] = frequency

  X = np.array(df_matrix)

  return X

# Finding labels Y
* Y will be the corresponding label to each row in the matrix
* We encode the labels to numeric values
* Business: 0, entertainment: 1, politics: 2, sports: 3, tech: 4

In [3]:
def process_input_labels():
  file_path_labels = "/content/drive/My Drive/CSCC11/A2/bbc/bbc.docs"
  with open(file_path_labels) as f_labels:
      article_categories = [line.strip().split('.')[0] for line in f_labels]

  Y = np.array(article_categories)

  label_encoder = LabelEncoder()
  numeric_Y = label_encoder.fit_transform(Y)

  return numeric_Y

# Partition Data

In [4]:
def process_data(isNB):

  X = process_input_data()
  Y = process_input_labels()

  if(isNB):
    X = (X > 0).astype(int)

  X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.3, random_state=42)

  return (X_train, X_test, Y_train, Y_test)


# Part a)

# Calculate Prior
* The prior probabilities is the probability of the category.
* For example, the probability of a category being business will be: $\frac{\text{the number of business articles}}{\text{the number of articles}}$


# Calculate Conditionals
* We consider the binary values in the case of Naive Bayes.
* I.e if the article contains the term the value be 1 and 0 otherwirse.
* We want to check how often a term appears in a article category.
* For example: $\frac{\text{the number of business articles that contain the term "sale"}}{\text{the number of business articles}}$
* In the case that conditional probability is 0, we will generlize and add 1 to the numerator and 2 to the denominator.
## Code Explanation
* We create a dictionary where the 5 catergories are the keys, and the values will be an array of the conditional probabilities of each term.
* We do by using a boolean mask, True where the label Y is equal to the current category and False otherwise.
* I.e it essentially selects the rows in the binary term frequency matrix that correspond to the articles in the current category.
* We use this so the next line calculates the sum of binary values for each term across all articles in the current category. Allowing us to use the formula showed earlier in the example above

In [5]:
def calculate_probabilities(X_train, Y_train):

  #Priors

  unique_categories, category_counts = np.unique(Y_train, return_counts=True)
  prior_probs = category_counts / len(Y_train)

  # Priors probability check
  # for category, prior_prob in zip(unique_categories, prior_probs):
  #   print(f"Category: {category}, Prior Probability: {prior_prob:.4f}")

  #Frequencies

  term_probs_by_category = {c: np.zeros(X_train.shape[1]) for c in unique_categories}

  for i, category in enumerate(unique_categories):
      category_mask = (Y_train == category)
      term_probs_by_category[category] = (X_train[category_mask].sum(axis=0) + 1) / (category_counts[i] + 2)

  return prior_probs, term_probs_by_category


# Prediction

In [6]:
def predict_naive_bayes(X_test, prior_probs, term_probs_by_category):
    predictions = []

    for i in range(X_test.shape[0]):
        article_probs = []

        for idx, category in enumerate(term_probs_by_category):
            prior_prob = np.log(prior_probs[idx])

            # We take the conditional probability
            # If the term is present in the article (boolean mask 1) we take that conditional probability
            # We do the similar thing if the term is not present in article and the probability of that happening
            conditional_prob = np.sum(np.log(term_probs_by_category[category][X_test[i] == 1])) + \
                               np.sum(np.log(1 - term_probs_by_category[category][X_test[i] == 0]))

            article_prob = prior_prob + conditional_prob
            article_probs.append(article_prob)

        predicted_category = np.argmax(article_probs)
        predictions.append(predicted_category)

    return predictions

# Training Accuracy

In [7]:
def calculate_accuracy(actual_labels, predicted_labels):
    correct_predictions = np.sum(actual_labels == predicted_labels)
    total_predictions = len(actual_labels)
    accuracy = correct_predictions / total_predictions * 100
    return accuracy

# Trial and Training Accuracy

In [8]:
X_train, X_test, Y_train, Y_test = process_data(True)

prior_probs, term_probs_by_category = calculate_probabilities(X_train, Y_train)

actual_train_labels = Y_train
actual_test_labels = Y_test

predicted_train_labels = predict_naive_bayes(X_train, prior_probs, term_probs_by_category)
predicted_test_labels = predict_naive_bayes(X_test, prior_probs, term_probs_by_category)

train_accuracy = calculate_accuracy(actual_train_labels, predicted_train_labels)
print(f"Training Accuracy: {train_accuracy:.2f}%")

test_accuracy = calculate_accuracy(actual_test_labels, predicted_test_labels)
print(f"Testing Accuracy: {test_accuracy:.2f}%")


Training Accuracy: 98.84%
Testing Accuracy: 95.96%


# Part b) Gaussian Naive Bayes

# Finding the mean and variance for each
* For Gaussian Naive Bayes we want to find the mean and variance for each term in each category.
* We do this to help us find the Gaussian Distribution associated with each term for each category.
* To do this we create a dictionary for each category containing a matrix of articles by terms where each i,j contains the associated frequency.
* This allows us to then calculate the mean and variance more easily.
* Note: if the variance is 0, we make it to $e^{-9}$ as suggested.

In [9]:
def calculate_mean_variance_by_category(X_train, Y_train):
    unique_categories, _ = np.unique(Y_train, return_counts=True)

    term_stats_by_category = {c: {'mean': np.zeros(X_train.shape[1]), 'variance': np.zeros(X_train.shape[1])} for c in unique_categories}

    for category in unique_categories:
        category_mask = (Y_train == category)
        term_stats_by_category[category]['mean'] = np.mean(X_train[category_mask], axis=0)
        term_stats_by_category[category]['variance'] = np.var(X_train[category_mask], axis=0)

        term_stats_by_category[category]['variance'][term_stats_by_category[category]['variance'] == 0] = 1e-9

    return term_stats_by_category

# Gaussian Classifier
* Using the means and variances obtained from the previous function we can now classify new data.
* To classify data we will use the prior probabilities multipled the conditional probabilities (obtained from the gaussians).
* We will also want to take the log to handle underflow, so we will now be summing the logs.
* We caculate this for each category and in the end, the article with the highest value will be our classification for the article.

In [10]:
def predict_gaussian(X_test, prior_probs, term_stats_by_category):
    predictions = []

    for i in range(X_test.shape[0]):
        category_probs = []

        for category, stats in term_stats_by_category.items():
            prior_prob = np.log(prior_probs[category])

            log_likelihoods = -(np.log(2 * np.pi * stats['variance']) +
                                      ((X_test[i] - stats['mean'])**2) / (2 * stats['variance']))

            likelihood_sum = np.sum(log_likelihoods)
            category_prob = prior_prob + likelihood_sum
            category_probs.append(category_prob)

        predicted_category = list(term_stats_by_category.keys())[np.argmax(category_probs)]
        predictions.append(predicted_category)

    return predictions

# Trial and Training Accuracy

In [11]:
X_train, X_test, Y_train, Y_test = process_data(False)


prior_probs, term_probs_by_category = calculate_probabilities(X_train, Y_train)
term_stats_by_category = calculate_mean_variance_by_category(X_train, Y_train)

actual_train_labels = Y_train
actual_test_labels = Y_test

predicted_train_labels = predict_gaussian(X_train, prior_probs, term_stats_by_category)
predicted_test_labels = predict_gaussian(X_test, prior_probs, term_stats_by_category)

train_accuracy = calculate_accuracy(actual_train_labels, predicted_train_labels)
print(f"Training Accuracy: {train_accuracy:.2f}%")

test_accuracy = calculate_accuracy(actual_test_labels, predicted_test_labels)
print(f"Testing Accuracy: {test_accuracy:.2f}%")

Training Accuracy: 100.00%
Testing Accuracy: 92.07%
