In [None]:
import datasets
import pandas as pd

# Load the 'train' and 'test' splits of the 'dbpedia_14' dataset and convert
# each split to a pandas DataFrame for the cells below.
train_ds, test_ds = datasets.load_dataset(
    path='dbpedia_14',
    split=['train', 'test'],
)
df_train: pd.DataFrame = train_ds.to_pandas()
df_test: pd.DataFrame = test_ds.to_pandas()

In [None]:
# Keep a random quarter of each split to make the rest of the notebook cheaper
# to run. The seed is fixed so that re-running the notebook draws the same
# rows and therefore yields comparable results between runs.
SAMPLE_FRACTION = 0.25
RANDOM_SEED = 0

df_train = df_train.sample(frac=SAMPLE_FRACTION, random_state=RANDOM_SEED).reset_index(drop=True)
df_test = df_test.sample(frac=SAMPLE_FRACTION, random_state=RANDOM_SEED).reset_index(drop=True)

In [None]:
# Your code goes here
import re
from typing import Union, List

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

class NaiveBayesModel:
    """Multinomial Naive Bayes text classifier over bag-of-words counts.

    Typical use: build the model with ``from_preprocessed_data``, call
    ``estimate_mu`` and ``estimate_phi`` to fit the parameters, then call
    ``predict_label`` on raw text.
    """

    phi: np.ndarray  # (N, K): phi[n, k] = P(word n | class k)

    mu: np.ndarray   # (K,): mu[k] = P(class k)

    vocab: dict      # vocabulary map from word to row index in phi

    n_class: int     # number of classes (K)

    DF_Count_T: pd.DataFrame  # word counts: one row per word, one column per document

    labels_list: List[int]    # class label of each document, in column order of DF_Count_T

    # Token rule matching scikit-learn's default CountVectorizer analyzer
    # (runs of two or more word characters), so that prediction tokenises
    # text the same way the vocabulary was built.
    _TOKEN_PATTERN = re.compile(r"(?u)\b\w\w+\b")

    def __init__(self, vocabulary: dict, num_classes: int, DF_Count_T: pd.DataFrame, labels_list: List[int]):
        """
        Parameters
        ----------
        vocabulary: {str: int} <- {word: index}
        num_classes: Number of classes
        DF_Count_T: word-by-document count table, indexed by word
        labels_list: integer class label of every document (column) in DF_Count_T
        """
        self.vocab = vocabulary
        self.n_class = num_classes
        self.mu = np.zeros(shape=(num_classes,))
        self.phi = np.zeros(shape=(len(vocabulary), num_classes))
        self.labels_list = labels_list
        self.DF_Count_T = DF_Count_T

    @classmethod
    def from_preprocessed_data(cls, docs_list: List[str], labels_list: List[int]):
        """
        Build an (unfitted) model from training documents and their labels.

        Parameters
        ----------
        docs_list: training documents, one string per document
        labels_list: integer class label of each document

        Returns
        -------
        NaiveBayesModel whose ``mu`` and ``phi`` are still all zeros
        """
        num_classes = len(set(labels_list))
        vectorizer = CountVectorizer(input='content', stop_words="english", max_features=800)
        doc_term = vectorizer.fit_transform(docs_list)
        # ``vocabulary_`` already maps each kept term to its column index, which
        # avoids ``get_feature_names()`` (removed in scikit-learn 1.2).
        vocab = {word: int(col) for word, col in vectorizer.vocabulary_.items()}
        words = sorted(vocab, key=vocab.get)
        # Transpose while still sparse so only one dense array is materialised.
        df_count_t = pd.DataFrame(doc_term.T.toarray(), index=words)
        return cls(vocab, num_classes, df_count_t, labels_list)

    def estimate_mu(self, alpha: float = 1.):
        """
        Estimate P(Y), the prior over labels

        Parameters
        ----------
        alpha: smoothing parameter

        Returns
        -------
        np.ndarray of shape (K,) holding the smoothed class priors
        """
        labels = np.asarray(self.labels_list, dtype=np.intp)
        # Count from scratch on every call: accumulating into self.mu would mix
        # raw counts with the probabilities left there by a previous call.
        counts = np.zeros(self.n_class, dtype=float)
        np.add.at(counts, labels, 1)
        self.mu = (counts + alpha) / (counts.sum() + self.n_class * alpha)
        return self.mu

    def estimate_phi(self, alpha: float = 1.):
        """
        Estimate phi, the N x K matrix
        describing the probability of
        the nth word in the kth class.

        Parameters
        ----------
        alpha: smoothing parameter

        Returns
        -------
        np.ndarray of shape (N, K); every column sums to 1 when alpha > 0

        Raises
        ------
        ValueError: if the number of labels differs from the number of documents
        """
        labels = np.asarray(self.labels_list)
        counts = self.DF_Count_T.to_numpy()
        if counts.shape[1] != labels.shape[0]:
            raise ValueError(
                f"got {labels.shape[0]} labels for {counts.shape[1]} documents"
            )

        # Row of phi that each row of the count table belongs to, resolved by
        # word label rather than by position.
        rows = np.asarray([self.vocab[word] for word in self.DF_Count_T.index], dtype=np.intp)

        totals = np.zeros((len(self.vocab), self.n_class), dtype=float)
        for k in range(self.n_class):
            # Total occurrences of every word across the documents of class k.
            totals[rows, k] = counts[:, labels == k].sum(axis=1)

        # Additive smoothing, normalised per class (column).
        self.phi = (totals + alpha) / (totals.sum(axis=0) + totals.shape[0] * alpha)
        return self.phi

    def predict_label(self, text: str) -> int:
        """
        Compute label given some input text

        Parameters
        ----------
        text: raw input text

        Returns
        -------
        int: corresponding to the predicted label
        """
        # The vocabulary holds lower-cased, punctuation-free tokens, so the
        # input is normalised the same way; unknown tokens are ignored.
        tokens = self._TOKEN_PATTERN.findall(text.lower())
        rows = np.asarray([self.vocab[tok] for tok in tokens if tok in self.vocab], dtype=np.intp)
        # Sum log-probabilities instead of multiplying probabilities: the raw
        # product underflows to 0 for long texts and every class then ties.
        with np.errstate(divide="ignore"):
            scores = np.log(self.mu) + np.log(self.phi[rows]).sum(axis=0)
        return int(np.argmax(scores))

# Build the model from the training texts and labels, then fit the class
# priors (mu) and the per-class word probabilities (phi) with the default
# smoothing.
model = NaiveBayesModel.from_preprocessed_data(
    docs_list=df_train["content"],
    labels_list=df_train["label"],
)
x_mu = model.estimate_mu()
x_phi = model.estimate_phi()