# 03 Naive Bayes and Sentiment Classification

## 朴素贝叶斯

### 文本分类

文本分类 (text classification) 可以用于判断垃圾邮件、判断一个文本属于哪个领域（食物 / 电影）等。

最简单的做法是使用机器学习中的朴素贝叶斯 (naive Bayes, NB) 方法进行文本分类。

In [None]:
from typing import List, Union, Optional, Tuple

from tqdm import tqdm
import numpy as np
import pandas as pd

class NaiveBayes:
    """
    Multinomial naive Bayes classifier over whitespace-separated tokens.

    After construction the instance exposes:

    - ``label_name2ind`` / ``label_ind2name``: mapping between label names
      and class indices (indices follow the sorted order of the label names).
    - ``num_class``: number of distinct labels.
    - ``total_words``: per-class count of in-vocabulary tokens.
    - ``log_prior``: array of log P(c).
    - ``log_prob``: dict mapping each vocabulary word to an array of
      smoothed log P(word|c), one entry per class.
    """

    def __init__(self,
            data: List[str],
            labels: List[Union[str, int]],
            vocab: Optional[dict] = None,
            smoothk: float = 1.,
            verbose: bool = True
        ) -> None:
        """
        Naive Bayes classifier for sentiment analysis.

        Parameters
        --------
        data: List[str]
            The dataset. Each datum is tokenized with ``str.split()``.
        labels: List[Union[str, int]]
            The class name of each datum.
        vocab: Optional[dict]
            Prior vocabulary list. If None, use all the vocabulary in the training data.
            Words outside the vocabulary are ignored when counting.
        smoothk: float
            Bayes smoothing constant. Defaults to 1 (Laplacian smoothing).
        verbose: bool
            If True (default), display a tqdm bar while counting.

        Raises
        --------
        ValueError
            If ``data`` and ``labels`` do not have the same length.
        """
        if len(data) != len(labels):
            # zip() would silently drop the unmatched tail and skew the counts
            raise ValueError(
                'data and labels must have the same length, got %d and %d'
                % (len(data), len(labels))
            )

        # map string-like labels to integer indices
        label_names = sorted(set(labels))
        self.label_name2ind = {name: ind for ind, name in enumerate(label_names)}
        self.label_ind2name = dict(enumerate(label_names))
        self.num_class = num_class = len(label_names)

        if vocab is None:
            # construct the vocabulary list from the training set if not given
            vocab = set()
            for line in data:
                vocab.update(line.split())

        counts = {word: [0] * num_class for word in vocab}
        doc_counts = [0] * num_class
        total_words = [0] * num_class

        # count frequency
        name2ind = self.label_name2ind
        pairs = zip(data, labels)
        if verbose:
            pairs = tqdm(pairs, total = len(data))
        for line, label in pairs:
            ind = name2ind[label]
            doc_counts[ind] += 1
            for word in line.split():
                box = counts.get(word)
                if box is not None:
                    # only in-vocabulary tokens are counted
                    box[ind] += 1
                    total_words[ind] += 1

        self.total_words = total_words

        log = np.log
        doc_counts = np.array(doc_counts)
        self.log_prior = log(doc_counts) - log(doc_counts.sum())

        # compute log probability
        # NOTE(review): the normalizer uses len(vocab) + 1 smoothing slots rather
        # than len(vocab); presumably the extra slot is reserved for unknown
        # words -- confirm this is intended.
        log_normalizer = log(np.array(total_words) + (len(vocab) + 1) * smoothk)
        self.log_prob = {
            word: log(np.array(box) + smoothk) - log_normalizer
            for word, box in counts.items()
        }

    @staticmethod
    def _ratio(num, den) -> float:
        """Return num / den, or 0.0 when the denominator is zero."""
        return num / den if den else 0.0

    def __getitem__(self,
            x: Union[str, Tuple[str, Union[str, int]]]
        ) -> Union[np.ndarray, float]:
        """
        Get the smoothed log probability log(P(word|c)) for all c or some c if specified.

        Parameters
        --------
        x: Union[str, Tuple[str, Union[str, int]]]
            Either a word (returns the array over all classes) or a pair
            ``(word, c)``. A string ``c`` is looked up as a label name; any
            other ``c`` is used directly as the class index.

        Raises
        --------
        KeyError
            If the word is not in the vocabulary or the label name is unknown.
        TypeError
            If ``x`` is neither a string nor a pair.
        """
        if isinstance(x, str):
            return self.log_prob[x]
        try:
            word, cls = x
        except (TypeError, ValueError):
            raise TypeError(
                'key must be a word or a (word, class) pair, got %r' % (x,)
            ) from None
        if isinstance(cls, str):
            cls = self.label_name2ind[cls]
        return self.log_prob[word][cls]

    def predict(self,
            data: Union[List[str], str],
            need_lower: bool = True,
            verbose: bool = False
        ) -> Union[List[str], str]:
        """
        Make prediction (classification) on data.

        Parameters
        --------
        data: Union[List[str], str]
            The data to make prediction on.
        need_lower: bool
            Whether convert the text to lower case.
        verbose: bool
            If True, display a tqdm bar.

        Returns
        --------
        The predicted label name for a single string input, otherwise a list
        with one predicted label name per datum.
        """
        only_one = isinstance(data, str)
        if only_one:
            data = [data]

        lines = tqdm(data) if verbose else data
        log_prob = self.log_prob
        ind2name = self.label_ind2name

        result = []
        for line in lines:
            if need_lower:
                line = line.lower()

            pred = self.log_prior.copy()
            for word in line.split():
                box = log_prob.get(word)
                # unknown words contribute to the prob of each class equally
                # so we can just ignore them
                if box is not None:
                    pred += box

            result.append(ind2name[int(np.argmax(pred))])

        return result[0] if only_one else result

    def test(self,
            data: List[str],
            true_labels: List[str],
            plot_in_notebook: bool = True,
            **kwargs
        ) -> dict:
        """
        Make test on data given ground truth labels. Return the confusion matrix.

        Parameters
        -------
        data: List[str]
            The data to make prediction on.
        true_labels: List[str]
            Ground truth labels for the data.
        plot_in_notebook: bool
            Whether plot the testing result directly in notebook.
            Only takes effect in notebook environment.
        kwargs:
            Other keyword argument passed directly to function 'predict'.

        Returns
        -------
        dict with keys 'acc' (overall accuracy), 'prediction' (predicted
        labels), 'confusion_matrix' (DataFrame, rows are true labels and
        columns are predictions, with a 'Total' margin) and 'metrics'
        (DataFrame of macro / micro Accuracy, Precision, Recall and F1).
        Any ratio whose denominator is zero is reported as 0.0.
        """
        pred_y = self.predict(data, **kwargs)
        p = self.num_class
        name2ind = self.label_name2ind
        ratio = self._ratio

        # rows: truth, columns: prediction; last row / column hold the margins
        cross_tab = np.zeros((p+1, p+1), dtype = 'int32')
        for pred, truth in zip(pred_y, true_labels):
            cross_tab[name2ind[truth], name2ind[pred]] += 1
        cross_tab[-1,:] = cross_tab.sum(axis = 0)
        cross_tab[:,-1] = cross_tab.sum(axis = 1)
        total = int(cross_tab[-1,-1])

        # one-vs-rest contingency counts (tp, fp, fn, tn) for each class,
        # as Python ints so the sums below cannot overflow int32
        per_class = []
        for i in range(p):
            tp = int(cross_tab[i,i])
            fp = int(cross_tab[-1,i]) - tp
            fn = int(cross_tab[i,-1]) - tp
            tn = total - tp - fp - fn
            per_class.append((tp, fp, fn, tn))
        pooled = [sum(cell[j] for cell in per_class) for j in range(4)]

        metric_funcs = {
            'Accuracy': lambda tp, fp, fn, tn: ratio(tp + tn, tp + fp + fn + tn),
            'Precision': lambda tp, fp, fn, tn: ratio(tp, tp + fp),
            'Recall': lambda tp, fp, fn, tn: ratio(tp, tp + fn),
            'F1': lambda tp, fp, fn, tn: ratio(2 * tp, 2 * tp + fp + fn)
        }

        metrics = {'macro': {}, 'micro': {}}
        for key, func in metric_funcs.items():
            # macro: average the per-class values; micro: pool the counts first
            metrics['macro'][key] = ratio(sum(func(*cell) for cell in per_class), p)
            metrics['micro'][key] = func(*pooled)

        label_names = list(name2ind.keys()) + ['Total']
        confusion_matrix = pd.DataFrame(cross_tab, columns = label_names, index = label_names)
        metrics = pd.DataFrame(metrics).T
        acc = ratio(sum(cell[0] for cell in per_class), total)

        if plot_in_notebook:
            try:
                from IPython.display import display
                get_ipython  # raises NameError outside of IPython
            except (ImportError, NameError):
                # not running under IPython: skip the rich display on purpose
                pass
            else:
                print('Accuracy = %.2f%%\n'%(100. * acc) + '=' * 60)
                print('True \\ Pred')
                display(confusion_matrix)
                display(metrics)

        return {
            'acc': acc,
            'prediction': pred_y,
            'confusion_matrix': confusion_matrix,
            'metrics': metrics
        }