In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pandas as pd
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score, roc_curve
%matplotlib inline

In [2]:
data = pd.read_csv("comments.tsv", sep = '\t')
texts = data['comment_text'].values
target = data['should_ban'].values
data[50::200]

Unnamed: 0,should_ban,comment_text
50,0,"""Those who're in advantageous positions are th..."
250,1,Fartsalot56 says f**k you motherclucker!!
450,1,"Are you a fool? \n\nI am sorry, but you seem t..."
650,1,I AM NOT A VANDAL!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
850,0,Citing sources\n\nCheck out the Wikipedia:Citi...


In [31]:
texts_train, texts_test, y_train, y_test = train_test_split(texts, target, test_size=0.5, random_state=17)

In [32]:
tokenizer = TweetTokenizer()
preprocess = lambda text: ' '.join(tokenizer.tokenize(text.lower()))

text = 'How to be a grown-up at work: replace "fuck you" with "Ok, great!".'
print("before:", text,)
print("after:", preprocess(text),)

before: How to be a grown-up at work: replace "fuck you" with "Ok, great!".
after: how to be a grown-up at work : replace " fuck you " with " ok , great ! " .


In [33]:
texts_train = [preprocess(text) for text in texts_train]
texts_test = [preprocess(text) for text in texts_test]

In [34]:
vectorizer = CountVectorizer(max_features = 10000)
X_train_bow = vectorizer.fit_transform(texts_train).toarray()
X_test_bow = vectorizer.transform(texts_test).toarray()
vectorizer.get_feature_names_out()

array(['00', '000', '000000', ..., 'ホストクラブ', 'ホスト部', '桜蘭高校ホスト部'],
      dtype=object)

In [35]:
X_train_bow.shape

(500, 5285)

In [40]:
class BinaryNaiveBayes:


    def fit(self, X, y):

        _, counts = np.unique(y, return_counts = True)
        pos_prob = counts[0] / len(y)
        neg_prob = 1 - pos_prob
        # first, compute marginal probabilities of every class, p(y=k) for k = 0,1
        self.p_y = np.array([pos_prob, neg_prob])

        # count occurences of each word in texts with label 1 and label 0 separately
        word_counts_positive = np.ones(X_train_bow.shape[1])
        word_counts_negative = np.ones(X_train_bow.shape[1])

        for i in range(X.shape[0]):
            for j in range(X.shape[1]):
                if y[i] == 0 and X[i][j] != 0:
                    word_counts_positive[j] += 1
                elif y[i] == 1 and X[i][j] != 0:
                    word_counts_negative[j] += 1
        # ^-- both must be vectors of shape [vocab_size].

        # finally, lets use those counts to estimate p(x | y = k) for k = 0, 1

        self.p_x_given_positive = word_counts_positive / len(word_counts_positive)
        self.p_x_given_negative = word_counts_negative / len(word_counts_negative)


        return self

    def predict_scores(self, X):

        # compute scores for positive and negative classes separately.
        # these scores should be proportional to log-probabilities of the respective target {0, 1}
        # note: if you apply logarithm to p_x_given_*, the total log-probability can be written
        # as a dot-product with X
        score_positive = X @ np.log(self.p_x_given_positive[:, np.newaxis]) + np.log(self.p_y[0])
        score_negative = X @ np.log(self.p_x_given_negative[:, np.newaxis]) + np.log(self.p_y[1])

        # you can compute total p(x | y=k) with a dot product
        return np.stack([score_negative, score_positive], axis=-1)

    def predict(self, X):
        return self.predict_scores(X).argmin(axis=-1)

In [41]:
naive_model = BinaryNaiveBayes().fit(X_train_bow, y_train)

In [42]:
roc_auc_score(naive_model.predict(X_test_bow), y_test)

0.7902463412417053