In [1]:
import pandas as pd
import math

In [2]:
q2_data = {
    'sentence': ['fun couple love love', 
                 'fast furious shoot', 
                 'couple fly fast fun fun', 
                 'furious shoot shoot fun', 
                 'fly fast shoot love',
                ],
    'class': ['comedy',
              'action',
              'comedy', 
              'action',
              'action',
             ]
    
}
q2_train = pd.DataFrame(q2_data)

In [3]:
def generateClassProbs(train_df):
    class_counts = {}
    for class_name in train_df['class']:
        if not class_name in class_counts:
            class_counts[class_name] = 0
        class_counts[class_name] += 1
    class_probs = {name: math.log(count / len(train_df['class'])) for name, count in class_counts.items()}
    return class_probs

def generateUniqueWordsByDoc(train_df):
    unique_words_by_doc = {class_name: list() for class_name in train_df['class']}
    for index in range(len(train_df)):
        gold_class = train_df['class'][index]
        unique_words_by_doc[gold_class].append(set(train_df['sentence'][index].split()))
    return unique_words_by_doc

def generateBinarizedCounts(train_df, uniq_words_doc):
    binarized_counts = {key: dict() for key in uniq_words_doc.keys()}
    for key in uniq_words_doc.keys():
        list_of_words = uniq_words_doc[key]
        for word_set in list_of_words:
            for word in word_set:
                if not word in binarized_counts[key]:
                    binarized_counts[key][word] = 0
                binarized_counts[key][word] += 1
    return binarized_counts

def getVocabularySize(bin_counts):
    class_vocab_size = {}
    for classification in bin_counts.keys():
        class_vocab_size[classification] = 0
        for count in bin_counts[classification].values():
            class_vocab_size[classification] += count
            
    return class_vocab_size

In [4]:
def nieveBays(document, train_df):
    class_probs = generateClassProbs(train_df)
    unique_words_by_doc = generateUniqueWordsByDoc(train_df)
    binarized_counts = generateBinarizedCounts(train_df, unique_words_by_doc)
    vocab_size_by_class = getVocabularySize(binarized_counts)

    new_tok = document.split()
    vocab_size = len(new_tok)
    post_probs = {key: 0 for key in binarized_counts.keys()}
    for classification in post_probs.keys():
        probs_list = []
        for word in new_tok:
            if word in binarized_counts[classification]:
                probs_list.append(math.log(1+1)-math.log(vocab_size+vocab_size_by_class[classification]))
            else:
                probs_list.append(-1*math.log(vocab_size+vocab_size_by_class[classification]))
        post_probs[classification] = sum(probs_list) + class_probs[classification]
    
    return  (max(post_probs, key=post_probs.get), post_probs)

In [11]:
new_doc = 'fast couple shoot fly'
predicted_class, log_posterier_probs = nieveBays(new_doc, q2_data)
print(f'The predicted class is {predicted_class}.\nThe log posterier probabilities are: {log_posterier_probs}.')

The predicted class is action.
The log posterier probabilities are: {'comedy': -8.006784147535463, 'action': -6.9081718588673535}.
