In [1]:
import pandas as pd
import math
import re

In [2]:
q2_data = {
    'sentence': ['fun couple love love', 
                 'fast furious shoot', 
                 'couple fly fast fun fun', 
                 'furious shoot shoot fun', 
                 'fly fast shoot love',
                ],
    'class': ['comedy',
              'action',
              'comedy', 
              'action',
              'action',
             ]
    
}
q2_train = pd.DataFrame(q2_data)

reviews_data = {
    'sentence': ["I am very satisfied with my purchase of the Echo Dot 5th generation! The sound is much better than I expected, and the audio quality is impressive for its size. Alexa responds quickly to my commands and helps me with daily tasks, from playing music to controlling my smart devices. Additionally, its design is sleek and easy to integrate into any room. The seller was very quick with shipping, and the product arrived in perfect condition. I will definitely recommend both the product and the seller! Thank you for a great shopping experience.",
                 "This is the best investment that I have made. the sound is perfect. I can automate my apartment using alexa. My only complaint is the light they provided with. It is just a mediocre light. But, the fact that I got both at $22 is the most fantastic steal. Would definitely recommend!",
                 "I love both I have almost one year using they work perfectly",
                 "Works great with Alexa. Fun colors. Still figuring out how to connect it to other household members phones. The bulb, not echo.",
                 "I purchased the echo and Smart bulb on a Christmas deal for a great price. I was not looking for a lightbulb, but it was included with the purchase. However, I do love the new lightbulb. My son thinks it’s great that we can change the colors of the lightbulb, I also love that it is adjustable to different levels depending on the needed brightness. As always, I love my Alexa echo and have synced them throughout the house. The speaker is amazing and Alexa is receptive to voice commands at all times. Even my toddler loves to talk to Alexa! I have an Alexa in almost every room at this point, and I’m not sure how we lived without her before! The combination of the echo and the lightbulb is perfect and I am so glad I was able to get both of these products for a great price.",
                 "Isn't work.. the first month work perfectly but now I have to unplug and plug in again to make work for cuplé times and after a need to disconnected and connect again from the power and leave reset all the time I want to use.. how I can fix this",
                 "I don't appreciate them naming two different products the same name. If it says 5th generation there shouldn't be TWO 5th generations. One w a clock display & one without a clock displace. Name them something different ?! Now I'm suck with one I dont want. Frankly that's pretty stupid.",
                 "Got this Echo Dot to run routines triggered when room is occupied or unoccupied. Previously had Echo Flex with motion sensors which worked great. Assumed these more expensive and newer models would work at least as well. They do not! Have tried adjusting the sensitivity and distance settings to no avail. Rarely does the Echo Dot detect my motion to trigger my routines although sometimes the cat triggers it in the middle of the night. Extremely frustrating and disappointing.",
                ],
    'class': ['positive',
              'positive',
              'positive',
              'positive',
              'positive',
              'negative',
              'negative',
              'negative',
             ]
}
reviews_train = pd.DataFrame(reviews_data)

In [3]:
def generateClassProbs(train_df):
    class_counts = {}
    for class_name in train_df['class']:
        if not class_name in class_counts:
            class_counts[class_name] = 0
        class_counts[class_name] += 1
    class_probs = {name: math.log(count / len(train_df['class'])) for name, count in class_counts.items()}
    
    return class_probs

def generateUniqueWordsByDoc(train_df):
    unique_words_by_doc = {class_name: list() for class_name in train_df['class']}
    for index in range(len(train_df)):
        gold_class = train_df['class'][index]
        unique_words_by_doc[gold_class].append(set(train_df['sentence'][index].split()))
        
    return unique_words_by_doc

def generateBinarizedCounts(train_df, uniq_words_doc):
    binarized_counts = {key: dict() for key in uniq_words_doc.keys()}
    for key in uniq_words_doc.keys():
        list_of_words = uniq_words_doc[key]
        for word_set in list_of_words:
            for word in word_set:
                if not word in binarized_counts[key]:
                    binarized_counts[key][word] = 0
                binarized_counts[key][word] += 1
                
    return binarized_counts

def getVocabularySize(bin_counts):
    class_vocab_size = {}
    for classification in bin_counts.keys():
        class_vocab_size[classification] = 0
        for count in bin_counts[classification].values():
            class_vocab_size[classification] += count
            
    return class_vocab_size

def negateSentences(train_df):
    negated_sentences = []
    
    for sentence in train_df['sentence']:
        negative_word = False
        negated_review = []
        for token in sentence.split():
            if negative_word:
                token = 'NOT_'+token
                if any(punc in token for punc in ('!', '.', '?', ';', ':', ',')):
                    negative_word = False
            if any(negator in token for negator in ("n't", 'not', 'no', 'never', "n’t", 'dont', 'wont', 'cant', 'isnt')):
                negative_word = True
            negated_review.append(token)
        negated_sentence = ' '.join(negated_review)
        negated_sentences.append(negated_sentence)
    train_df['sentence'] = negated_sentences

    return train_df

In [4]:
def nieveBays(document, train_df):
    class_probs = generateClassProbs(train_df)
    unique_words_by_doc = generateUniqueWordsByDoc(train_df)
    binarized_counts = generateBinarizedCounts(train_df, unique_words_by_doc)
    vocab_size_by_class = getVocabularySize(binarized_counts)
    train_df = negateSentences(train_df)

    new_tok = document.split()
    vocab_size = len(new_tok)
    post_probs = {key: 0 for key in binarized_counts.keys()}
    for classification in post_probs.keys():
        probs_list = []
        for word in new_tok:
            if word in binarized_counts[classification]:
                prob = (binarized_counts[classification][word] + 1) / (vocab_size_by_class[classification] + vocab_size)
            else:
                prob = 1 / (vocab_size_by_class[classification] + vocab_size)
            probs_list.append(math.log(prob))
        post_probs[classification] = sum(probs_list) + class_probs[classification]
    
    return  (max(post_probs, key=post_probs.get), post_probs)

In [5]:
new_doc = 'fast couple shoot fly'
predicted_class, log_posterier_probs = nieveBays(new_doc, q2_data)
print(f'The predicted class is {predicted_class}.\nThe log posterier probabilities are: {log_posterier_probs}.')

The predicted class is action.
The log posterier probabilities are: {'comedy': -8.006784147535463, 'action': -6.9081718588673535}.


In [6]:
new_review = """I still want to return this but I've been out of town and missed the window for returns. Let me tell you about this: it would never ever hold a radio station. The chat bot would tell me again and again to restore factory settings. I did that half a dozen times and still had the same problem. So I plugged in my old Alexa and voila! No problem dropping the radio station. In fact I noticed the sound was better and the cord is longer. I'd rather keep what I've got and send the blue blob right back to Amazon."""
predicted_class, log_poster_probs = nieveBays(new_review, reviews_train)
print(f'The predicted class is {predicted_class}.\nThe log posterier probabilities are: {log_poster_probs}.')

The predicted class is negative.
The log posterier probabilities are: {'positive': -541.4312451028625, 'negative': -520.3253505424128}.
