<a href="https://colab.research.google.com/github/AdrianaCaetano/AI/blob/main/Assignment3/571AI_Assing3_AC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assignment 3

CS 571 AI - Prof Gutta

Author Adriana Caetano - Spring 2022


In [None]:
import csv
import math
from collections import Counter

def read_file(file_name, train_size):
    '''
    Load a CSV file and split its rows into a training and a testing list.

    The file is parsed with csv.DictReader, so every row is a dict keyed by
    the column names found on the header line. The first `train_size` data
    rows (in file order) go to the training list; every remaining row goes
    to the testing list.

    file_name: path of the CSV file to read
    train_size: number of leading rows to use for training

    Returns a tuple (train, test) of two lists of row dicts.
    '''
    train_rows, test_rows = [], []

    with open(file_name, newline='', mode='r') as handle:
        for position, record in enumerate(csv.DictReader(handle)):
            # rows are assigned purely by their position in the file
            bucket = train_rows if position < train_size else test_rows
            bucket.append(record)

    return train_rows, test_rows


In [None]:
def prior_probability(dataset):
    '''
    Compute the prior probability of each class in the training dataset.

    A row is counted as spam when its 'Target' value is exactly 'spam';
    every other row is counted as ham.

    P(spam) = number of spam docs / total docs
    P(ham)  = number of ham docs / total docs

    dataset: list of row dicts, each with a 'Target' key

    Returns a tuple (prior_spam, prior_ham).
    '''
    n_docs = len(dataset)
    n_spam = sum(1 for entry in dataset if entry['Target'] == 'spam')
    # whatever is not spam is ham
    n_ham = n_docs - n_spam

    return n_spam / n_docs, n_ham / n_docs

In [None]:
def posterior_probability(word_count, unique_words, total_words):
    '''
    Compute the Laplace-smoothed probability of a word given a class.

    Laplace smoothing adds 1 to the numerator and the number of unique words
    to the denominator, so a word that never occurs in the class still gets
    a small non-zero probability:

    P(word | class) = (# occurrences of word in class + 1)
                      / (# total words in class + # unique words)

    word_count: number of occurrences of the word in the class
    unique_words: number of unique words, added to the denominator
    total_words: total number of words counted for the class

    Returns the smoothed probability as a float.
    '''
    # The whole denominator must be parenthesized: without the parentheses
    # Python divides by unique_words first and then adds total_words.
    return (word_count + 1) / (unique_words + total_words)


# Apply Laplace smoothing: add 1 to the numerator and the total number of unique words to the denominator
# P(word | spam) = (number of times 'word' occurs in spam + 1) / (total words in spam docs + unique words)

# P(word | ham) = (number of times 'word' occurs in ham + 1) / (total words in ham docs + unique words)



In [None]:
def naive_bayes(train, message):
    r'''
    Classify a message as 'spam' or 'ham' with a multinomial Naive Bayes model.

    P(spam | message) is proportional to P(spam) x P(word1 | spam) x ... x P(wordN | spam)
    P(ham | message)  is proportional to P(ham)  x P(word1 | ham)  x ... x P(wordN | ham)
    Both share the denominator P(message), so it is not computed.
    If P(spam | message) > P(ham | message), return 'spam'; otherwise return 'ham'.

    Word likelihoods use Laplace smoothing:
    P(word | class) = (# occurrences of word in class + 1)
                      / (# total words in class + # unique words in the training dataset)

    The products are accumulated as sums of logarithms, because multiplying
    many probabilities below 1 underflows to 0.0 on long messages.

    train: list of row dicts; 'Target' holds the label ('spam', anything else
           is treated as ham) and '\tdata' holds the message text
    message: text of the message to classify

    Returns the string 'spam' or 'ham'.
    Raises ValueError if train is empty.
    '''
    if not train:
        raise ValueError('train must contain at least one labelled message')

    # Step 1: Count documents and word frequencies for each class
    doc_counts = {'spam': 0, 'ham': 0}
    word_counts = {'spam': Counter(), 'ham': Counter()}
    for row in train:
        label = 'spam' if row['Target'] == 'spam' else 'ham'
        doc_counts[label] += 1
        word_counts[label].update(row['\tdata'].lower().split())

    # Step 2: Size of the vocabulary shared by both classes (used for smoothing)
    vocabulary_size = len(set(word_counts['spam']) | set(word_counts['ham']))
    message_words = message.lower().split()

    def log_score(label):
        # A class with no training messages has prior 0 and can never win
        if doc_counts[label] == 0:
            return float('-inf')

        # log P(class)
        score = math.log(doc_counts[label] / len(train))

        counts = word_counts[label]
        denominator = sum(counts.values()) + vocabulary_size
        # denominator is 0 only when no training message contains any word;
        # there is then no word evidence and the prior alone decides
        if denominator > 0:
            for word in message_words:
                # Counter returns 0 for words never seen in this class
                score += math.log((counts[word] + 1) / denominator)
        return score

    # Step 3: Compare the (log) posterior scores of both classes
    if log_score('spam') > log_score('ham'):
        return 'spam'
    else:
        return 'ham'


In [None]:
# 1: Read datasets from files
# The first TRAIN_SIZE rows of the file train the classifier; the rest are used for testing
TRAIN_SIZE = 20
train, test = read_file('spam_detection.csv', TRAIN_SIZE)

# 2: Apply Naive Bayes to categorize new messages as spam/ham
accurate = 0 # counter to compute accuracy of the classification

# Loop through test messages to classify each message using naive_bayes
for row in test:
    result = naive_bayes(train, row['\tdata'])
    if result == row['Target']:
        accurate += 1

# 3: Report the percentage of test messages that were classified correctly
if test:
    accuracy = (accurate/len(test))*100
    print('Accuracy:', accuracy , "%")
else:
    # the file had no rows beyond the training split, so there is nothing to score
    accuracy = None
    print('Accuracy: n/a (no test rows left after the training split)')


Accuracy: 100.0 %
