In [2]:
import csv
from itertools import count
import operator
import math
# Probability assigned to a word that is absent from a probability map
# (see get_word_prob); being non-zero, it keeps math.log() defined.
EPSILON = 0.000001

In [3]:
def standardize(word):
    """
    Normalize a raw token for counting.

    The token is lowercased, surrounding whitespace is stripped, and every
    character that is not alphabetic (per ``str.isalpha``) is dropped, so
    punctuation, digits and inner whitespace all disappear.

    Returns the cleaned string, which may be empty.
    """
    lowered = word.lower().strip()
    return ''.join(filter(str.isalpha, lowered))

In [4]:

def is_stop(word):
    """
    Report whether ``word`` is one of the common filler words that are
    left out of the analysis.

    The comparison is exact, so a capitalized form such as 'The' is not
    treated as a stop word.
    """
    return word in ('to', 'i', 'the', 'and', 'of')

In [5]:
def add_word_to_count_map(wordMap, word):
    """
    Record one occurrence of ``word`` in ``wordMap`` (mutated in place).

    Stop words are ignored. Any other word has its tally increased by one,
    starting from zero the first time it is seen.
    """
    if not is_stop(word):
        wordMap[word] = wordMap.get(word, 0) + 1

In [27]:
def make_word_count_map(fileName):
    """
    Read a UTF-8 text file and count how often each word occurs.

    Each line is split on runs of whitespace, every token is passed through
    ``standardize``, and the result is tallied with
    ``add_word_to_count_map`` (which leaves stop words out of the map).
    Tokens that standardize to the empty string (numbers, bare punctuation)
    are not words and are skipped entirely.

    Returns: (wordMap, nWords) where ``wordMap`` maps word -> count and
    ``nWords`` is the number of non-empty words read, stop words included.
    """
    wordMap = {}
    nWords = 0
    with open(fileName, encoding="utf-8") as f:
        for line in f:
            # split() with no argument splits on any whitespace run, so tabs,
            # newlines and repeated spaces never yield empty or fused tokens.
            for token in line.split():
                word = standardize(token)
                if not word:
                    # Nothing alphabetic survived; counting '' as a word
                    # would pollute both the map and the total.
                    continue
                add_word_to_count_map(wordMap, word)
                nWords += 1
    return wordMap, nWords


In [7]:
def get_word_prob(word_prob_map, word):
    """
    Look up the probability of ``word`` in ``word_prob_map``.

    Words missing from the map get the module-level EPSILON instead.
    """
    return word_prob_map[word] if word in word_prob_map else EPSILON


In [22]:
def calc_term_doc_given_author(prob_map, counts):
    """
    Log-likelihood of a document under an author's word distribution.

    prob_map: word -> probability for the author.
    counts: word -> number of occurrences in the document.

    Returns the sum over the document's words of count * log(p_word),
    i.e. the log of the product of p_word ** count. Words missing from
    prob_map use the fallback supplied by get_word_prob.
    """
    # Work in log space: multiplying many small probabilities underflows to
    # 0.0, whereas log(p ** c) == c * log(p) stays representable.
    log_prob = 0
    for word, c_i in counts.items():
        p_word = get_word_prob(prob_map, word)
        log_prob += c_i * math.log(p_word)
    return log_prob


In [9]:
def make_word_prob_map(fileName):
    """
    Build a word -> probability table for the text in ``fileName``.

    Each word's probability is its count divided by the total word count,
    both as reported by ``make_word_count_map``.
    """
    counts, total = make_word_count_map(fileName)
    return {word: float(n) / total for word, n in counts.items()}

In [21]:
def main():
    """
    Score 'Test.txt' against the word distributions learned from
    'Real.txt' and 'Ai.txt', and print both scores.
    """
    # Per-word probabilities for each reference corpus.
    real_probs = make_word_prob_map('Real.txt')
    ai_probs = make_word_prob_map('Ai.txt')

    # Word counts of the document being classified; the total is not needed.
    doc_counts, _ = make_word_count_map('Test.txt')

    real_score = calc_term_doc_given_author(real_probs, doc_counts)
    print('---'*10)
    ai_score = calc_term_doc_given_author(ai_probs, doc_counts)

    for label, score in (("Real Term\t", real_score), ("Ai Term\t", ai_score)):
        print(label, score)


    

# Run the comparison only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

------------------------------
Real Term	 0.0
Ai Term	 0.0
