In [14]:
import os
import sys
import collections
import re
import math
import copy

In [39]:
class Document:
    """
    Document class to store email instances easier
    """
    title = ""
    # text = ""
    # x0 assumed 1 for all documents (training examples)
    word_freqs = {'weight_zero': 1.0}

    # spam or ham
    true_class = None
    learned_class = None

    # Constructor
    def __init__(self,
                 # text,
                 bag,
                 true_class,
                 title):
        # self.text = text
        self.bag_of_words = bag
        self.true_class = true_class
        self.title = title

    # def get_text(self):
    #     return self.text

    def get_word_freqs(self):
        return self.word_freqs

    def get_true_class(self):
        return self.true_class

    def get_learned_class(self):
        return self.learned_class

    def set_learned_class(self, guess):
        self.learned_class = guess

In [43]:
def bag_of_words(text):
    """
    counts frequency of each word in given text
    order of sequence doesn't matter
    :param text:
    :return:
    """
    bags_of_words = collections.Counter(re.findall(r'\w+', text))
    return dict(bags_of_words)

# bag = bag_of_words("hello world it is i Achilles")
# print(bag)

{'hello': 1, 'world': 1, 'it': 1, 'is': 1, 'i': 1, 'Achilles': 1}


In [47]:
def make_data_set(directory, true_class):
    """
    Read all text files in the given directory and construct the data set, D
    :param directory: the directory path should just be like "train/ham" for example
    :param true_class: True class is the true classification of the email (spam or ham)
    :return: the dictionary to store the email in
    """
    dictionary = dict()
    for dir_entry in os.listdir(directory):
        dir_entry_path = os.path.join(directory, dir_entry)
        if os.path.isfile(dir_entry_path):
            with open(dir_entry_path, 'r', errors='ignore') as text_file:
                # stores dictionary of dictionary of dictionary as explained above in the initialization
                key = dir_entry[:4]
                # print(key)
                title = dir_entry
                text = text_file.read()
                bag = bag_of_words(text)
                dictionary.update({key: Document(bag, true_class, title)})
    print("dataset built!")
    return dictionary


temp_path = "data/dataset 1/train/spam"
temp_class = "spam"
make_data_set(temp_path, temp_class)

TypeError: __init__() missing 1 required positional argument: 'title'

In [18]:
def extract_vocab(data_set):
    """
    Extracts the vocabulary of all the text in a data set
    :param data_set: 
    :return: 
    """
    v = []
    for i in data_set:
        for j in data_set[i].getWordFreqs():
            if j not in v:
                v.append(j)
    return v

In [19]:
def set_stop_words():
    """
    Set the stop words
    :return: 
    """
    stop_words_dir = 'stop_words.txt'
    with open(stop_words_dir, 'r') as txt:
        stops = (txt.read().splitlines())
    return stops

In [20]:
def remove_stop_words(stops, data_set):
    """
    Remove stop words from data set and store in dictionary
    :param stops: 
    :param data_set: 
    :return: 
    """
    filtered_data_set = copy.deepcopy(data_set)
    for i in stops:
        for j in filtered_data_set:
            if i in filtered_data_set[j].getWordFreqs():
                del filtered_data_set[j].getWordFreqs()[i]
    return filtered_data_set

In [22]:
def learn_weights(training_set, weights_param, iterations, lam, learning_constant):
    """
    Learn weights by using gradient ascent
    :param learning_constant:
    :param training_set:
    :param weights_param: 
    :param iterations: 
    :param lam: 
    :return: 
    """
    # Adjust weights num_iterations times
    print('Iteration number (out of {}}):'.format(iterations))
    for x in range(0, iterations):
        print(x)
        # Adjust each weight...
        counter = 1
        for w in weights_param:
            sum = 0.0
            # ...using all training instances
            for i in training_set:
                # y_sample is true y value (classification) of the doc
                y_sample = 0.0
                if training_set[i].getTrueClass() == classes[1]:
                    y_sample = 1.0
                # Only add to the sum if the doc contains the token (the count of it would be 0 anyways)
                if w in training_set[i].getWordFreqs():
                    sum += float(training_set[i].getWordFreqs()[w]) * (
                            y_sample - calculate_cond_prob(classes[1], weights_param, training_set[i]))
            weights_param[w] += ((learning_constant * sum) - (learning_constant * float(lam) * weights_param[w]))

# learn_weights()

In [23]:
def calculate_cond_prob(class_prob, weights_param, doc):
    """
    Calculate conditional probability for the specified doc. Where class_prob is 1|X or 0|X
    1 is spam and 0 is ham
    :param class_prob: 
    :param weights_param: 
    :param doc: 
    :return: 
    """
    # Total tokens in doc. Used to normalize word counts to stay within 0 and 1 for avoiding overflow

    # Handle 0
    if class_prob == classes[0]:
        sum_wx_0 = weights_param['weight_zero']
        for i in doc.getWordFreqs():
            if i not in weights_param:
                weights_param[i] = 0.0
            # sum of weights * token count for each token in each document
            sum_wx_0 += weights_param[i] * float(doc.getWordFreqs()[i])
        return 1.0 / (1.0 + math.exp(float(sum_wx_0)))
    # Handle 1
    elif class_prob == classes[1]:
        sum_wx_1 = weights_param['weight_zero']
        for i in doc.getWordFreqs():
            if i not in weights_param:
                weights_param[i] = 0.0
            # sum of weights * token count for each token in each document
            sum_wx_1 += weights_param[i] * float(doc.getWordFreqs()[i])
        return math.exp(float(sum_wx_1)) / (1.0 + math.exp(float(sum_wx_1)))

In [24]:
def apply_logistic_regression(data_instance, weights_param):
    """
    Apply algorithm to guess class for specific instance of test set
    :param data_instance: 
    :param weights_param: 
    :return: 
    """
    score = {}
    score[0] = calculate_cond_prob(classes[0], weights_param, data_instance)
    score[1] = calculate_cond_prob(classes[1], weights_param, data_instance)
    if score[1] > score[0]:
        return classes[1]
    else:
        return classes[0]

In [31]:
def build_data_sets(training_spam_dir, training_ham_dir, test_spam_dir, test_ham_dir):
    """
    takes directories holding the data text files as parameters. "train/ham" for example
    :param training_spam_dir:
    :param training_ham_dir:
    :param test_spam_dir:
    :param test_ham_dir:
    :return:
    """
    # Stores emails as dictionaries. email_file_name : Document (class defined below)
    training_set = dict()
    test_set = dict()

    # Filtered sets without stop words
    filtered_training_set = dict()
    filtered_test_set = dict()

    # Set up initial data sets. Dictionaries containing the text, word frequencies, and true/learned classifications
    make_data_set(training_set, training_spam_dir, classes[1])
    make_data_set(training_set, training_ham_dir, classes[0])
    make_data_set(test_set, test_spam_dir, classes[1])
    make_data_set(test_set, test_ham_dir, classes[0])

    # Set the stop words list
    stop_words = set_stop_words()

    # Set up datasets without stop words
    filtered_training_set = remove_stop_words(stop_words, training_set)
    filtered_test_set = remove_stop_words(stop_words, test_set)

    # Vocabulary/tokens in the training set
    # Extract training set vocabulary
    training_set_vocab = extract_vocab(training_set)
    filtered_training_set_vocab = extract_vocab(filtered_training_set)

    # store weights as dictionary. w0 initially 0.0, others initially 0.0. token : weight value
    weights = {'weight_zero': 0.0}
    filtered_weights = {'weight_zero': 0.0}

    # Set all weights in training set vocabulary to be initially 0.0. w0 ('weight_zero') is initially 0.0
    for i in training_set_vocab:
        weights[i] = 0.0
    for i in filtered_training_set_vocab:
        filtered_weights[i] = 0.0

    return filtered_training_set, filtered_test_set, training_set_vocab, filtered_training_set_vocab, weights, filtered_weights


dataset_root = "data/dataset 1/"
training_spam_dir = dataset_root + "train/spam"
training_ham_dir = dataset_root + "train/ham"
test_spam_dir = dataset_root + "test/spam"
test_ham_dir = dataset_root + "test/ham"
build_data_sets(training_spam_dir, training_ham_dir, test_spam_dir, test_ham_dir)

data/dataset 1/train/spam
data/dataset 1/train/ham
data/dataset 1/test/spam
data/dataset 1/test/ham


({'data/dataset 1/train/spam\\0100.2003-12-28.GP.spam.txt': <__main__.Document at 0x17039688250>,
  'data/dataset 1/train/spam\\0170.2004-01-09.GP.spam.txt': <__main__.Document at 0x17039688df0>,
  'data/dataset 1/train/spam\\0176.2004-01-11.GP.spam.txt': <__main__.Document at 0x17039688fd0>,
  'data/dataset 1/train/spam\\0204.2004-01-13.GP.spam.txt': <__main__.Document at 0x170390582b0>,
  'data/dataset 1/train/spam\\0250.2004-01-20.GP.spam.txt': <__main__.Document at 0x17038fedc70>,
  'data/dataset 1/train/spam\\0312.2004-01-26.GP.spam.txt': <__main__.Document at 0x170395fd250>,
  'data/dataset 1/train/spam\\0387.2004-02-07.GP.spam.txt': <__main__.Document at 0x170395fd370>,
  'data/dataset 1/train/spam\\0528.2004-02-21.GP.spam.txt': <__main__.Document at 0x170395fd7f0>,
  'data/dataset 1/train/spam\\0533.2004-02-21.GP.spam.txt': <__main__.Document at 0x170395fdd90>,
  'data/dataset 1/train/spam\\0557.2004-02-23.GP.spam.txt': <__main__.Document at 0x170395fd3d0>,
  'data/dataset 1/tr

In [29]:
def logistic_regression(training_set, test_set, lambda_constant, learning_constant, num_iterations, penalty, weights,
                        filtered_weights):
    """
    :param learning_constant:
    :param test_set:
    :param training_set:
    :param filtered_weights:
    :param lambda_constant:
    :return: 
    """
    penalty = lambda_constant

    # Learn weights
    learn_weights(training_set, weights, num_iterations, penalty)
    learn_weights(filtered_training_set, filtered_weights, num_iterations, penalty)

    # Apply algorithm on test set
    correct_guesses = 0.0
    for i in test_set:
        test_set[i].setLearnedClass(apply_logistic_regression(test_set[i], weights))
        if test_set[i].getLearnedClass() == test_set[i].getTrueClass():
            correct_guesses += 1.0

    # Apply algorithm on filtered test set
    correct_guesses_filtered = 0.0
    for i in filtered_test_set:
        filtered_test_set[i].setLearnedClass(apply_logistic_regression(filtered_test_set[i], filtered_weights))
        if filtered_test_set[i].getLearnedClass() == filtered_test_set[i].getTrueClass():
            correct_guesses_filtered += 1.0

    print("Correct guesses before filtering stop words:\t%d/%s" % (correct_guesses, len(test_set)))
    print("Accuracy before filtering stop words:\t\t\t%.4f%%" % (100.0 * float(correct_guesses) / float(len(test_set))))
    print("Correct guesses after filtering stop words:\t\t%d/%s" % (correct_guesses_filtered, len(filtered_test_set)))
    print("Accuracy after filtering stop words:\t\t\t%.4f%%" % (
            100.0 * float(correct_guesses_filtered) / float(len(filtered_test_set))))

In [28]:

dataset_root = "data/dataset 1/"
training_spam_dir = dataset_root + "train/spam"
training_ham_dir = dataset_root + "train/ham"
test_spam_dir = dataset_root + "test/spam"
test_ham_dir = dataset_root + "test/ham"

# ham = 0 for not spam, spam = 1 for is spam
classes = ["ham", "spam"]

lambda_constant = .001

# Stores emails as dictionaries. email_file_name : Document (class defined below)
# training_set = dict()
# test_set = dict()
#
# # Filtered sets without stop words
# filtered_training_set = dict()
# filtered_test_set = dict()
#
# # list of Stop words
# stop_words = []
#
# # Vocabulary/tokens in the training set
# training_set_vocab = []
# filtered_training_set_vocab = []
#
# # store weights as dictionary. w0 initially 0.0, others initially 0.0. token : weight value
# weights = {'weight_zero': 0.0}
# filtered_weights = {'weight_zero': 0.0}
#
# # Natural learning rate constant, number of iterations for learning weights, and penalty (lambda) constant
# learning_constant = .001
# num_iterations = 100
# penalty = 0.0

Iteration number (out of 100):
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
Iteration number (out of 100):
0
1
2
3
4
5


KeyboardInterrupt: 

In [None]:
training_set, test_set, filtered_training_set, filtered_test_set = build_data_sets(training_spam_dir, training_ham_dir,
                                                                                   test_spam_dir, test_ham_dir)

logistic_regression(lambda_constant)