In [None]:
%autosave 15
%matplotlib inline

import numpy as np
import scipy as sp
import pandas as pd
import math as math
import random
import os

from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from IPython.display import display, HTML

In [None]:
SPAM = "spmsg"
LEGIT = "legit"


class Email:
    """A single message of the corpus, tokenised into integer word ids.

    ``lines`` is the content of the message file as a list of text lines and
    must hold at least one line:

    * line 0 supplies the title; its first whitespace-separated token is
      skipped (presumably a header label - confirm against the data files)
      and the remaining tokens are parsed as integers;
    * line 1 is ignored;
    * every following line contributes integers to the body.

    The message is flagged as spam when the ``SPAM`` marker occurs in
    ``filename``.
    """

    def __init__(self, filename, lines):
        self.filename = filename
        self.isSpam = SPAM in filename

        # Everything after the first token of the first line is the title.
        title_tokens = lines[0].split()[1:]
        self.title = [int(token) for token in title_tokens]
        # Body starts at the third line; flatten all of its tokens.
        self.body = [int(token) for line in lines[2:] for token in line.split()]

    def __repr__(self):
        return str(self)

    def __str__(self):
        kind = "SPAM" if self.isSpam else "LEGIT"
        pieces = (kind, " EMAIL ", self.filename, ": ", str(self.title), ", ", str(self.body), "\n")
        return "".join(pieces)

In [None]:
class DictValue:
    """Occurrence tallies for one word.

    Four counters are kept, one per combination of email class and location:
    ``sTitle`` / ``lTitle`` for titles of spam / legit emails and
    ``sBody`` / ``lBody`` for their bodies.
    """

    def __init__(self):
        self.sTitle = self.lTitle = 0
        self.sBody = self.lBody = 0

    def foundInTitle(self, isSpam):
        """Count one title occurrence for the spam or the legit class."""
        if not isSpam:
            self.lTitle += 1
            return
        self.sTitle += 1

    def foundInBody(self, isSpam):
        """Count one body occurrence for the spam or the legit class."""
        if not isSpam:
            self.lBody += 1
            return
        self.sBody += 1

    def __repr__(self):
        return str(self)

    def __str__(self):
        # Each pair is printed as (spam count, legit count).
        title_counts = (self.sTitle, self.lTitle)
        body_counts = (self.sBody, self.lBody)
        return "".join(["TITLE: ", str(title_counts), ", BODY: ", str(body_counts)])

In [None]:
PARTS_DIR = "./Bayes/pu1/"
PARTS_NUM = 10


def getData(partsDir=None, partsNum=None):
    """Load the corpus as a list of folds, each a list of ``Email`` objects.

    Fold number ``i`` (counted from 1) is read from the sub-directory
    ``part<i>`` of ``partsDir``.  Every entry of that directory is opened as
    a text file and handed to ``Email`` together with its file name.

    Args:
        partsDir: directory that contains the ``part1`` .. ``part<partsNum>``
            folders.  ``None`` (the default) means ``PARTS_DIR``, looked up
            at call time.
        partsNum: how many folds to read.  ``None`` (the default) means
            ``PARTS_NUM``, looked up at call time.

    Returns:
        A list with ``partsNum`` elements; element ``i`` is the list of
        emails found in ``part<i + 1>``, ordered by file name.

    Raises:
        OSError: if a fold directory or one of its files cannot be read.
    """
    if partsDir is None:
        partsDir = PARTS_DIR
    if partsNum is None:
        partsNum = PARTS_NUM

    parts = []
    for i in range(partsNum):
        curPart = os.path.join(partsDir, "part" + str(i + 1))
        emails = []
        # os.listdir() yields entries in arbitrary order; sorting makes the
        # content of every fold reproducible from run to run.
        for filename in sorted(os.listdir(curPart)):
            # ``with`` releases the handle even if parsing the email raises.
            with open(os.path.join(curPart, filename), 'r') as file:
                emails.append(Email(filename, file.readlines()))
        parts.append(emails)
    return parts

# Load every fold once at module level; cv() indexes this list to build the
# training and testing sets.
parts = getData()

In [None]:
def learnBayes(emails):
    """Gather the word statistics of a collection of emails.

    takes: iterable of emails (objects exposing ``title``, ``body`` and
        ``isSpam``)
    returns: a 3-tuple ``(stats, totals, vocabularySize)`` where
        * ``stats`` maps every word to a ``DictValue`` holding how often the
          word occurred in titles / bodies of spam / legit emails,
        * ``totals`` is ``[spamWords, legitWords]``: the number of word
          occurrences (titles and bodies together) seen in spam and in legit
          emails,
        * ``vocabularySize`` is the number of distinct words seen.
    """
    word_stats = {}
    totals = [0, 0]  # slot 0: spam, slot 1: legit
    vocabulary = set()

    for email in emails:
        for word in email.title:
            vocabulary.add(word)
            entry = word_stats.setdefault(word, DictValue())
            entry.foundInTitle(email.isSpam)
            totals[0 if email.isSpam else 1] += 1

        for word in email.body:
            vocabulary.add(word)
            entry = word_stats.setdefault(word, DictValue())
            entry.foundInBody(email.isSpam)
            totals[0 if email.isSpam else 1] += 1

    return word_stats, totals, len(vocabulary)
            
#print(learnBayes(parts[0]))

In [None]:
# titleWeight = (0..1)
def testBayes(spamDict, counters, emails, ns, n, q, z):
    TP = 0
    FP = 0
    TN = 0
    FN = 0
    min_diff = 10000000000
    max_diff = -10000000000
    for email in emails:
        p_spam = math.log(ns / n)
        p_legit = math.log((n - ns) / n)
        
        for word in email.title:
            wf = spamDict.get(word, DictValue())
            p_spam += math.log((wf.sTitle + wf.sBody + z) / (counters[0] + z * q))
            p_legit += math.log((wf.lTitle + wf.lBody + z) / (counters[1] + z * q))

        for word in email.body:
            wf = spamDict.get(word, DictValue())
            p_spam += math.log((wf.sTitle + wf.sBody + z) / (counters[0] + q * z))
            p_legit += math.log((wf.lTitle + wf.lBody + z) / (counters[1] + q * z))

        isCurrentSpam = p_spam > p_legit
        if (email.isSpam != isCurrentSpam):
            if isCurrentSpam:
                FP += 1
            else:
                FN += 1
        else:
            if isCurrentSpam:
                TP += 1
            else:
                TN += 1
        
        if (p_spam - p_legit > max_diff):
            max_diff = p_spam - p_legit
        if (p_spam - p_legit < min_diff):
            min_diff = p_spam - p_legit
    if TP == 0:
        return 0
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)

    return 2 * (precision * recall) / (precision + recall), min_diff, max_diff

In [None]:
def cv(smoothing=0.9706422018348624):
    """Cross-validate the classifier over the folds of the global ``parts``.

    Each of the ``PARTS_NUM`` folds is used once as the testing set while the
    remaining folds together form the learning set.  The score of every fold
    is printed.

    Args:
        smoothing: additive smoothing term forwarded to ``testBayes`` as its
            ``z`` argument.  The default is the value that used to be
            hard-coded in the call.

    Returns:
        ``(best_result, min_best, max_best)`` where ``best_result`` is the
        tuple ``(spamDict, counters, testingSet, ns, n, unique)`` of the fold
        with the highest score (the first such fold on ties; ``None`` when
        there are no folds) and ``min_best`` / ``max_best`` are the extreme
        score differences reported by ``testBayes`` over all folds.
    """
    best_result = None
    best_score = -1

    min_best = 10000000000
    max_best = -10000000000
    for i in range(PARTS_NUM):
        testingSet = parts[i]
        # Every fold except the held-out one, concatenated in fold order.
        learningSet = [email for j in range(PARTS_NUM) if j != i for email in parts[j]]

        n = len(learningSet)
        ns = sum(1 for email in learningSet if email.isSpam)

        spamDict, counters, unique = learnBayes(learningSet)
        score, min_diff, max_diff = testBayes(spamDict, counters, testingSet, ns, n, unique, smoothing)

        min_best = min(min_best, min_diff)
        max_best = max(max_best, max_diff)

        if score > best_score:
            best_score = score
            best_result = spamDict, counters, testingSet, ns, n, unique
        print(score)

    return best_result, min_best, max_best

In [None]:
# Run the cross-validation once: `result` is the data of the best-scoring
# fold, `min_v` / `max_v` the extreme score differences over all folds.
result, min_v, max_v = cv()
# Unpack the best fold into module-level names; the ROC cell below passes
# them to getROC().
spamDict, counters, testingSet, ns, n, unique = result
print(min_v, max_v)

In [None]:
def getROC(spamDict, counters, emails, ns, n, q, z, w):
    """Return one ROC point ``(TPR, FPR)`` for the decision threshold ``w``.

    Emails are scored exactly as in ``testBayes`` (log prior plus smoothed
    log word frequencies, title and body counts pooled).  An email is
    predicted to be spam when ``spamScore - legitScore`` is strictly greater
    than ``w``; ``w == 0`` therefore reproduces the decision rule of
    ``testBayes``.  Every email takes part in the count, so the rates are
    fractions of all spam and of all legit emails respectively.

    Args:
        spamDict: mapping word -> statistics object exposing ``sTitle``,
            ``sBody``, ``lTitle`` and ``lBody``; missing words count as
            never seen.
        counters: ``[spamTotal, legitTotal]`` word totals of the two classes.
        emails: emails to classify (objects with ``title``, ``body`` and
            ``isSpam``).
        ns: number of spam emails behind the prior.
        n: total number of emails behind the prior.
        q: factor applied to ``z`` in the denominator.
        z: additive smoothing term.
        w: threshold on the score difference.

    Returns:
        ``(TP / P, FP / N)``: true positive rate first, false positive rate
        second.  A rate whose class is absent from ``emails`` is reported as
        ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    TP = FP = 0
    P = N = 0

    # The denominators do not depend on the email or on the word.
    spam_denom = counters[0] + z * q
    legit_denom = counters[1] + z * q

    for email in emails:
        p_spam = math.log(ns / n)
        p_legit = math.log((n - ns) / n)

        for words in (email.title, email.body):
            for word in words:
                wf = spamDict.get(word)
                if wf is None:
                    spam_hits = legit_hits = 0
                else:
                    spam_hits = wf.sTitle + wf.sBody
                    legit_hits = wf.lTitle + wf.lBody
                p_spam += math.log((spam_hits + z) / spam_denom)
                p_legit += math.log((legit_hits + z) / legit_denom)

        # The threshold is the decision boundary itself; it must not filter
        # emails out of the count, otherwise TP + FN no longer equals P.
        predictedSpam = (p_spam - p_legit) > w
        if email.isSpam:
            P += 1
            if predictedSpam:
                TP += 1
        else:
            N += 1
            if predictedSpam:
                FP += 1

    tpr = TP / P if P else 0.0
    fpr = FP / N if N else 0.0
    return tpr, fpr

In [None]:
# Smoothing term used for the ROC sweep.  It must equal the value cv() scores
# the folds with, otherwise the curve describes a different model than the
# one that was selected.
ROC_SMOOTHING = 0.9706422018348624
# Distance between two consecutive thresholds of the sweep.
ROC_STEP = 50

# One (FPR, TPR) pair per threshold between the extreme score differences
# reported by cv().
npoints = []
for w in np.arange(min_v, max_v, ROC_STEP):
    # getROC returns (TP / P, FP / N): the true positive rate comes first.
    TPR, FPR = getROC(spamDict, counters, testingSet, ns, n, unique, ROC_SMOOTHING, w)
    npoints.append((FPR, TPR))

FPR_MAX = max((fpr for fpr, _ in npoints), default=0)
TPR_MAX = max((tpr for _, tpr in npoints), default=0)

# Scale both rates by their maximum.  A rate that is 0 for every threshold
# (e.g. no false positive at all) stays 0 instead of dividing by zero, and an
# empty sweep still yields a (0, 2) array so that column slicing works.
# Column 0 is the scaled FPR (x axis), column 1 the scaled TPR (y axis).
points = np.array(
    [
        (fpr / FPR_MAX if FPR_MAX else 0.0, tpr / TPR_MAX if TPR_MAX else 0.0)
        for fpr, tpr in npoints
    ],
    dtype=float,
).reshape(-1, 2)


In [None]:
print(points)

# Column 0 of `points` holds FP / N and column 1 holds TP / P, each divided
# by its largest value over the threshold sweep.
fig, ax = plt.subplots()
ax.plot(points[:, 0], points[:, 1])
ax.set_xlabel("False positive rate (scaled by its maximum)")
ax.set_ylabel("True positive rate (scaled by its maximum)")
ax.set_title("ROC curve of the best cross-validation fold")
plt.show()