In [None]:
%autosave 15
%matplotlib inline

import numpy as np
import scipy as sp
import pandas as pd
import math as math
import random
import os

from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from IPython.display import display, HTML

In [None]:
SPAM = "spmsg"
LEGIT = "legit"


class Email:
    """One corpus message parsed into integer word ids.

    The label is taken from the file name: a name containing ``SPAM`` marks the
    message as spam. The first line supplies the title (its leading token is
    skipped), the second line is never read, and every following line
    contributes to the body.
    """

    def __init__(self, filename, lines):
        self.filename = filename
        self.isSpam = SPAM in filename

        # Drop the first token of the header line; the remaining tokens are word ids.
        header_tokens = lines[0].split()
        self.title = [int(token) for token in header_tokens[1:]]

        # Line index 1 is skipped; everything after it is flattened into one list.
        self.body = [int(token) for line in lines[2:] for token in line.split()]

    def __str__(self):
        label = "SPAM" if self.isSpam else "LEGIT"
        return "{} EMAIL {}: {}, {}\n".format(label, self.filename, self.title, self.body)

    def __repr__(self):
        return self.__str__()

In [None]:
class DictValue:
    """Occurrence statistics for a single word, split by class and message part.

    Attributes (all start at 0):
        sTitle, lTitle: occurrences in spam / legit titles.
        sBody, lBody: occurrences in spam / legit bodies.
        s, l: occurrences anywhere (title or body) in spam / legit messages.
    """

    def __init__(self):
        self.sTitle = 0
        self.lTitle = 0
        self.sBody = 0
        self.lBody = 0
        self.s = 0
        self.l = 0

    def foundInTitle(self, isSpam):
        """Record one occurrence of the word in a title of the given class."""
        if isSpam:
            self.sTitle += 1
            self.s += 1
        else:
            self.lTitle += 1
            self.l += 1

    def foundInBody(self, isSpam):
        """Record one occurrence of the word in a body of the given class."""
        if isSpam:
            self.sBody += 1
            self.s += 1
        else:
            self.lBody += 1
            self.l += 1

    def __repr__(self):
        return str(self)

    def __str__(self):
        # Only the combined per-class totals are reported; the per-part
        # counters are intentionally left out of the text form.
        return "S: " + str(self.s) + ", L: " + str(self.l)

In [None]:
PARTS_DIR = "./Bayes/pu1/"
PARTS_NUM = 10


def getData():
    """Load every cross-validation part from disk.

    Reads the directories ``part1`` .. ``part<PARTS_NUM>`` under ``PARTS_DIR``
    and parses each file found there into an ``Email``.

    Returns:
        A list with ``PARTS_NUM`` entries; entry ``i`` is the list of emails
        read from ``part<i + 1>``.

    Raises:
        OSError: if a part directory or one of its files cannot be read.
    """
    allParts = []
    for partNumber in range(1, PARTS_NUM + 1):
        partDir = os.path.join(PARTS_DIR, "part" + str(partNumber))
        emails = []
        # os.listdir returns names in arbitrary order; sorting keeps runs reproducible.
        for filename in sorted(os.listdir(partDir)):
            # The context manager closes the file even when parsing raises.
            with open(os.path.join(partDir, filename), 'r') as emailFile:
                emails.append(Email(filename, emailFile.readlines()))
        allParts.append(emails)
    return allParts

# Load the whole corpus once; later cells read the module-level `parts` list.
parts = getData()

In [None]:
def _ratio(count, total):
    """Return ``count / total``, or 0.0 when ``total`` is zero."""
    return count / total if total else 0.0


def _countWords(stats, words, isSpam, inTitle):
    """Register every word of one email part in ``stats``; return how many were seen."""
    seen = 0
    for word in words:
        entry = stats.get(word)
        if entry is None:
            # Create the record only on first sight of the word instead of
            # building a throw-away DictValue for every occurrence.
            entry = stats[word] = DictValue()
        if inTitle:
            entry.foundInTitle(isSpam)
        else:
            entry.foundInBody(isSpam)
        seen += 1
    return seen


def learnBayes(emails, verbose=False):
    """Collect per-word statistics from labelled emails and normalise them.

    Args:
        emails: iterable of objects exposing ``title`` and ``body`` (iterables of
            words) and ``isSpam``.
        verbose: when True, print the class totals and every word's statistics
            before and after normalisation (debugging aid, off by default).

    Returns:
        A tuple ``(stats, spamWords, legitWords)``. ``stats`` maps each word to a
        ``DictValue`` whose fields hold relative frequencies: the raw count
        divided by the number of word occurrences in the matching class and
        part. ``spamWords`` / ``legitWords`` are the total numbers of word
        occurrences (title + body) seen in spam / legit emails.

    NOTE(review): the two totals count word occurrences, not emails, so a class
    prior derived from them is a word-level prior -- confirm this is intended.
    """
    stats = {}
    s_t = s_b = 0  # spam word occurrences in titles / bodies
    l_t = l_b = 0  # legit word occurrences in titles / bodies

    for email in emails:
        titleCount = _countWords(stats, email.title, email.isSpam, True)
        bodyCount = _countWords(stats, email.body, email.isSpam, False)
        if email.isSpam:
            s_t += titleCount
            s_b += bodyCount
        else:
            l_t += titleCount
            l_b += bodyCount

    spamTotal = s_t + s_b
    legitTotal = l_t + l_b
    if verbose:
        print(spamTotal, legitTotal)

    for val in stats.values():
        if verbose:
            print(val)
        # A zero total implies a zero count, so the frequency is reported as 0
        # instead of raising ZeroDivisionError (e.g. a training set without spam).
        val.sTitle = _ratio(val.sTitle, s_t)
        val.lTitle = _ratio(val.lTitle, l_t)
        val.sBody = _ratio(val.sBody, s_b)
        val.lBody = _ratio(val.lBody, l_b)
        val.s = _ratio(val.s, spamTotal)
        val.l = _ratio(val.l, legitTotal)
        if verbose:
            print(val)
            print()

    return (stats, spamTotal, legitTotal)
            
# Quick smoke run: learn word statistics from the first part only; the
# returned tuple is displayed as the cell output.
learnBayes(parts[0])

In [None]:
# titleWeight = (0..1)
def testBayes(spamDict, emails, ns, n, spamWeight = 1, legitWeight = 1, titleWeight = 0.5):
    TP = 0
    FP = 0
    TN = 0
    FN = 0
    for email in emails:
        p_spam = math.log(ns / n) * spamWeight
        p_legit = math.log((n - ns) / n) * legitWeight
        
        for word in email.title:
            wf = spamDict.get(word, DictValue())
            if (wf.s > 0):
                p_spam += math.log(wf.s)
            if (wf.l > 0):
                p_legit += math.log(wf.l)
        
        #for word in email.title:
        #    wf = spamDict.get(word, DictValue())
        #    if (wf.sTitle + wf.lTitle > 0):
        #        x = 1 / (wf.sTitle + wf.lTitle)
        #        if (wf.sTitle > 0):
        #            p_spam += math.log(wf.sTitle * x)
        #        if (wf.lTitle > 0):
        #            p_legit += math.log(wf.lTitle * x)

        #for word in email.body:
        #    wf = spamDict.get(word, DictValue())
        #    if (wf.sBody + wf.lBody > 0):
        #        x = 1 / (wf.sBody + wf.lBody)
        #        if (wf.sBody > 0):
        #            p_spam += math.log(wf.sBody * x)
        #        if (wf.lBody > 0):
        #            p_legit += math.log(wf.lBody * x)
                    
        #res.append((email, p_legit >= p_spam))
        #res.append((email.isSpam, p_spam >= p_legit))
        
        print(p_spam, p_legit)
        isCurrentSpam = p_spam >= p_legit
        if (email.isSpam != isCurrentSpam):
            if isCurrentSpam:
                FP += 1
            else:
                FN += 1
        else:
            if isCurrentSpam:
                TP += 1
            else:
                TN += 1
    if TP == 0:
        return 0
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)

    return 2 * (precision * recall) / (precision + recall)

In [None]:
def cv(spamWeight = 1, legitWeight = 1, titleWeight = 0.5):
    """Run leave-one-part-out cross-validation and return the mean score.

    For each of the ``PARTS_NUM`` entries of the module-level ``parts``, trains
    with ``learnBayes`` on all other entries, scores the held-out entry with
    ``testBayes``, prints that fold's score, and finally averages the scores.

    Args:
        spamWeight: forwarded unchanged to ``testBayes``.
        legitWeight: forwarded unchanged to ``testBayes``.
        titleWeight: forwarded unchanged to ``testBayes``.

    Returns:
        The sum of the fold scores divided by ``PARTS_NUM``.
    """
    totalScore = 0
    for testIndex in range(PARTS_NUM):
        # Everything except the held-out part is used for training.
        learningSet = [
            email
            for partIndex in range(PARTS_NUM)
            if partIndex != testIndex
            for email in parts[partIndex]
        ]
        testingSet = parts[testIndex]

        spamDict, spamCount, legitCount = learnBayes(learningSet)

        score = testBayes(spamDict, testingSet, spamCount, spamCount + legitCount,
                          spamWeight, legitWeight, titleWeight)
        print(score)
        totalScore += score

    return totalScore / PARTS_NUM

In [None]:
# Baseline run with the default weights; the mean cross-validation score is
# displayed as the cell output.
cv()

In [None]:
# Coarse grid over the two prior weights (1, 11, 21, 31, 41 for each):
# print the mean cross-validation score for every combination.
for sw in range(1, 50, 10):
    for lw in range(1, 50, 10):
        score = cv(spamWeight=sw, legitWeight=lw)
        print(sw, lw, score)
    