<a href="https://colab.research.google.com/github/2020-nlp-c/nlp-statisticsmodel/blob/master/bogyung/SpamFiltering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 스팸 필터링

In [89]:
import numpy as np
import pandas as pd
# Toy training corpus: one [message text, label] pair per e-mail.
mail = [
    ["me free lottery", "spam"],
    ["free get free you", "spam"],
    ["you free scholarship", "normal"],
    ["free to contact me", "normal"],
    ["you won award", "normal"],
    ["you ticket lottery", "spam"],
]
# Tabular view of the corpus; "tokens" still holds the raw text at this point.
df = pd.DataFrame(data=mail, columns=["tokens", "label"])
df  # input

Unnamed: 0,tokens,label
0,me free lottery,spam
1,free get free you,spam
2,you free scholarship,normal
3,free to contact me,normal
4,you won award,normal
5,you ticket lottery,spam


In [90]:
# Turn each message into a list of whitespace-separated tokens.
# Entries that are already lists are passed through unchanged, so running this
# cell a second time is harmless (calling .split() on a list would raise
# AttributeError).
df["tokens"] = df["tokens"].map(
    lambda text: text.split() if isinstance(text, str) else text
)
df # input

Unnamed: 0,tokens,label
0,"[me, free, lottery]",spam
1,"[free, get, free, you]",spam
2,"[you, free, scholarship]",normal
3,"[free, to, contact, me]",normal
4,"[you, won, award]",normal
5,"[you, ticket, lottery]",spam


In [91]:
# Vocabulary: every distinct token over all messages, in sorted order.
# Built with plain Python rather than np.unique so that (a) the cell does not
# need numpy and (b) the dictionary keys below are ordinary `str` objects,
# which print as 'award' rather than as numpy scalar reprs.
tokens = sorted({token for token_list in df.tokens for token in token_list})

# Per-class occurrence counters, one zero-initialised entry per vocabulary word.
spam = dict.fromkeys(tokens, 0)
normal = dict.fromkeys(tokens, 0)

In [92]:
# Show both (still zeroed) counters, one dictionary per line.
print(spam, normal, sep="\n")

{'award': 0, 'contact': 0, 'free': 0, 'get': 0, 'lottery': 0, 'me': 0, 'scholarship': 0, 'ticket': 0, 'to': 0, 'won': 0, 'you': 0}
{'award': 0, 'contact': 0, 'free': 0, 'get': 0, 'lottery': 0, 'me': 0, 'scholarship': 0, 'ticket': 0, 'to': 0, 'won': 0, 'you': 0}


In [93]:
# Start from zeroed counters so that re-running this cell does not count the
# same messages a second time.
spam = dict.fromkeys(spam, 0)
normal = dict.fromkeys(normal, 0)

# Walk the token lists and their labels together; pairing them with zip avoids
# using an enumerate position as an index label on df.label.
for token_list, label in zip(df.tokens, df.label):
    for token in token_list:
        if label == "spam":
            spam[token] += 1
        elif label == "normal":
            normal[token] += 1

In [94]:
# Show the per-class token counts, one dictionary per line.
print(spam, normal, sep="\n")

{'award': 0, 'contact': 0, 'free': 3, 'get': 1, 'lottery': 2, 'me': 1, 'scholarship': 0, 'ticket': 1, 'to': 0, 'won': 0, 'you': 2}
{'award': 1, 'contact': 1, 'free': 2, 'get': 0, 'lottery': 0, 'me': 1, 'scholarship': 1, 'ticket': 0, 'to': 1, 'won': 1, 'you': 2}


In [102]:
# One row per vocabulary word, one count column per class.
# NOTE: the name `filter` hides Python's builtin of the same name; it is kept
# because the cells below read this variable.
filter = pd.DataFrame(dict(spam=spam, normal=normal))
filter

Unnamed: 0,spam,normal
award,0,1
contact,0,1
free,3,2
get,1,0
lottery,2,0
me,1,1
scholarship,0,1
ticket,1,0
to,0,1
won,0,1


In [109]:
# Additive smoothing constant.
k = 0.5
# Total number of token occurrences counted for each class.
n_spam = filter["spam"].sum()
n_normal = filter["normal"].sum()

# Smoothed per-class word scores: (k + count) / (2k + class total).
filter["P(w|spam)"] = (filter["spam"] + k) / (n_spam + 2 * k)
filter["P(w|normal)"] = (filter["normal"] + k) / (n_normal + 2 * k)

# Log-scores, so that a message can be scored by summation.
filter["Log(P(w|spam))"] = np.log(filter["P(w|spam)"])
filter["Log(P(w|normal))"] = np.log(filter["P(w|normal)"])

filter

Unnamed: 0,spam,normal,P(w|spam),P(w|normal),Log(P(w|spam)),Log(P(w|normal))
award,0,1,0.045455,0.136364,-3.091042,-1.99243
contact,0,1,0.045455,0.136364,-3.091042,-1.99243
free,3,2,0.318182,0.227273,-1.145132,-1.481605
get,1,0,0.136364,0.045455,-1.99243,-3.091042
lottery,2,0,0.227273,0.045455,-1.481605,-3.091042
me,1,1,0.136364,0.136364,-1.99243,-1.99243
scholarship,0,1,0.045455,0.136364,-3.091042,-1.99243
ticket,1,0,0.136364,0.045455,-1.99243,-3.091042
to,0,1,0.045455,0.136364,-3.091042,-1.99243
won,0,1,0.045455,0.136364,-3.091042,-1.99243


In [181]:
input_tokens = "free lottery".split()

# Log-priors, taken from each class's share of all counted token occurrences.
P_spam = np.log( n_spam/(n_spam+n_normal) )
P_normal = np.log( n_normal/(n_spam+n_normal) )

# Add the log-score of every input token for each class.
for token in input_tokens:
    # A word that never occurred in the training messages has no row in the
    # table; skip it instead of failing on an empty lookup.
    if token not in filter.index:
        continue
    # .at reads the single cell directly (no one-element Series -> float()).
    P_spam += filter.at[token, "Log(P(w|spam))"]
    P_normal += filter.at[token, "Log(P(w|normal))"]

# Normalise the two scores so they sum to 1. Subtracting the larger log-score
# before exponentiating leaves the ratio unchanged but keeps exp() from
# underflowing to 0/0 when many tokens are summed.
log_shift = max(P_spam, P_normal)
weight_spam = np.exp(P_spam - log_shift)
weight_normal = np.exp(P_normal - log_shift)
P_spam_fin = weight_spam / (weight_spam + weight_normal)
P_normal_fin = weight_normal / (weight_spam + weight_normal)
print(round(P_spam_fin*100, 2),"%", round(P_normal_fin*100, 2),"%")

87.5 % 12.5 %


# 클래스

In [213]:
import pandas as pd
import numpy as np

class SpamFiltering:
    """Score a message as spam or normal from per-class token counts.

    Parameters
    ----------
    data : pandas.DataFrame
        Training messages. Must provide a ``tokens`` column holding one list
        of tokens per message and a ``label`` column. Only rows labelled
        ``"spam"`` or ``"normal"`` contribute to the counts.
    input_tokens : str
        The message to score, as whitespace-separated text.
    k : float, default 0.5
        Additive smoothing constant.

    Notes
    -----
    The smoothed score is ``(k + count) / (2 * k + class_total)``, where
    ``class_total`` is the number of token occurrences counted for the class.
    With this denominator the per-class scores do not sum to 1 over the
    vocabulary; the formula is kept as-is so existing results are unchanged.
    """

    def __init__(self, data, input_tokens, k = 0.5):
        self.data = data
        self.input_tokens = input_tokens
        self.k = k

    def percent_table(self):
        """Build the per-token count and score table from ``self.data``.

        Returns
        -------
        pandas.DataFrame
            Indexed by token (sorted), with columns ``spam`` and ``normal``
            (occurrence counts), ``P(w|spam)`` and ``P(w|normal)`` (smoothed
            scores) and ``Log(P(w|spam))`` and ``Log(P(w|normal))`` (their
            natural logarithms).
        """
        # Sorted vocabulary of every distinct token in the training data.
        vocabulary = sorted(
            {token for token_list in self.data.tokens for token in token_list}
        )
        counts = {
            "spam": dict.fromkeys(vocabulary, 0),
            "normal": dict.fromkeys(vocabulary, 0),
        }

        # zip pairs each token list with its own label, so this also works
        # when the frame's index is not 0..n-1 (e.g. after filtering rows).
        for token_list, label in zip(self.data.tokens, self.data.label):
            label_counts = counts.get(label)
            if label_counts is None:
                # Labels other than "spam"/"normal" are not counted.
                continue
            for token in token_list:
                label_counts[token] += 1

        table = pd.DataFrame(counts)
        n_spam = table["spam"].sum()
        n_normal = table["normal"].sum()
        table["P(w|spam)"] = (self.k + table["spam"]) / (2 * self.k + n_spam)
        table["P(w|normal)"] = (self.k + table["normal"]) / (2 * self.k + n_normal)
        table["Log(P(w|spam))"] = np.log(table["P(w|spam)"])
        table["Log(P(w|normal))"] = np.log(table["P(w|normal)"])
        return table

    def percent_value(self):
        """Print the spam / normal percentages for ``self.input_tokens``.

        The class totals used for the priors are taken from this instance's
        own table, not from variables defined elsewhere in the notebook.
        Tokens that do not appear in the training data are ignored.

        Returns
        -------
        None
            The result is written to standard output as two lines,
            ``spam: <pct> %`` and ``normal: <pct> %``.
        """
        table = self.percent_table()
        n_spam = table["spam"].sum()
        n_normal = table["normal"].sum()

        # Log-priors from each class's share of all counted token occurrences.
        P_spam = np.log( n_spam/(n_spam+n_normal) )
        P_normal = np.log( n_normal/(n_spam+n_normal) )

        for token in self.input_tokens.split():
            # No row for an unseen word: skip it rather than fail the lookup.
            if token not in table.index:
                continue
            P_spam += table.at[token, "Log(P(w|spam))"]
            P_normal += table.at[token, "Log(P(w|normal))"]

        # Shift by the larger log-score before exponentiating: the ratio is
        # unchanged, but long messages no longer underflow to 0/0.
        log_shift = max(P_spam, P_normal)
        weight_spam = np.exp(P_spam - log_shift)
        weight_normal = np.exp(P_normal - log_shift)
        P_spam_fin = weight_spam / (weight_spam + weight_normal)
        P_normal_fin = weight_normal / (weight_spam + weight_normal)
        print(" spam:", round(P_spam_fin*100, 2),"%\n", "normal:", round(P_normal_fin*100, 2),"%")

In [217]:
sf = SpamFiltering(data = df, input_tokens = "get lottery")
sf.percent_value()

 spam: 93.75 %
 normal: 6.25 %


In [219]:
sf = SpamFiltering(data = df, input_tokens = "scholarship award")
sf.percent_value()

 spam: 10.0 %
 normal: 90.0 %
