# 인공지능 HW5
### : SVM을 이용한 스팸 메일 분류
- 학번: 201711719
- 학과: 응용통계학과
- 이름: 심은선
- 제출날짜: 2020.10.01

# 1. Load Data

In [1]:
from google.colab import drive
# Mount Google Drive under /gdrive; the dataset path used later in this
# notebook lives below that mount point. force_remount=True is passed so the
# mount is redone even when the cell is re-run.
drive.mount("/gdrive", force_remount=True)

Mounted at /gdrive


In [2]:
import typing
from typing import List

In [3]:
def load_data(file_path: str) -> typing.Tuple[typing.List[str], typing.List[str]]:
    """Read a ``label<TAB>message`` text file into two parallel lists.

    Every line is stripped of surrounding whitespace and split on the first
    tab only, so a message that itself contains tab characters is kept whole.
    Blank lines are skipped instead of crashing the loader.

    Args:
        file_path: Path of a UTF-8 encoded file holding one sample per line.

    Returns:
        A ``(x_data, y_data)`` tuple: the message texts and their labels,
        both in file order.

    Raises:
        ValueError: If a non-blank line contains no tab separator.
    """
    x_data, y_data = [], []
    with open(file_path, 'r', encoding='utf8') as in_file:
        # Iterate the file lazily; there is no need to hold all raw lines.
        for line_no, raw_line in enumerate(in_file, start=1):
            line = raw_line.strip()
            if not line:
                continue  # tolerate empty lines (e.g. a trailing newline)
            label, separator, sentence = line.partition('\t')
            if not separator:
                raise ValueError(
                    "line %d of %r has no tab-separated message"
                    % (line_no, file_path)
                )
            x_data.append(sentence)
            y_data.append(label)

    return x_data, y_data

In [4]:
# Location of the tab-separated SMS corpus on the mounted Google Drive.
file_path = "/gdrive/My Drive/인공지능(4-2)/wk5.SVM/SMSSpamCollection"
X, y = load_data(file_path)

# Sanity check: report how many messages and how many labels were loaded.
print("x_data의 개수 : ", len(X), sep="")
print("y_data의 개수 : ", len(y), sep="")

x_data의 개수 : 1500
y_data의 개수 : 1500


# 2. Preprocessing

### 1) Tokenizer로 문장 embedding

In [5]:
from keras.preprocessing.text import Tokenizer

In [6]:
# Lookup tables between the textual class labels and their integer ids.
label2index = {'spam': 0, 'ham': 1}
index2label = {0: "spam", 1: "ham"}

# Encode every label of y as its integer id.
indexing_y = [label2index[label] for label in y]

# Vocabulary size is capped with num_words=300.
# NOTE(review): Keras documents that only the num_words-1 most frequent words
# are kept (index 0 is reserved), so this presumably yields 299 usable word
# columns out of 300 - confirm.
tokenizer = Tokenizer(num_words=300)

# Build the word index (vocabulary) from all messages in X.
tokenizer.fit_on_texts(X)

# Convert each message into a fixed-length vector of per-word counts and keep
# the result as plain nested lists.
indexing_X = tokenizer.texts_to_matrix(X, mode='count').tolist()

# Show the first sample before and after encoding.
print("x_data indexing 전 : ", X[0], sep="")
print("x_data indexing 후 : ", indexing_X[0], sep="")
print("y_data indexing 전 : ", y[0], sep="")
print("y_data indexing 후 : ", indexing_y[0], sep="")

x_data indexing 전 : Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
x_data indexing 후 : [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.

### 2) 카이제곱으로 문장 embedding

In [7]:
import numpy as np

In [8]:
def split_doc(X: List, y: List, label: str):
    """Return the documents of ``X`` whose entry in ``y`` equals ``label``.

    ``X`` and ``y`` are treated as parallel sequences: ``y[i]`` is the label
    of ``X[i]``. The relative order of the selected documents is preserved.
    """
    selected = []
    for position, document in enumerate(X):
        if y[position] != label:
            continue
        selected.append(document)
    return selected


def word_in_a_doc(doc: str):
    """Return the set of distinct whitespace-separated tokens in ``doc``.

    Tokens are taken verbatim: no lower-casing and no punctuation stripping.
    """
    tokens = doc.split()
    unique_tokens = set(tokens)
    return unique_tokens


def all_word(word_sets: List):
    """Return the union of all word sets as one new set.

    Args:
        word_sets: A list of per-document word collections.

    Returns:
        A new ``set`` holding every word that appears in at least one of the
        inputs; an empty set when ``word_sets`` is empty. The inputs are not
        modified.
    """
    # A single n-ary union avoids re-copying the accumulated set once per
    # document, which the pairwise ``a = a.union(b)`` loop did.
    return set().union(*word_sets)


def word_chisquare(sets_1: List, sets_2: List, total_word: set):
    """Compute a 2x2 chi-square score for every word in ``total_word``.

    For each word the contingency counts are:

    * ``A``: documents of ``sets_1`` containing the word
    * ``B``: documents of ``sets_2`` containing the word
    * ``C``: documents of ``sets_1`` not containing the word
    * ``D``: documents of ``sets_2`` not containing the word

    and the score is ``(A+B+C+D) * (A*D - C*B)**2`` divided by
    ``(A+C) * (B+D) * (A+B) * (C+D)``.

    Args:
        sets_1: Per-document word sets of the first class.
        sets_2: Per-document word sets of the second class.
        total_word: The words to score.

    Returns:
        A dict mapping each word to its chi-square score (a float). A word
        whose denominator is zero (it occurs in every document, in no
        document, or one of the classes is empty) cannot discriminate between
        the classes and is scored ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    chi = {}
    p_len = len(sets_1)
    n_len = len(sets_2)
    total_docs = p_len + n_len  # equals A + B + C + D for every word
    for word in total_word:
        A = sum(word in s for s in sets_1)
        B = sum(word in s for s in sets_2)
        C = p_len - A
        D = n_len - B
        denominator = (A + C) * (B + D) * (A + B) * (C + D)
        if denominator == 0:
            # Degenerate contingency table: the statistic is undefined.
            chi[word] = 0.0
            continue
        chi[word] = total_docs * (A * D - C * B) ** 2 / denominator
    return chi


def top_N_chiword(chi: dict, N: int):
    """Return the ``N`` highest-scoring words and their scores.

    Args:
        chi: Mapping from word to chi-square score.
        N: Number of words to keep. Values larger than ``len(chi)`` are
            clamped to ``len(chi)``; values below zero are treated as zero.

    Returns:
        A ``(words, val)`` pair of tuples sorted by descending score. Words
        with equal scores keep the order in which ``chi`` yields them. Both
        tuples are empty when nothing is selected (empty ``chi`` or
        ``N <= 0``), where the plain ``zip(*...)`` unpacking used to raise
        ``ValueError``.
    """
    ranked = sorted(chi.items(), key=lambda item: item[1], reverse=True)
    keep = max(0, min(N, len(ranked)))
    selected = ranked[:keep]
    if not selected:
        return (), ()
    words, val = zip(*selected)
    return words, val


def word_to_idx(words):
    """Map each vocabulary word to its position in ``words``.

    When a word occurs more than once, its last position is kept.
    """
    return {token: position for position, token in enumerate(words)}


def embed_docs(docs: List, vocab: typing.Collection):
    """Embed documents as binary word-presence vectors over ``vocab``.

    Args:
        docs: The documents (strings); each is split on whitespace.
        vocab: The vocabulary words. Any sized collection works (the notebook
            passes a tuple); a word's position in iteration order is its
            column index.

    Returns:
        A float ``numpy`` array of shape ``(len(docs), len(vocab))`` where
        entry ``[i, j]`` is 1 if word ``j`` occurs in document ``i`` and 0
        otherwise. Only presence is recorded, not the number of occurrences.
    """
    # Index the vocabulary once; the dict also answers "is this word in the
    # vocabulary?" in O(1), whereas ``word in vocab`` scans a tuple linearly.
    word2idx = {word: idx for idx, word in enumerate(vocab)}

    docs_embed = np.zeros((len(docs), len(vocab)))
    for row, doc in enumerate(docs):
        for word in doc.split():
            idx = word2idx.get(word)
            if idx is not None:  # ignore out-of-vocabulary words
                docs_embed[row, idx] = 1
    return docs_embed

In [9]:
# Separate the messages by class.
spam_X = split_doc(X, y, "spam")
ham_X = split_doc(X, y, "ham")

# One token set per message, then the union over both classes.
spam_X_sets = list(map(word_in_a_doc, spam_X))
ham_X_sets = list(map(word_in_a_doc, ham_X))
total_word_set = all_word(spam_X_sets + ham_X_sets)

# Score every word by chi-square between the two classes and keep the 200
# highest-scoring words as the vocabulary.
# NOTE(review): the scores use the labels of every message in X, while the
# train/test split happens only later in the notebook, so label information
# from the future test samples reaches this feature selection.
chi_square = word_chisquare(spam_X_sets, ham_X_sets, total_word_set)
top_N_word, _ = top_N_chiword(chi_square, 200)

# Binary presence embedding of every message over the selected words.
chi_embedding = embed_docs(X, top_N_word)

In [10]:
# Display the embedding's shape: (number of messages, number of selected words).
chi_embedding.shape

(1500, 200)

### 3) Tokenizer, 카이제곱 embedding 합치기

In [11]:
# Join the Tokenizer count features and the chi-square presence features
# column-wise (axis=1), giving one combined feature row per message.
embedding_X = np.concatenate((indexing_X, chi_embedding), axis=1)

In [12]:
# Display the combined feature matrix's shape: (messages, total feature columns).
embedding_X.shape

(1500, 500)

# 3. Training & prediction: SVM

### 1) Training

In [13]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [14]:
# train:test = 9:1 split
# random_state=2020 is fixed so that re-running the cell gives the same split.
# NOTE(review): embedding_X was built from the whole corpus before this split
# (Tokenizer vocabulary and label-based chi-square word selection), so the
# held-out score may be optimistic; consider fitting both on X_train only.
X_train, X_test, y_train, y_test = train_test_split(embedding_X, indexing_y, test_size=0.1, random_state=2020)

In [15]:
# Create a support vector classifier with an RBF kernel (no other
# hyperparameter is passed) and fit it on the training portion.
svm = SVC(kernel='rbf')
svm.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

### 2) prediction

In [16]:
# Predict the integer class ids (0 = spam, 1 = ham, per label2index) of the
# held-out samples.
predict = svm.predict(X_test)

### 3) 모델 성능 평가: Accuracy

In [17]:
from sklearn.metrics import accuracy_score

In [18]:
# Compare the predictions with the held-out labels and print the accuracy
# score formatted to two decimals.
print('Accuracy: %.2f' % accuracy_score(y_test, predict))

Accuracy: 0.99
