In [1]:
import pandas as pd
import numpy as np
import csv
import re

In [2]:
# KoNLPy tagger classes (Mecab is imported from the eunjeon package)
from konlpy.tag import Hannanum
from konlpy.tag import Okt
from konlpy.tag import Komoran
from konlpy.tag import Kkma
from eunjeon import Mecab

from konlpy.utils import pprint

In [3]:
# Helpers that strip special characters from text
# Characters deleted by clean_signal. They are kept as a plain string and escaped
# with re.escape, so the source contains no invalid "\X" string escapes (which
# newer Python versions warn about) and no hand-written regex escaping.
_SIGNAL_CHARS = (
    "｝｛●▣①◈〔〈「〉」『♣【】★▶》《☎■※"   # full-width / decorative symbols
    "\n\t\r"                             # newline, tab, carriage return
    "{}[]/?.,;:|)*~`!^-_+<>@#$%&=('\""   # ASCII punctuation
    "♥♡ㅋㅠㅜㄱㅎㄲㅡ"                      # hearts and stand-alone jamo
)
_SIGNAL_PATTERN = re.compile('[' + re.escape(_SIGNAL_CHARS) + ']')


def clean_signal(text):
    """Delete punctuation, decorative symbols and stand-alone jamo from text.

    Args:
        text: any object; it is converted with ``str()`` before cleaning.

    Returns:
        str: ``str(text)`` with every character listed in ``_SIGNAL_CHARS``
        removed. Spaces, letters and digits are left untouched. Removed
        characters are dropped, not replaced by a space.
    """
    return _SIGNAL_PATTERN.sub('', str(text))

# Symbols deleted by clean_en. They are kept as a plain string and escaped with
# re.escape, so the source contains no invalid "\X" string escapes.
_EN_SYMBOL_CHARS = (
    "』］［〕◀▼♧☏\u3000…"                 # extra brackets/symbols, ideographic space
    "｝｛●▣①◈〔〈「〉」『♣【】★▶》《☎■※"   # full-width / decorative symbols
    "\n\t\r"                             # newline, tab, carriage return
    "{}[]/?.,;:|)*~`!^-_+<>@#$%&=('\""   # ASCII punctuation
    "♥♡ㅋㅠㅜㄱㅎㄲㅡ"                      # hearts and stand-alone jamo
)
# One character class covers the symbols plus ASCII letters and digits; deleting
# single characters is order-independent, so one pass equals the former two.
_EN_PATTERN = re.compile('[' + re.escape(_EN_SYMBOL_CHARS) + 'a-zA-Z0-9]')


def clean_en(text):
    """Delete symbols, ASCII letters and ASCII digits from text.

    Args:
        text: any object; it is converted with ``str()`` before cleaning.

    Returns:
        str: ``str(text)`` with every character in ``_EN_SYMBOL_CHARS`` and
        every character in ``a-z``, ``A-Z``, ``0-9`` removed. Spaces are kept.
    """
    return _EN_PATTERN.sub('', str(text))

In [4]:
def morpheme_hannanum(result_Ser):
    """Tag every document with Hannanum and keep selected morphemes.

    Each entry of ``result_Ser.values`` is cleaned with ``clean_signal`` and
    then ``clean_en`` and tagged by the module-level ``hannanum`` analyzer.
    A morpheme is kept when its tag is ``'N'``, or when its tag is ``'P'`` and
    it is longer than one character.

    Args:
        result_Ser: object exposing ``.values`` with one document per entry
            (the notebook passes a pandas Series).

    Returns:
        list[list[str]]: the kept morphemes of each document, in input order.
    """
    han_list = []
    for raw in result_Ser.values:
        cleaned = clean_en(clean_signal(raw))
        han_list.append([
            word
            for word, tag in hannanum.pos(cleaned)
            if tag == 'N' or (tag == 'P' and len(word) > 1)
        ])
    return han_list

In [5]:
def morpheme_okt(result_Ser):
    """Tag every document with Okt and keep nouns plus multi-character verbs.

    Each entry of ``result_Ser.values`` is cleaned with ``clean_signal`` and
    then ``clean_en`` and tagged by the module-level ``okt`` analyzer with
    ``norm=True``. A morpheme is kept when its tag is ``'Noun'``, or when its
    tag is ``'Verb'`` and it is longer than one character.

    Args:
        result_Ser: object exposing ``.values`` with one document per entry
            (the notebook passes a pandas Series).

    Returns:
        list[list[str]]: the kept morphemes of each document, in input order.
    """
    okt_list = []
    for raw in result_Ser.values:
        cleaned = clean_en(clean_signal(raw))
        okt_list.append([
            word
            for word, tag in okt.pos(cleaned, norm=True)
            if tag == 'Noun' or (tag == 'Verb' and len(word) > 1)
        ])
    return okt_list

In [6]:
def morpheme_kkma(result_Ser):
    """Tag every document with Kkma and build two morpheme selections.

    Each entry of ``result_Ser.values`` is cleaned with ``clean_signal`` and
    then ``clean_en`` and tagged by the module-level ``kkma`` analyzer.

    * base selection: tag ``'NNG'``, plus tag ``'VV'`` when the morpheme is
      longer than one character;
    * "plus" selection: everything in the base selection and additionally
      tags ``'NNP'`` and ``'NP'``.

    Args:
        result_Ser: object exposing ``.values`` with one document per entry
            (the notebook passes a pandas Series).

    Returns:
        tuple[list[list[str]], list[list[str]]]: ``(base, plus)`` selections,
        one inner list per document, in input order.
    """
    kkma_list = []
    kkma_plus_list = []

    for raw in result_Ser.values:
        cleaned = clean_en(clean_signal(raw))
        base_words = []
        plus_words = []

        for word, tag in kkma.pos(cleaned):
            if tag == 'VV' and len(word) <= 1:
                # single-character verbs are dropped from both selections
                continue
            if tag in ('NNG', 'VV'):
                base_words.append(word)
                plus_words.append(word)
            elif tag in ('NNP', 'NP'):
                plus_words.append(word)

        kkma_plus_list.append(plus_words)
        kkma_list.append(base_words)

    return kkma_list, kkma_plus_list

In [7]:
def morpheme_komoran(result_Ser):
    """Tag every document with Komoran and build two morpheme selections.

    Each entry of ``result_Ser.values`` is cleaned with ``clean_signal`` and
    then ``clean_en`` and tagged by the module-level ``komoran`` analyzer.

    * base selection: tag ``'NNG'``, plus tag ``'VV'`` when the morpheme is
      longer than one character;
    * "plus" selection: everything in the base selection and additionally
      tags ``'NNP'`` and ``'NP'``.

    Args:
        result_Ser: object exposing ``.values`` with one document per entry
            (the notebook passes a pandas Series).

    Returns:
        tuple[list[list[str]], list[list[str]]]: ``(base, plus)`` selections,
        one inner list per document, in input order.
    """
    komoran_list = []
    komoran_plus_list = []

    for raw in result_Ser.values:
        text = clean_en(clean_signal(raw))
        # Fix: this used to call kkma.pos, so the "komoran" outputs were in
        # fact a second copy of the Kkma results.
        pos = komoran.pos(text)

        sub_list = []
        sub_plus_list = []
        for word, tag in pos:
            if tag == 'NNG':
                sub_plus_list.append(word)
                sub_list.append(word)
            elif tag in ('NNP', 'NP'):
                sub_plus_list.append(word)
            elif tag == 'VV' and len(word) > 1:
                sub_list.append(word)
                sub_plus_list.append(word)

        komoran_list.append(sub_list)
        komoran_plus_list.append(sub_plus_list)

    return komoran_list, komoran_plus_list

In [8]:
def morpheme_mecab(result_Ser):
    """Tag every document with Mecab and build two morpheme selections.

    Each entry of ``result_Ser.values`` is cleaned with ``clean_signal`` and
    then ``clean_en`` and tagged by the module-level ``mecab`` analyzer.

    * base selection: tag ``'NNG'``, plus tag ``'VV'`` when the morpheme is
      longer than one character;
    * "plus" selection: everything in the base selection and additionally
      tags ``'NNP'`` and ``'NP'``.

    Args:
        result_Ser: object exposing ``.values`` with one document per entry
            (the notebook passes a pandas Series).

    Returns:
        tuple[list[list[str]], list[list[str]]]: ``(base, plus)`` selections,
        one inner list per document, in input order.
    """
    mecab_list = []
    mecab_plus_list = []

    for raw in result_Ser.values:
        cleaned = clean_en(clean_signal(raw))
        base_words = []
        plus_words = []

        for word, tag in mecab.pos(cleaned):
            if tag == 'VV' and len(word) <= 1:
                # single-character verbs are dropped from both selections
                continue
            if tag in ('NNG', 'VV'):
                plus_words.append(word)
                base_words.append(word)
            elif tag in ('NNP', 'NP'):
                plus_words.append(word)

        mecab_list.append(base_words)
        mecab_plus_list.append(plus_words)

    return mecab_list, mecab_plus_list

In [9]:
def by_okt(sentences):
    """Re-tag already extracted tokens with Okt and filter them again.

    Every token of every inner list is passed on its own to the module-level
    ``okt`` analyzer (``norm=True``). From the resulting morphemes, those
    tagged ``'Noun'`` are kept, as are those tagged ``'Verb'`` that are longer
    than one character.

    Args:
        sentences: iterable of token lists, one list per document.

    Returns:
        list[list[str]]: one flat list of kept morphemes per document, in
        input order.
    """
    byokt = []
    for sentence in sentences:
        kept = []
        for token in sentence:
            kept.extend(
                word
                for word, tag in okt.pos(token, norm=True)
                if tag == 'Noun' or (tag == 'Verb' and len(word) > 1)
            )
        byokt.append(kept)
    return byokt

In [10]:
def create_docs(document):
    """Build a document-term count matrix from tokenised documents.

    The vocabulary is the sorted set of unique tokens across all documents
    (as returned by ``np.unique``); column ``k`` counts occurrences of the
    ``k``-th vocabulary entry. Only the matrix is returned, not the vocabulary.

    Args:
        document: iterable of token sequences, one per document.

    Returns:
        numpy.ndarray: float array of shape ``(n_documents, n_unique_tokens)``
        where entry ``[i, k]`` is how often token ``k`` occurs in document ``i``.
    """
    rows = list(document)
    vocab = np.unique([token for row in rows for token in row])
    # Map each token to its column once, instead of scanning the vocabulary
    # list (membership test + .index) for every single token occurrence.
    column_of = {token: col for col, token in enumerate(vocab.tolist())}

    docs_zero = np.zeros([len(rows), len(vocab)])
    for i, row in enumerate(rows):
        for token in row:
            col = column_of.get(token)
            if col is not None:
                docs_zero[i][col] += 1
    return docs_zero

In [11]:
def register_label(document, label_info):
    """Return ``document`` with ``label_info`` attached as trailing column(s).

    Args:
        document: 2-D array-like of features, one row per document.
        label_info: 2-D array-like with the same number of rows.

    Returns:
        numpy.ndarray: a new array joining both inputs along axis 1; the
        inputs are not modified.
    """
    labelled = np.concatenate((document, label_info), axis=1)
    return labelled

In [12]:
# Both sheets of the workbook supply the rows that get label 1 below.
hand_data = pd.read_excel('smishing_dataset_SY_.xlsx', sheet_name=0)
police_data = pd.read_excel('smishing_dataset_SY_.xlsx', sheet_name=1)

# The text is taken by position: column 4 of sheet 0, column 3 of sheet 1.
# NOTE(review): sheet 0 is read from row index 1 onward, so its first data row
# is left out, while sheet 1 is read in full. Confirm this is intentional.
tr_hand = [row[4] for row in hand_data.values[1:]]
tr_police = [row[3] for row in police_data.values]

tr_data = np.hstack([tr_hand, tr_police])
tr_ind = np.ones(tr_data.size)  # label 1 for every workbook row
tr_Ser = pd.Series(tr_data)

In [13]:
# One document per line of a plain-text file; these rows get label 0 below.
# NOTE(review): recent pandas releases are reported to reject sep='\n' with a
# ValueError. Confirm against the pandas version this notebook is pinned to.
# NOTE(review): read_csv runs with its default header handling, so the first
# line of the file is presumably used as the column name rather than as a
# document. Confirm this is intended.
# Alternate input file:
# test_data = pd.read_csv('food_2021_12_28_file.txt', sep = '\n')
test_data = pd.read_csv('fashion_2022_1_4_file.txt', sep='\n')
test_list = [row[0] for row in test_data.values]

test_ind = np.zeros(len(test_list))  # label 0 for every text-file row

test_Ser = pd.Series(test_list)

In [14]:
# Stack the labels (ones, then zeros) into an (n, 1) column vector and put the
# two text Series in the same order. The Series index is not reset, but the
# morpheme_* functions only read ``.values``.
result_label = np.concatenate((tr_ind, test_ind)).reshape(-1, 1)
result_Ser = pd.concat([tr_Ser, test_Ser])

In [15]:
# KoNLPy starts its JVM when the first Java-backed tagger is constructed and
# does not restart it afterwards, so the heap limit has to be given to that
# first constructor. Previously it was only passed to Komoran, which is
# constructed third, where it had no effect.
JVM_MAX_HEAP_MB = 1024 * 6

hannanum = Hannanum(max_heap_size=JVM_MAX_HEAP_MB)
okt = Okt()
komoran = Komoran(max_heap_size=JVM_MAX_HEAP_MB)
kkma = Kkma()
mecab = Mecab()

In [16]:
# Basic morpheme analysis: run every analyzer over the combined corpus.
# hannanum/okt return one token list per document; kkma/komoran/mecab return a
# (base, plus) pair of such lists.
han_sentences = morpheme_hannanum(result_Ser)
okt_sentences = morpheme_okt(result_Ser)
kkma_sentences, kkma_plus_sentences = morpheme_kkma(result_Ser)
komoran_sentences, komoran_plus_sentences = morpheme_komoran(result_Ser)
mecab_sentences, mecab_plus_sentences = morpheme_mecab(result_Ser)

In [17]:
# Second pass: re-tag each analyzer's token lists with Okt (see by_okt).
# Targets and inputs are listed in the same order, one pair per position.
(
    hannanum_org_byokt,
    okt_org_byokt,
    kkma_org_byokt,
    kkma_plus_byokt,
    komoran_org_byokt,
    komoran_plus_byokt,
    mecab_org_byokt,
    mecab_plus_byokt,
) = (
    by_okt(token_lists)
    for token_lists in (
        han_sentences,
        okt_sentences,
        kkma_sentences,
        kkma_plus_sentences,
        komoran_sentences,
        komoran_plus_sentences,
        mecab_sentences,
        mecab_plus_sentences,
    )
)

In [18]:
# Build one document-term count matrix per token set (see create_docs).
# NOTE: every *_byokt name enters this cell holding token lists (from by_okt)
# and leaves it holding a count matrix. Re-running this cell without first
# re-running the by_okt cell therefore feeds matrices back into create_docs.
hannanum_org = create_docs(han_sentences)
hannanum_org_byokt = create_docs(hannanum_org_byokt)
okt_org = create_docs(okt_sentences)
okt_org_byokt = create_docs(okt_org_byokt)
kkma_org = create_docs(kkma_sentences)
kkma_org_byokt = create_docs(kkma_org_byokt)
kkma_plus = create_docs(kkma_plus_sentences)
kkma_plus_byokt = create_docs(kkma_plus_byokt)
komoran_org = create_docs(komoran_sentences)
komoran_org_byokt = create_docs(komoran_org_byokt)
komoran_plus = create_docs(komoran_plus_sentences)
komoran_plus_byokt = create_docs(komoran_plus_byokt)
mecab_org = create_docs(mecab_sentences)
mecab_org_byokt = create_docs(mecab_org_byokt)
mecab_plus = create_docs(mecab_plus_sentences)
mecab_plus_byokt = create_docs(mecab_plus_byokt)

In [19]:
# Add the class label: register_label appends result_label to each matrix
# along axis 1, so the label ends up in the last column.
# NOTE: each name is overwritten with its own labelled version, so running this
# cell twice appends a second label column. Re-run the matrix cell first.

hannanum_org = register_label(hannanum_org, result_label)
hannanum_org_byokt = register_label(hannanum_org_byokt, result_label)
okt_org = register_label(okt_org, result_label)
okt_org_byokt = register_label(okt_org_byokt, result_label)
kkma_org = register_label(kkma_org, result_label)
kkma_org_byokt = register_label(kkma_org_byokt, result_label)
kkma_plus = register_label(kkma_plus, result_label)
kkma_plus_byokt = register_label(kkma_plus_byokt, result_label)
komoran_org = register_label(komoran_org, result_label)
komoran_org_byokt = register_label(komoran_org_byokt, result_label)
komoran_plus = register_label(komoran_plus, result_label)
komoran_plus_byokt = register_label(komoran_plus_byokt, result_label)
mecab_org = register_label(mecab_org, result_label)
mecab_org_byokt = register_label(mecab_org_byokt, result_label)
mecab_plus = register_label(mecab_plus, result_label)
mecab_plus_byokt = register_label(mecab_plus_byokt, result_label)

In [23]:
# Wrap every labelled matrix in a DataFrame (default integer row/column labels).
# Targets and inputs are listed in the same order, one pair per position.
(
    hannanum_org_df,
    hannanum_org_byokt_df,
    okt_org_df,
    okt_org_byokt_df,
    kkma_org_df,
    kkma_org_byokt_df,
    kkma_plus_df,
    kkma_plus_byokt_df,
    komoran_org_df,
    komoran_org_byokt_df,
    komoran_plus_df,
    komoran_plus_byokt_df,
    mecab_org_df,
    mecab_org_byokt_df,
    mecab_plus_df,
    mecab_plus_byokt_df,
) = map(
    pd.DataFrame,
    (
        hannanum_org,
        hannanum_org_byokt,
        okt_org,
        okt_org_byokt,
        kkma_org,
        kkma_org_byokt,
        kkma_plus,
        kkma_plus_byokt,
        komoran_org,
        komoran_org_byokt,
        komoran_plus,
        komoran_plus_byokt,
        mecab_org,
        mecab_org_byokt,
        mecab_plus,
        mecab_plus_byokt,
    ),
)

In [29]:
# Write every labelled matrix to its own CSV in the working directory, with
# header row, index column, and a UTF-8 byte-order mark ('utf-8-sig').
csv_exports = [
    ("hannanum_org_SY.csv", hannanum_org_df),
    ("hannanum_org_byokt_SY.csv", hannanum_org_byokt_df),
    ("okt_org_SY.csv", okt_org_df),
    ("okt_org_byokt_SY.csv", okt_org_byokt_df),
    ("kkma_org_SY.csv", kkma_org_df),
    ("kkma_org_byokt_SY.csv", kkma_org_byokt_df),
    ("kkma_plus_SY.csv", kkma_plus_df),
    ("kkma_plus_byokt_SY.csv", kkma_plus_byokt_df),
    ("komoran_org_SY.csv", komoran_org_df),
    ("komoran_org_byokt_SY.csv", komoran_org_byokt_df),
    ("komoran_plus_SY.csv", komoran_plus_df),
    ("komoran_plus_byokt_SY.csv", komoran_plus_byokt_df),
    ("mecab_org_SY.csv", mecab_org_df),
    ("mecab_org_byokt_SY.csv", mecab_org_byokt_df),
    ("mecab_plus_SY.csv", mecab_plus_df),
    ("mecab_plus_byokt_SY.csv", mecab_plus_byokt_df),
]
for out_path, frame in csv_exports:
    frame.to_csv(out_path, header=True, index=True, encoding='utf-8-sig')