- 라이브러리

In [149]:
import pandas as pd
import numpy as np
import re

from konlpy.tag import Mecab
import MeCab
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

- 데이터 로드

In [96]:
def load_data(dir):
    """Read a delimited text file into a DataFrame.

    Args:
        dir: Path of the file to read; handed unchanged to
            ``pd.read_table`` (tab-separated by default). The name shadows
            the builtin ``dir`` but is kept so keyword callers still work.

    Returns:
        The DataFrame returned by ``pd.read_table``.
    """
    return pd.read_table(dir)

In [175]:
# Load the train and test splits.
# NOTE(review): absolute Windows paths — adjust them for the local checkout.
train = load_data(r'C:\Project\sw-grad-proj\data\ratings_train.txt')
test = load_data(r'C:\Project\sw-grad-proj\data\ratings_test.txt')

- 데이터 확인

In [98]:
train.head(2)

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1


In [99]:
test.head(2)

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,GDNTOPCLASSINTHECLUB,0


In [100]:
# Share of each label among all train rows (class balance check)
train['label'].value_counts() / len(train)

0    0.501153
1    0.498847
Name: label, dtype: float64

In [101]:
# Share of each label among all test rows (class balance check)
test['label'].value_counts() / len(test)

1    0.50346
0    0.49654
Name: label, dtype: float64

In [102]:
# train, test: both contain missing values (per-column counts printed below)
print('train null : ')
print(train.isnull().sum(), '\n')
print('test null : ')
print(test.isnull().sum())

train null : 
id          0
document    5
label       0
dtype: int64 

test null : 
id          0
document    3
label       0
dtype: int64


- 데이터 중복 제거

In [103]:
def drop_duplicates(df, colname):
    """Drop rows whose value in ``colname`` duplicates another row.

    The frame is modified in place and the same object is handed back, so
    callers may either ignore the return value or rebind it.

    Args:
        df: DataFrame to de-duplicate (mutated).
        colname: Name of the single column used to detect duplicates.

    Returns:
        ``df`` itself, after the duplicate rows have been removed.
    """
    key_columns = [colname]
    df.drop_duplicates(subset=key_columns, inplace=True)
    return df

In [176]:
# train: contains duplicated documents; shapes are printed before and after
print(train.shape)
train = drop_duplicates(train, 'document')
print(train.shape)

(150000, 3)
(146183, 3)


In [177]:
# test: contains duplicated documents; shapes are printed before and after
print(test.shape)
test = drop_duplicates(test, 'document')
print(test.shape)

(50000, 3)
(49158, 3)


- 결측치 제거

In [106]:
def drop_null(df):
    """Drop every row that has a missing value in any column.

    The frame is modified in place and the same object is returned.

    Args:
        df: DataFrame to clean (mutated).

    Returns:
        ``df`` itself, without the rows that contained missing values.
    """
    # Defaults spelled out: drop rows (axis 0) as soon as any cell is missing
    df.dropna(axis=0, how='any', inplace=True)
    return df

In [178]:
# train: contains missing values; shapes are printed before and after
print(train.shape)
train = drop_null(train)
print(train.shape)

(146183, 3)
(146182, 3)


In [179]:
# test: contains missing values; shapes are printed before and after
print(test.shape)
test = drop_null(test)
print(test.shape)

(49158, 3)
(49157, 3)


In [109]:
train.isnull().sum()

id          0
document    0
label       0
dtype: int64

In [110]:
test.isnull().sum()

id          0
document    0
label       0
dtype: int64

- 텍스트 클렌징

In [111]:
def text_cleansing(text):
    """Remove everything except spaces and Hangul from ``text``.

    Characters kept: the space character, Hangul compatibility jamo
    (``ㄱ``-``ㅣ``) and precomposed Hangul syllables (``가``-``힣``).
    Punctuation, digits, Latin letters and any other character are deleted
    without leaving a replacement behind.

    Args:
        text: String to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    # Every run of disallowed characters is substituted with the empty string
    return re.sub('[^ ㄱ-ㅣㅏ-ㅣ가-힣]+', '', text)

In [112]:
train.head(2)

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1


In [180]:
train['document'] = train['document'].apply(text_cleansing)

In [181]:
test['document'] = test['document'].apply(text_cleansing)

In [115]:
train.head(2)

Unnamed: 0,id,document,label
0,9976970,아 더빙 진짜 짜증나네요 목소리,0
1,3819312,흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나,1


- 토큰화

In [30]:
# MeCab sample: raw MeCab binding. parse() returns text with one line per
# morpheme (surface form, a tab, then comma-separated features), ending in "EOS".
mecab = MeCab.Tagger()
out = mecab.parse('오늘은 무엇을 해볼까?')
print(out)

오늘	NNG,*,T,오늘,*,*,*,*
은	JX,*,T,은,*,*,*,*
무엇	NP,*,T,무엇,*,*,*,*
을	JKO,*,T,을,*,*,*,*
해	VV+EC,*,F,해,Inflect,VV,EC,하/VV/*+아/EC/*
볼까	VX+EF,*,F,볼까,Inflect,VX,EF,보/VX/*+ᆯ까/EF/*
?	SF,*,*,*,*,*,*,*
EOS



In [29]:
# MeCab sample: KoNLPy wrapper. pos() returns a list of (morpheme, POS tag) tuples.
mecab = Mecab(r'C:\mecab\mecab-ko-dic')
out = mecab.pos('오늘은 무엇을 해볼까?')
print(out)

[('오늘', 'NNG'), ('은', 'JX'), ('무엇', 'NP'), ('을', 'JKO'), ('해', 'VV+EC'), ('볼까', 'VX+EF'), ('?', 'SF')]


In [28]:
# MeCab sample: KoNLPy wrapper. morphs() returns a plain list of morpheme strings.
mecab = Mecab(r'C:\mecab\mecab-ko-dic')
out = mecab.morphs('오늘은 무엇을 해볼까?')
print(type(out))
print(out)


<class 'list'>
['오늘', '은', '무엇', '을', '해', '볼까', '?']


In [116]:
# Tokenization
# Taggers are cached per dictionary path: constructing a Mecab instance for
# every row of a DataFrame.apply repeats the same setup work for nothing.
_MECAB_TAGGERS = {}


def text_tokenize(text, dicpath=r'C:\mecab\mecab-ko-dic'):
    """Split ``text`` into morphemes with MeCab (via KoNLPy).

    Args:
        text: String to tokenize.
        dicpath: Directory of the mecab-ko-dic dictionary. Defaults to the
            Windows install location this notebook was written against.

    Returns:
        The list of morpheme strings returned by ``Mecab.morphs``.
    """
    tagger = _MECAB_TAGGERS.get(dicpath)
    if tagger is None:
        # First use of this dictionary: build the tagger once and reuse it
        tagger = Mecab(dicpath)
        _MECAB_TAGGERS[dicpath] = tagger

    return tagger.morphs(text)

In [117]:
train.head(2)

Unnamed: 0,id,document,label
0,9976970,아 더빙 진짜 짜증나네요 목소리,0
1,3819312,흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나,1


In [182]:
# Work on a 10-row sample of each split.
# .copy() detaches the sample from the full frame: later cells assign to
# train['document'] / test['document'], and assigning into a bare slice
# triggers pandas' SettingWithCopyWarning.
train = train[:10].copy()
print(train.shape) # sample result

test = test[:10].copy()
print(test.shape)

(10, 3)
(10, 3)


In [183]:
train['document'] = train['document'].apply(text_tokenize)

In [184]:
test['document'] = test['document'].apply(text_tokenize)

In [123]:
train.head(2)

Unnamed: 0,id,document,label
0,9976970,"[아, 더, 빙, 진짜, 짜증, 나, 네요, 목소리]",0
1,3819312,"[흠, 포스터, 보고, 초딩, 영화, 줄, 오버, 연기, 조차, 가볍, 지, 않, 구나]",1


- 불용어 제거

In [124]:
# Tokens treated as stopwords. Built once at definition time; a frozenset
# gives constant-time membership tests instead of scanning a list per token.
_STOPWORDS = frozenset([
    "도", "는", "다", "의", "가", "이", "은", "한", "에", "하", "고", "을",
    "를", "인", "듯", "과", "와", "네", "들", "지", "임", "게",
])


def del_stopwords(text):
    """Return the tokens of ``text`` that are not stopwords.

    Args:
        text: Iterable of token strings (e.g. the list produced by the
            tokenizer). Order is preserved; the input is not modified.

    Returns:
        A new list holding every token that is not in the stopword set.
    """
    return [token for token in text if token not in _STOPWORDS]

In [125]:
train.head(2)

Unnamed: 0,id,document,label
0,9976970,"[아, 더, 빙, 진짜, 짜증, 나, 네요, 목소리]",0
1,3819312,"[흠, 포스터, 보고, 초딩, 영화, 줄, 오버, 연기, 조차, 가볍, 지, 않, 구나]",1


In [185]:
train['document'] = train['document'].apply(del_stopwords)

In [186]:
test['document'] = test['document'].apply(del_stopwords)

In [127]:
train.head(2)

Unnamed: 0,id,document,label
0,9976970,"[아, 더, 빙, 진짜, 짜증, 나, 네요, 목소리]",0
1,3819312,"[흠, 포스터, 보고, 초딩, 영화, 줄, 오버, 연기, 조차, 가볍, 않, 구나]",1


- 벡터화 (tf-idf)

In [187]:
def encoder_tf(df, colname, path=r'C:\Project\sw-grad-proj\result\tfvec.pkl'):
    """Fit a TF-IDF vectorizer on a tokenized column and pickle it.

    Args:
        df: DataFrame holding the documents to fit on. ``df[colname]`` is
            overwritten in place with the space-joined text of each row.
        colname: Column whose rows are token lists (rows that are already
            strings are accepted and left unchanged).
        path: File the fitted vectorizer is pickled to. Defaults to the
            location this notebook originally used.

    Returns:
        The sparse TF-IDF matrix returned by ``fit_transform`` for the rows
        of ``df[colname]``.
    """
    # Join token lists into one string per row. Rows that are already
    # strings are kept as-is: ' '.join() on a str would insert a space
    # between every character, corrupting the text when the cell is re-run.
    df[colname] = df[colname].apply(
        lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
    )

    # NOTE(review): TfidfVectorizer's default token_pattern only keeps tokens
    # of two or more word characters, so single-character morphemes never
    # reach the vocabulary — confirm that is intended.
    tfvec = TfidfVectorizer()
    out = tfvec.fit_transform(df[colname])

    # Persist the fitted encoder so other data can be transformed with it
    with open(path, 'wb') as f:
        pickle.dump(tfvec, f)

    return out  # X_tr encoding result

In [198]:
def encoding_tf(df, colname, path=r'C:\Project\sw-grad-proj\result\tfvec.pkl'):
    """Transform a tokenized column with a previously pickled vectorizer.

    Args:
        df: DataFrame holding the documents to encode. ``df[colname]`` is
            overwritten in place with the space-joined text of each row.
        colname: Column whose rows are token lists (rows that are already
            strings are accepted and left unchanged).
        path: File the fitted vectorizer is unpickled from. Defaults to the
            location this notebook originally used.

    Returns:
        The sparse matrix returned by the loaded vectorizer's
        ``transform`` for the rows of ``df[colname]``.
    """
    # Join token lists into one string per row. Rows that are already
    # strings are kept as-is: ' '.join() on a str would insert a space
    # between every character, corrupting the text when the cell is re-run.
    df[colname] = df[colname].apply(
        lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
    )

    # SECURITY: unpickling executes code stored in the file; only load a
    # pickle that this project wrote itself, never one from an untrusted source.
    with open(path, 'rb') as f:
        tfvec = pickle.load(f)

    out = tfvec.transform(df[colname])

    return out  # X_te encoding result

In [189]:
# Fit the encoder on the train data and keep the encoded train matrix
# (previously the return value was discarded, so X_tr was never defined).
X_tr = encoder_tf(train, 'document')
X_tr

<10x73 sparse matrix of type '<class 'numpy.float64'>'
	with 81 stored elements in Compressed Sparse Row format>

In [201]:
# Encode the test data with the encoder that was pickled by encoder_tf
X_te = encoding_tf(test, 'document')

In [200]:
train.head(2)

Unnamed: 0,id,document,label
0,9976970,아 더 빙 진짜 짜증 나 네요 목소리,0
1,3819312,흠 포스터 보고 초딩 영화 줄 오버 연기 조차 가볍 않 구나,1
