# 텍스트 데이터 - 한글

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the train and test splits; both files are tab-separated.
ratings_train, ratings_test = (
    pd.read_csv(path, sep='\t')
    for path in ('data-files/ratings_train.txt', 'data-files/ratings_test.txt')
)

In [3]:
# Sanity-check both splits: shape, mean of the 'label' column, and label counts.
raw_splits = (ratings_train, ratings_test)
ratings_train.head(2), ratings_test.head(2)  # not the cell's last expression, so nothing is displayed
print(*(split.shape for split in raw_splits))
print(*(split['label'].mean() for split in raw_splits))
print(*(np.unique(split['label'], return_counts=True) for split in raw_splits))



(150000, 3) (50000, 3)
0.49884666666666666 0.50346
(array([0, 1]), array([75173, 74827])) (array([0, 1]), array([24827, 25173]))


In [4]:
# Second look at the data: the non-null counts printed by info() reveal
# missing values in the 'document' column.
for split in (ratings_train, ratings_test):
    split.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        150000 non-null  int64 
 1   document  149995 non-null  object
 2   label     150000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 3.4+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        50000 non-null  int64 
 1   document  49997 non-null  object
 2   label     50000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.1+ MB


In [5]:
# Handle missing values: keep only the rows whose 'document' is present.
# .copy() makes each result an independent frame rather than a selection
# of the source frame.
ratings_na_removed_train = ratings_train.loc[ratings_train['document'].notna()].copy()
ratings_na_removed_test = ratings_test.loc[ratings_test['document'].notna()].copy()

In [6]:
# Preprocessing step 1: keep only Hangul and Latin letters

# ratings_na_removed_train.tail()
import re

# Matches every character that is NOT an ASCII letter, a Hangul compatibility
# jamo (ㄱ-ㅎ consonants, ㅏ-ㅣ vowels) or a precomposed Hangul syllable (가-힣).
# A single 'ㄱ-힣' range is not used because it runs from U+3131 to U+D7A3 and
# therefore also lets through CJK ideographs (Hanja) and other non-Hangul
# characters that sit between the jamo and the syllable blocks.
_NON_LETTER_RE = re.compile(r"[^A-Za-zㄱ-ㅎㅏ-ㅣ가-힣]")


def _keep_hangul_and_latin(text):
    """Return *text* with every non-Hangul, non-Latin character replaced by a space."""
    return _NON_LETTER_RE.sub(' ', text)


ratings_na_removed_train['document'] = \
    ratings_na_removed_train['document'].map(_keep_hangul_and_latin)
ratings_na_removed_test['document'] = \
    ratings_na_removed_test['document'].map(_keep_hangul_and_latin)

In [7]:
# 한국어 처리 패키지 설치
# !pip install konlpy

In [None]:
# Morphological analysis: sentence -> list of morphemes

from konlpy.tag import Okt  # Korean analyzer used to split text into morphemes

okt = Okt()
# Use a positional lookup for the preview: rows with a missing document were
# filtered out without resetting the index, so index label 0 is not guaranteed
# to exist and ['document'][0] could raise KeyError.
print( okt.morphs(ratings_na_removed_train['document'].iloc[0]) )

# Replace each document string with its list of morphemes.
ratings_na_removed_train['document'] = ratings_na_removed_train['document'].map(okt.morphs)
ratings_na_removed_test['document'] = ratings_na_removed_test['document'].map(okt.morphs)


['아', '더빙', '진짜', '짜증나네요', '목소리']


In [None]:
# Persist the tokenized frames. The 'document' values are lists in memory but
# are written to CSV as strings, so they have to be converted back to lists
# after the files are read again.
for frame, path in (
    (ratings_na_removed_train, 'data-files/ratings_train2.csv'),
    (ratings_na_removed_test, 'data-files/ratings_test2.csv'),
):
    frame.to_csv(path, index=False)

In [2]:
# Read the tokenized data back in
ratings_train2, ratings_test2 = map(
    pd.read_csv,
    ('data-files/ratings_train2.csv', 'data-files/ratings_test2.csv'),
)

In [3]:
# Problem check: the 'document' column was stored as a string and needs to be
# converted back to a list.
reloaded_splits = (ratings_train2, ratings_test2)
tuple(split.head(1) for split in reloaded_splits)  # value discarded: not the cell's last expression
# info() prints its report and returns None, so the cell's displayed value is
# a tuple of two Nones.
tuple(split.info() for split in reloaded_splits)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149995 entries, 0 to 149994
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        149995 non-null  int64 
 1   document  149995 non-null  object
 2   label     149995 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 3.4+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49997 entries, 0 to 49996
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        49997 non-null  int64 
 1   document  49997 non-null  object
 2   label     49997 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.1+ MB


(None, None)

In [4]:
# Convert the 'document' column: string -> list
import ast

print("['123', '456', '789']"[1])   # indexing the string yields a single character
print(['123', '456', '789'][1])     # indexing the list yields an element
print(ast.literal_eval("['123', '456', '789']")[1])   # literal_eval: string -> Python literal

# ast.literal_eval only accepts Python literals (lists, strings, numbers, ...),
# whereas eval would execute any expression that happens to be in the CSV file.
# For the list-of-strings text stored here both produce the same list.
ratings_train2['document'] = ratings_train2['document'].map(ast.literal_eval)
ratings_test2['document'] = ratings_test2['document'].map(ast.literal_eval)


'
456
456


In [5]:
# Verify the conversion: type of the first 'document' entry in each split
tuple(type(split['document'][0]) for split in (ratings_train2, ratings_test2))

(list, list)

In [6]:
# Token list -> single space-separated string, so that the tokens can be counted
ratings_train2['document'] = ratings_train2['document'].map(' '.join)
ratings_test2['document'] = ratings_test2['document'].map(' '.join)

In [7]:
# Save the preprocessing result to files
for frame, path in (
    (ratings_train2, "data-files/processed_ratings_train.csv"),
    (ratings_test2, "data-files/processed_ratings_test.csv"),
):
    frame.to_csv(path, index=False)

In [8]:
# Encoding: text -> numbers
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

train_docs = ratings_train2['document']
test_docs = ratings_test2['document']

# CountVectorizer works on the raw text; it is fitted on the training
# documents and then applied to the test documents.
cv = CountVectorizer()
cv_train_counts = cv.fit_transform(train_docs)
cv_test_counts = cv.transform(test_docs)

# TfidfVectorizer also works on the raw text.
tv = TfidfVectorizer()
tv_train_counts = tv.fit_transform(train_docs)
tv_test_counts = tv.transform(test_docs)

# TfidfTransformer takes the matrices already produced by the CountVectorizer.
tt = TfidfTransformer()
tt_train_counts = tt.fit_transform(cv_train_counts)
tt_test_counts = tt.transform(cv_test_counts)

In [9]:
# print( cv_train_counts )
# (149995, 98323) > 149995행, 98323 단어
# cv_train_counts.toarray()   # 메모리 이유로 불가능 (110. GiB 필요)
# print( cv.vocabulary_[:10] )

In [17]:
# Drop any rows that contain missing values, then print a summary of each
# result. info() returns None, so the cell's displayed value is a tuple of
# two Nones.
ratings_train3, ratings_test3 = ratings_train2.dropna(), ratings_test2.dropna()
tuple(split.info() for split in (ratings_train3, ratings_test3))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149995 entries, 0 to 149994
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        149995 non-null  int64 
 1   document  149995 non-null  object
 2   label     149995 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 3.4+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49997 entries, 0 to 49996
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        49997 non-null  int64 
 1   document  49997 non-null  object
 2   label     49997 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.1+ MB


(None, None)

In [22]:
# Inspect the vocabulary learned by the CountVectorizer: its size, its type,
# and the first ten terms in the mapping's iteration order.
vocab = cv.vocabulary_
print(len(vocab), type(vocab))
type(vocab.items())  # value discarded: not the cell's last expression
print(list(vocab)[:10])

98323 <class 'dict'>
['더빙', '진짜', '짜증나네요', '목소리', '포스터', '보고', '초딩', '영화', '오버', '연기']


In [30]:
# Train and evaluate the model
from sklearn.linear_model import LogisticRegression

# Take the labels from the same frames the TF-IDF features were built from
# (ratings_train2 / ratings_test2). Using a separately filtered copy would only
# stay row-aligned with the feature matrices if that filter removed nothing.
train_labels = ratings_train2['label']
test_labels = ratings_test2['label']

logreg = LogisticRegression()
logreg.fit(tv_train_counts, train_labels)

# score() reports mean accuracy: first on the training data, then on the test data.
print( logreg.score(tv_train_counts, train_labels) )
print( logreg.score(tv_test_counts, test_labels) )

0.8856695223174106
0.8390303418205093
