- 라이브러리

In [34]:
import pandas as pd
import numpy as np
import re

- 데이터 로드

In [17]:
def load_data(dir):
    """Read a delimited text file into a pandas DataFrame.

    Args:
        dir: Path (or file-like object) handed straight to
            ``pandas.read_table``, which uses its default settings
            (tab separator, first line as header).
            NOTE: the parameter name shadows the builtin ``dir``; it is
            kept unchanged so existing keyword callers keep working.

    Returns:
        The DataFrame produced by ``pandas.read_table``.
    """
    return pd.read_table(dir)

In [18]:
# Load the train and test rating files into DataFrames.
# NOTE(review): these are absolute, machine-specific Windows paths; they
# must be adjusted when the notebook runs on another machine.
train = load_data(r'C:\Project\sw-grad-proj\data\ratings_train.txt')
test = load_data(r'C:\Project\sw-grad-proj\data\ratings_test.txt')

- 데이터 확인

In [6]:
# Preview the first two rows of the training set.
train.head(2)

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1


In [7]:
# Preview the first two rows of the test set.
test.head(2)

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,GDNTOPCLASSINTHECLUB,0


In [10]:
# Fraction of training rows carrying each label value (class balance).
train['label'].value_counts() / len(train)

0    0.501153
1    0.498847
Name: label, dtype: float64

In [11]:
# Fraction of test rows carrying each label value (class balance).
test['label'].value_counts() / len(test)

1    0.50346
0    0.49654
Name: label, dtype: float64

In [9]:
# Print the per-column count of missing values for train and test
# (both sets contain missing values).
print('train null : ')
print(train.isnull().sum(), '\n')
print('test null : ')
print(test.isnull().sum())

train null : 
id          0
document    5
label       0
dtype: int64 

test null : 
id          0
document    3
label       0
dtype: int64


- 데이터 중복 제거

In [19]:
def drop_duplicates(df, colname):
    """Remove rows whose value in ``colname`` repeats an earlier row.

    The frame is modified in place: the first occurrence of each value is
    kept and later duplicates are dropped.

    Args:
        df: DataFrame to deduplicate (mutated by this call).
        colname: Name of the single column used to detect duplicates.

    Returns:
        The same ``df`` object, after deduplication.
    """
    key_columns = [colname]
    # keep='first' is the pandas default, spelled out for clarity.
    df.drop_duplicates(subset=key_columns, keep='first', inplace=True)
    return df

In [20]:
# train: contains duplicate documents.
# Print the shape before and after dropping rows with a repeated 'document'.
print(train.shape)
train = drop_duplicates(train, 'document')
print(train.shape)

(150000, 3)
(146183, 3)


In [21]:
# test: contains duplicate documents.
# Print the shape before and after dropping rows with a repeated 'document'.
print(test.shape)
test = drop_duplicates(test, 'document')
print(test.shape)

(50000, 3)
(49158, 3)


- 결측치 제거

In [23]:
def drop_null(df):
    """Remove every row that has a missing value in any column.

    The frame is modified in place.

    Args:
        df: DataFrame to clean (mutated by this call).

    Returns:
        The same ``df`` object, with incomplete rows removed.
    """
    # axis=0 / how='any' are the pandas defaults, spelled out for clarity.
    df.dropna(axis=0, how='any', inplace=True)
    return df

In [24]:
# train: contains missing values.
# Print the shape before and after dropping rows with any missing value.
print(train.shape)
train = drop_null(train)
print(train.shape)

(146183, 3)
(146182, 3)


In [26]:
# test: contains missing values.
# Print the shape before and after dropping rows with any missing value.
print(test.shape)
test = drop_null(test)
print(test.shape)

(49158, 3)
(49157, 3)


In [27]:
# Re-check the per-column missing-value counts for train.
train.isnull().sum()

id          0
document    0
label       0
dtype: int64

In [28]:
# Re-check the per-column missing-value counts for test.
test.isnull().sum()

id          0
document    0
label       0
dtype: int64

- 텍스트 클렌징

In [35]:
# Matches every run of characters that is neither a space, a Hangul jamo in
# the listed ranges, nor a complete Hangul syllable (가-힣).
_NON_HANGUL = re.compile('[^ ㄱ-ㅣㅏ-ㅣ가-힣]+')


def text_cleansing(text):
    """Strip non-Hangul characters from ``text`` and split it on whitespace.

    Args:
        text: Raw string to clean.

    Returns:
        A list of whitespace-separated tokens containing only Hangul
        characters. The list is empty when nothing survives the cleaning.
    """
    # Matched runs are replaced by the empty string (not a space), so Hangul
    # text separated only by punctuation is joined into a single token.
    # NOTE(review): confirm that joining is intended.
    hangul_only = _NON_HANGUL.sub('', text)
    return hangul_only.split()

In [36]:
# Preview the first two rows of train (same display as at load time).
train.head(2)

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1


In [37]:
# Replace each review string in train with its list of Hangul-only tokens.
# NOTE: not safely re-runnable. After this cell the column holds lists, and
# text_cleansing expects a string.
train['document'] = train['document'].apply(text_cleansing)

In [39]:
# Replace each review string in test with its list of Hangul-only tokens.
# NOTE: not safely re-runnable. After this cell the column holds lists, and
# text_cleansing expects a string.
test['document'] = test['document'].apply(text_cleansing)

In [38]:
# Preview the first two rows of train after cleansing ('document' now holds
# token lists).
train.head(2)

Unnamed: 0,id,document,label
0,9976970,"[아, 더빙, 진짜, 짜증나네요, 목소리]",0
1,3819312,"[흠포스터보고, 초딩영화줄오버연기조차, 가볍지, 않구나]",1


- 불용어 제거

In [40]:
# Single-character Korean tokens treated as stopwords. Built once at import
# time; a frozenset gives constant-time membership checks.
_STOPWORDS = frozenset({
    "도", "는", "다", "의", "가", "이", "은", "한", "에", "하", "고", "을",
    "를", "인", "듯", "과", "와", "네", "들", "지", "임", "게",
})


def del_stopwords(text):
    """Return the tokens of ``text`` that are not stopwords.

    Args:
        text: Iterable of string tokens (e.g. the list produced by
            splitting a cleaned sentence).

    Returns:
        A new list holding, in their original order, the tokens that are
        not in the stopword set. The input is not modified.
    """
    # Only tokens that are *exactly* a stopword are removed; a stopword that
    # is merely part of a longer token is left untouched.
    return [token for token in text if token not in _STOPWORDS]

In [41]:
# Preview the first two rows of train before stopword removal.
train.head(2)

Unnamed: 0,id,document,label
0,9976970,"[아, 더빙, 진짜, 짜증나네요, 목소리]",0
1,3819312,"[흠포스터보고, 초딩영화줄오버연기조차, 가볍지, 않구나]",1


In [42]:
# Drop stopword tokens from every tokenized review in train.
train['document'] = train['document'].apply(del_stopwords)

In [43]:
# Drop stopword tokens from every tokenized review in test.
test['document'] = test['document'].apply(del_stopwords)

In [44]:
# Preview the first two rows of train after stopword removal.
train.head(2)

Unnamed: 0,id,document,label
0,9976970,"[아, 더빙, 진짜, 짜증나네요, 목소리]",0
1,3819312,"[흠포스터보고, 초딩영화줄오버연기조차, 가볍지, 않구나]",1
