In [1]:
import os

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
# --- Reproducibility / split settings ---
RANDOM_SEED = 42   # seed passed to the train/eval split
TEST_SPLIT = 0.2   # fraction of the data held out for evaluation

# --- File locations ---
DATA_IN_PATH = './data_in/'            # directory the input CSV files are read from
DATA_OUT_PATH = './data_out/'          # directory for generated output
TRAIN_CLEAN_DATA = 'train_clean.csv'   # training CSV file name

In [3]:
# Read the training CSV and pull the text and label columns out as plain lists.
train_data = pd.read_csv(f'{DATA_IN_PATH}{TRAIN_CLEAN_DATA}')

reviews = train_data['review'].tolist()
sentiments = train_data['sentiment'].tolist()

In [4]:
# Vectorize the reviews into TF-IDF features.
#
# TfidfVectorizer parameters used below:
#   min_df       : tokens whose document frequency is lower than this value are
#                  removed during vectorization.
#   analyzer     : unit of analysis ('word' = word level, 'char' = character level).
#   sublinear_tf : apply sublinear term-frequency scaling, i.e. replace tf with
#                  1 + log(tf). (This is not the smoothing switch; IDF smoothing
#                  is controlled separately by `smooth_idf`.)
#   ngram_range  : (min_n, max_n) range of n-gram sizes to extract.
#   max_features : maximum length of the resulting feature vector.
vectorizer = TfidfVectorizer(
    min_df=0.0,
    analyzer='char',
    sublinear_tf=True,
    ngram_range=(1, 3),
    max_features=5000,
)

# Fit the vocabulary on the training reviews and transform them in one pass.
X = vectorizer.fit_transform(reviews)
Y = np.array(sentiments)

In [5]:
# Show the matrix returned by vectorizer.fit_transform(reviews).
X

<25000x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 17862871 stored elements in Compressed Sparse Row format>

In [6]:
# Show the label array built from the 'sentiment' column.
Y

array([1, 1, 0, ..., 0, 0, 1])

In [7]:
# Feature names of the fitted vectorizer, as returned by get_feature_names_out().
features = vectorizer.get_feature_names_out()

In [8]:
# Hold out TEST_SPLIT of the samples for evaluation; the seed keeps the split repeatable.
X_train, X_eval, y_train, y_eval = train_test_split(
    X,
    Y,
    test_size=TEST_SPLIT,
    random_state=RANDOM_SEED,
)

In [9]:
# class_weight='balanced' is used so that each label is learned in a balanced way.
lgs = LogisticRegression(
    class_weight='balanced',
)
lgs.fit(X_train, y_train)

In [10]:
# Predict labels for the evaluation split with the fitted model.
predicted = lgs.predict(X_eval)

In [11]:
# Report the classifier's score on the held-out evaluation split.
print(
    'Accuracy: %f' % (lgs.score(X_eval, y_eval),)
)

Accuracy: 0.859800


In [14]:
# File name of the test CSV inside DATA_IN_PATH.
TEST_CLEAN_DATA = 'test_clean.csv'

# Load the test set and display it.
test_data = pd.read_csv(f'{DATA_IN_PATH}{TEST_CLEAN_DATA}')
test_data

Unnamed: 0,review,id
0,naturally film main themes mortality nostalgia...,"""12311_10"""
1,movie disaster within disaster film full great...,"""8348_2"""
2,movie kids saw tonight child loved one point k...,"""5828_4"""
3,afraid dark left impression several different ...,"""7186_2"""
4,accurate depiction small time mob life filmed ...,"""12128_7"""
...,...,...
24995,sony pictures classics looking sony got rights...,"""2155_10"""
24996,always felt ms merkerson never gotten role fit...,"""59_10"""
24997,disappointed movie familiar case read mark fuh...,"""2531_1"""
24998,opening sequence filled black white shots remi...,"""7772_8"""


In [15]:
# Make sure the output directory exists. exist_ok=True creates it when missing
# and is a no-op otherwise, avoiding the check-then-create race of
# os.path.exists() followed by os.makedirs().
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# Build a frame holding the 'id' and 'review' columns of the test set.
# NOTE(review): this pairs each id with its review text, not with a model
# prediction — confirm whether a 'sentiment' column is meant to be added here.
answer_dataset = pd.DataFrame({'id': test_data['id'], 'review': test_data['review']})
answer_dataset

Unnamed: 0,id,review
0,"""12311_10""",naturally film main themes mortality nostalgia...
1,"""8348_2""",movie disaster within disaster film full great...
2,"""5828_4""",movie kids saw tonight child loved one point k...
3,"""7186_2""",afraid dark left impression several different ...
4,"""12128_7""",accurate depiction small time mob life filmed ...
...,...,...
24995,"""2155_10""",sony pictures classics looking sony got rights...
24996,"""59_10""",always felt ms merkerson never gotten role fit...
24997,"""2531_1""",disappointed movie familiar case read mark fuh...
24998,"""7772_8""",opening sequence filled black white shots remi...
