In [1]:
import os
import re

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
DATA_IN_PATH = './data_in/'
TRAIN_CLEAN_DATA = 'train_clean.csv'

RANDOM_SEED = 42
TEST_SPLIT = 0.2

In [3]:
train_data = pd.read_csv(DATA_IN_PATH + TRAIN_CLEAN_DATA)

In [4]:
reviews = list(train_data['review'])
sentiments = list(train_data['sentiment'])

In [5]:
sentences = []
for review in reviews:
    sentences.append(review.split())

In [7]:
num_features = 300    # 단어 임베딩 차원수
min_word_count = 40   # 단어에 대한 최소 빈도 수
num_workers = 4       # 프로세스 개수
context = 10          # word2vec을 수행하기 위한 context window 크기
downsampling = 1e-3   # 빠른 학습을 위해, 정답 단어 라벨에 대한 다운샘플링 비율

In [8]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
   level=logging.INFO)

## word2vec

In [10]:
import gensim

In [20]:
from gensim.models import word2vec

model = word2vec.Word2Vec(sentences, workers=num_workers, \
           vector_size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

2022-03-13 19:18:41,239 : INFO : collecting all words and their counts
2022-03-13 19:18:41,240 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-03-13 19:18:41,699 : INFO : PROGRESS: at sentence #10000, processed 1205223 words, keeping 51374 word types
2022-03-13 19:18:42,242 : INFO : PROGRESS: at sentence #20000, processed 2396605 words, keeping 67660 word types
2022-03-13 19:18:42,487 : INFO : collected 74065 word types from a corpus of 2988089 raw words and 25000 sentences
2022-03-13 19:18:42,488 : INFO : Creating a fresh vocabulary
2022-03-13 19:18:42,588 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=40 retains 8160 unique words (11.017349625329103%% of original 74065, drops 65905)', 'datetime': '2022-03-13T19:18:42.587436', 'gensim': '4.1.2', 'python': '3.8.12 (default, Oct 12 2021, 03:01:40) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19043-SP0', 'event': 'prepare_vocab'}
2022-03-13 19:18:42,589 : INFO : Word2Vec lifec

In [24]:
# 모델의 이름에 hyperparameter 이름을 넣어주면 나중에 참고하기 좋다.
# 모델을 저장하면, 나중에 Wor2Vec.load()을 통해 모델을 다시 사용할 수 있다.
model_name = "300features_40minwords_10context"
model.save(model_name)

2022-03-13 19:21:19,316 : INFO : Word2Vec lifecycle event {'fname_or_handle': '300features_40minwords_10context', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-03-13T19:21:19.316071', 'gensim': '4.1.2', 'python': '3.8.12 (default, Oct 12 2021, 03:01:40) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19043-SP0', 'event': 'saving'}
2022-03-13 19:21:19,318 : INFO : not storing attribute cum_table
2022-03-13 19:21:19,353 : INFO : saved 300features_40minwords_10context


In [28]:
def get_features(words, model, num_features):
    feature_vector = np.zeros((num_features),dtype=np.float32)

    num_words = 0
    index_to_key_set = set(model.wv.index_to_key)

    for w in words:
        if w in index_to_key_set:
            num_words += 1
            feature_vector = np.add(feature_vector, model.wv[w])

    feature_vector = np.divide(feature_vector, num_words)
    return feature_vector

In [29]:
def get_dataset(reviews, model, num_features):
    dataset = list()

    for s in reviews:
        dataset.append(get_features(s, model, num_features))

    reviewFeatureVecs = np.stack(dataset)
    
    return reviewFeatureVecs

In [30]:
test_data_vecs = get_dataset(sentences, model, num_features)

## 모델 학습

In [31]:
from sklearn.model_selection import train_test_split
import numpy as np

X = test_data_vecs
y = np.array(sentiments)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SPLIT, random_state=RANDOM_SEED)

In [32]:
from sklearn.linear_model import LogisticRegression

lgs = LogisticRegression(class_weight='balanced')
lgs.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(class_weight='balanced')

In [33]:
print("Accuracy: %f" % lgs.score(X_test, y_test)) 

Accuracy: 0.867600


## 제출

In [34]:
TEST_CLEAN_DATA = 'test_clean.csv'

test_data = pd.read_csv(DATA_IN_PATH + TEST_CLEAN_DATA)

test_review = list(test_data['review'])

In [35]:
test_data.head(5)

Unnamed: 0,review,id
0,naturally film main themes mortality nostalgia...,"""12311_10"""
1,movie disaster within disaster film full great...,"""8348_2"""
2,movie kids saw tonight child loved one point k...,"""5828_4"""
3,afraid dark left impression several different ...,"""7186_2"""
4,accurate depiction small time mob life filmed ...,"""12128_7"""


In [36]:
test_sentences = list()
for review in test_review:
    test_sentences.append(review.split())

In [37]:
test_data_vecs = get_dataset(test_sentences, model, num_features)

In [38]:
DATA_OUT_PATH = './data_out/'

test_predicted = lgs.predict(test_data_vecs)

if not os.path.exists(DATA_OUT_PATH):
    os.makedirs(DATA_OUT_PATH)
    
ids = list(test_data['id'])
answer_dataset = pd.DataFrame({'id': ids, 'sentiment': test_predicted})
answer_dataset.to_csv(DATA_OUT_PATH + 'lgs_w2v_answer.csv', index=False, quoting=3)