In [1]:
from pathlib import Path
import re

import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# 前処理: Train data
datapath = Path('data/Amazon_review/Training_data')
review_pattern = re.compile(r'cv\d+')  # ファイル名が "cv数字"　で始まるファイル名かを調べる正規表現
test_size = 0.25

data_orig  = dict(neg=[], pos=[])
data_train = dict(neg=[], pos=[])
data_verify  = dict(neg=[], pos=[])

np.random.seed(539167)

for cls, reviews in data_orig.items():
    for path in (datapath / cls).iterdir():
        if review_pattern.match(path.name):
            with open(path, 'r', encoding='latin') as src:
                reviews.append(src.read())
    print(f"{cls:>8}: loaded {len(reviews)} reviews.")
    
    data_train[cls], data_verify[cls] = train_test_split(reviews, test_size=test_size, random_state=539167)

     neg: loaded 700 reviews.
     pos: loaded 700 reviews.


In [3]:
# 前処理: Test data

datapath_test = Path('data/Amazon_review/Test_data')
review_test_pattern = re.compile(r'amazon_review_\d+')
data_test  = dict(neg=[], pos=[])

for cls, reviews in data_test.items():
    for path in (datapath_test / cls).iterdir():
        if review_test_pattern.match(path.name):
            with open(path, 'r', encoding='latin') as src:
                reviews.append(src.read())
    print(f"{cls:>8}: loaded {len(reviews)} reviews(test_data).")

     neg: loaded 3 reviews(test_data).
     pos: loaded 3 reviews(test_data).


In [4]:
# 前処理: 特徴量作成

def get_values_and_targets(data):
    values = data['pos'] + data['neg']
    target = [True]*len(data['pos']) + [False]*len(data['neg']) 
    target = np.array(target)
    return values, target
values_train, is_pos_train = get_values_and_targets(data_train)

vocab = CountVectorizer(token_pattern=r'[a-zA-Z]{3,}')
vocab.fit([data_orig['pos'][0]])
features = vocab.fit_transform(values_train)

print(features.shape)

(1050, 30989)


In [5]:
# Fitting

model = LogisticRegression(solver='saga', max_iter=3000, random_state=539167)
model.fit(features, is_pos_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=3000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=539167, solver='saga',
          tol=0.0001, verbose=0, warm_start=False)

In [6]:
# Train dataへの検証

values_verify, is_pos_verify = get_values_and_targets(data_verify)
pred_verify = model.predict(vocab.transform(values_verify))

validation = (pred_verify == is_pos_verify)
size    = validation.size
correct = np.count_nonzero(validation)
print(f"{correct}/{size} correct ({correct*100/size:.3f}%)")

284/350 correct (81.143%)


In [12]:
# Test data検証

values_test, is_pos_test = get_values_and_targets(data_test)
pred_test = model.predict(vocab.transform(values_test))

for cls in ('pos', 'neg'):
    _values  = data_test[cls]
    _is_pos = [(cls == 'pos')]*len(_values)
    _pred    = model.predict(vocab.transform(_values))
    _valid   = (_pred == _is_pos)
    _size    = _valid.size
    _correct = np.count_nonzero(_valid)
    print(f"{cls}: {_correct:>3d}/{_size:>3d} correct ({_correct*100/_size:.3f}%)")

print()
validation = (pred_test == is_pos_test)
size    = validation.size
correct = np.count_nonzero(validation)
print(f"Total: {correct}/{size} correct ({correct*100/size:.3f}%)")

pos:   2/  3 correct (66.667%)
neg:   1/  3 correct (33.333%)

Total: 3/6 correct (50.000%)
