## データの準備
[Amazon Customer Reviews Dataset](https://s3.amazonaws.com/amazon-reviews-pds/readme.html)で提供されている日本語レビューを利用する

In [None]:
#データのダウンロード：既にダウンロード済みの場合は、改めてダウンロードする必要はない
!wget https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_multilingual_JP_v1_00.tsv.gz .

In [3]:
!pip install beautifulsoup4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [100]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

from sklearn.model_selection import train_test_split
def load_dataset(filename, n = 1000, test_size = 0.2, random_state = 42) :
    """Load the review TSV and return a train/test split of plain-text reviews.

    Args:
        filename: Path to the tab-separated review file (anything ``pd.read_csv`` accepts).
        n: Maximum number of reviews kept for each ``star_rating`` value.
        test_size: Size of the test split, forwarded to ``train_test_split``.
        random_state: Seed used for both the shuffle and the train/test split.

    Returns:
        ``(X_train, X_test, y_train, y_test)``. The X arrays hold the review bodies
        with HTML markup removed; the y arrays hold the matching star ratings.
    """
    df = pd.read_csv(filename, sep='\t')
    df = df.sample(frac=1, random_state=random_state)  # shuffle
    # Drop rows without text or rating after the shuffle, so the permutation is
    # unchanged; a NaN review body would make BeautifulSoup raise.
    df = df.dropna(subset=['review_body', 'star_rating'])
    # Keep the first n shuffled reviews of every rating.
    df = df.groupby('star_rating').head(n)
    X_train, X_test, y_train, y_test = train_test_split(
        df['review_body'].values, df['star_rating'].values,
        test_size=test_size, random_state=random_state)

    def strip_html(texts):
        # str() guards against non-string cells in a mixed-type column.
        return np.array([BeautifulSoup(str(text), 'html.parser').get_text() for text in texts])

    return strip_html(X_train), strip_html(X_test), y_train, y_test

# Set the following parameters as appropriate.
path_to_file = "amazon_reviews_multilingual_JP_v1_00.tsv.gz"
n = 1000           # number of examples per class
test_size = 0.2    # size of the test split
random_state = 42  # random seed

X_train, X_test, y_train, y_test = load_dataset(
    path_to_file, n=n, test_size=test_size, random_state=random_state)

# Report the number of examples and labels in each split.
split_sizes = ( len(X_train), len(y_train), len(X_test), len(y_test) )
print( "サイズ：訓練例Xy = ({},{})，テスト例Xy = ({},{})".format( *split_sizes ) )

サイズ：訓練例Xy = (4000,4000)，テスト例Xy = (1000,1000)


## TF-IDFによるベクトル化

In [17]:

!pip install mecab-python3==0.996.2 #形態素解析器 Mecabのインストール

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [101]:
import MeCab
from sklearn.feature_extraction.text import TfidfVectorizer

class Preprocessor:
  """MeCab-based preprocessor that keeps only selected parts of speech."""

  def __init__(self):
    # Values of the first feature field to keep: noun, verb, adjective.
    self.pos_filter = [ '名詞', '動詞', '形容詞' ]
    self.tagger = MeCab.Tagger()
    # Parse an empty string once before any parseToNode call.
    # NOTE(review): presumably the usual workaround for garbled node surfaces - confirm.
    self.tagger.parse("")

  def extract_words(self, text):
    """Return the surfaces of the kept words in ``text``, joined by single spaces."""
    kept = []
    current = self.tagger.parseToNode(text)
    while current:
      fields = current.feature.split(',')
      if fields[0] in self.pos_filter:
        kept.append(current.surface)
      current = current.next
    return ' '.join(kept)

def get_tfidf_vectors(train, test, max_features=100, max_df = 0.3) :
  """Fit TF-IDF on ``train`` and return dense TF-IDF matrices for both splits.

  Args:
    train: Iterable of training texts; the vocabulary and IDF weights are fitted on it.
    test: Iterable of test texts, transformed with the fitted vocabulary.
    max_features: Upper bound on the vocabulary size.
    max_df: Terms whose document frequency exceeds this value are ignored.

  Returns:
    ``(train_tfidf, test_tfidf)`` as dense NumPy arrays.
  """
  preprocessor = Preprocessor()

  def tokenize(text):
    # extract_words returns one space-joined string, but the vectorizer's
    # tokenizer must return a sequence of tokens. A bare string would be
    # iterated character by character, giving single-character features.
    return preprocessor.extract_words(text).split()

  # token_pattern=None: the pattern is unused once a tokenizer is supplied.
  tfidf = TfidfVectorizer(tokenizer=tokenize, token_pattern=None, smooth_idf=False,
                          max_features = max_features, max_df = max_df)
  train_tfidf = tfidf.fit_transform(train).toarray()
  test_tfidf = tfidf.transform(test).toarray()
  return train_tfidf, test_tfidf

# Build the TF-IDF vectors.
max_features = 100  # vocabulary size limit
max_df = 0.3        # document-frequency threshold passed to the vectorizer
train_tfidf, test_tfidf = get_tfidf_vectors(
    X_train, X_test, max_features=max_features, max_df=max_df)

# Report the shape of each matrix.
tfidf_shapes = ( train_tfidf.shape, test_tfidf.shape )
print( "サイズ：訓練例X = {}，テスト例X = {}".format( *tfidf_shapes ) )

サイズ：訓練例X = (4000, 100)，テスト例X = (1000, 100)


## LDAによるベクトル化

In [None]:
!pip install mecab-python3==0.996.2 

In [102]:
import MeCab
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# TfIdfにおける前処理器と同一
class Preprocessor:
  """MeCab-based preprocessor that keeps only selected parts of speech."""

  def __init__(self):
    # Values of the first feature field to keep: noun, verb, adjective.
    self.pos_filter = [ '名詞', '動詞', '形容詞' ]
    self.tagger = MeCab.Tagger()
    # Parse an empty string once before any parseToNode call.
    # NOTE(review): presumably the usual workaround for garbled node surfaces - confirm.
    self.tagger.parse("")

  def extract_words(self, text):
    """Return the surfaces of the kept words in ``text``, joined by single spaces."""
    kept = []
    current = self.tagger.parseToNode(text)
    while current:
      fields = current.feature.split(',')
      if fields[0] in self.pos_filter:
        kept.append(current.surface)
      current = current.next
    return ' '.join(kept)

def get_lda_vectors(train, test, n_topics = 10, max_features=100, max_df = 0.3, random_state = 42) :
  """Fit bag-of-words + LDA on ``train`` and return topic vectors for both splits.

  Args:
    train: Iterable of training texts; the vocabulary and LDA model are fitted on it.
    test: Iterable of test texts, transformed with the fitted models.
    n_topics: Number of LDA topics, i.e. the width of the returned vectors.
    max_features: Upper bound on the bag-of-words vocabulary size.
    max_df: Terms whose document frequency exceeds this value are ignored.
    random_state: Seed for LDA so that repeated runs give the same vectors.

  Returns:
    ``(train_lda, test_lda)`` arrays with one row per document.
  """
  preprocessor = Preprocessor()

  def tokenize(text):
    # extract_words returns one space-joined string, but the vectorizer's
    # tokenizer must return a sequence of tokens. A bare string would be
    # iterated character by character, giving single-character features.
    return preprocessor.extract_words(text).split()

  # token_pattern=None: the pattern is unused once a tokenizer is supplied.
  counter = CountVectorizer(tokenizer=tokenize, token_pattern=None,
                            max_features = max_features, max_df = max_df)
  train_bow = counter.fit_transform(train)
  test_bow = counter.transform(test)

  lda_model = LatentDirichletAllocation( n_components = n_topics, random_state = random_state)
  train_lda = lda_model.fit_transform( train_bow )  # fit LDA on the training counts
  test_lda = lda_model.transform(test_bow)

  return train_lda, test_lda

# Build the LDA topic vectors.
n_topics = 10       # number of topics
max_features = 100  # vocabulary size limit
max_df = 0.3        # document-frequency threshold passed to the vectorizer

train_lda, test_lda = get_lda_vectors(
    X_train, X_test, n_topics=n_topics, max_features=max_features, max_df=max_df)

# Report the shape of each matrix.
lda_shapes = ( train_lda.shape, test_lda.shape )
print( "サイズ：訓練例X = {}，テスト例X = {}".format( *lda_shapes ) )

サイズ：訓練例X = (4000, 10)，テスト例X = (1000, 10)


## word2vecを利用したベクトル化(SWEM)

In [None]:
#データのダウンロード：既にダウンロード済みの場合は、改めてダウンロードする必要はない
!wget https://github.com/singletongue/WikiEntVec/releases/download/20190520/jawiki.entity_vectors.100d.txt.bz2 .

In [None]:
#必要なライブラリとswemのインストール：インストール済みであれば実行する必要はない
!pip install mecab-python3==0.996.2 #形態素解析器 Mecabのインストール
!git clone https://github.com/yagays/swem.git

In [56]:
import numpy as np
import os
import sys
sys.path.append("swem")

from gensim.models import KeyedVectors
from swem import MeCabTokenizer
from swem import SWEM

# Path to the word2vec file in text format; change it to match your environment.
w2v_path = "jawiki.entity_vectors.100d.txt.bz2"

# Loading the vectors takes a while.
w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=False)
# Bound to its own name: the BERT section assigns the global `tokenizer`, which
# its helper functions read, so re-running this cell must not overwrite it.
swem_tokenizer = MeCabTokenizer("-O wakati")

swem = SWEM(w2v, swem_tokenizer)

def get_swem_avg_vectors(train, test):
  """Apply ``swem.average_pooling`` to every text and return one array per split."""
  pooled_train = [swem.average_pooling(doc) for doc in train]
  pooled_test = [swem.average_pooling(doc) for doc in test]
  return np.array(pooled_train), np.array(pooled_test)

def get_swem_max_vectors(train, test):
  """Apply ``swem.max_pooling`` to every text and return one array per split."""
  pooled_train = [swem.max_pooling(doc) for doc in train]
  pooled_test = [swem.max_pooling(doc) for doc in test]
  return np.array(pooled_train), np.array(pooled_test)

def get_swem_concat_vectors(train, test):
  """Apply ``swem.concat_average_max_pooling`` to every text and return one array per split."""
  pooled_train = [swem.concat_average_max_pooling(doc) for doc in train]
  pooled_test = [swem.concat_average_max_pooling(doc) for doc in test]
  return np.array(pooled_train), np.array(pooled_test)

def get_wwem_hier_vectors(train, test, n = 3):
  """Apply ``swem.hierarchical_pooling`` to every text and return one array per split.

  Args:
    train: Iterable of training texts.
    test: Iterable of test texts.
    n: Value forwarded as the ``n`` argument of ``swem.hierarchical_pooling``.

  Returns:
    ``(train_swemHier, test_swemHier)`` NumPy arrays with one row per text.
  """
  # Forward the caller's n; it used to be ignored in favour of a hard-coded 3.
  train_swemHier = np.array( [swem.hierarchical_pooling(text, n=n) for text in train] )
  test_swemHier = np.array( [swem.hierarchical_pooling(text, n=n) for text in test] )
  return train_swemHier, test_swemHier



In [103]:
# Build the SWEM vectors with each pooling strategy and report their shapes.
train_swemAvg, test_swemAvg = get_swem_avg_vectors(X_train, X_test)
print( "サイズ：訓練例X = {}，テスト例X = {}".format( train_swemAvg.shape, test_swemAvg.shape) )

train_swemMax, test_swemMax = get_swem_max_vectors(X_train, X_test)
print( "サイズ：訓練例X = {}，テスト例X = {}".format( train_swemMax.shape, test_swemMax.shape) )

train_swemConcat, test_swemConcat = get_swem_concat_vectors(X_train, X_test)
print( "サイズ：訓練例X = {}，テスト例X = {}".format( train_swemConcat.shape, test_swemConcat.shape) )

# Hierarchical pooling parameter. It has its own name so that the per-class
# sample count `n` set in the data-preparation cell is not overwritten.
swem_hier_n = 3
train_swemHier, test_swemHier = get_wwem_hier_vectors(X_train, X_test, swem_hier_n )
print( "サイズ：訓練例X = {}，テスト例X = {}".format( train_swemHier.shape, test_swemHier.shape) )


サイズ：訓練例X = (4000, 100)，テスト例X = (1000, 100)
サイズ：訓練例X = (4000, 100)，テスト例X = (1000, 100)
サイズ：訓練例X = (4000, 200)，テスト例X = (1000, 200)
サイズ：訓練例X = (4000, 100)，テスト例X = (1000, 100)


## BERTによるベクトル化

In [5]:
!pip install transformers fugashi ipadic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [58]:
from transformers import BertJapaneseTokenizer, BertModel
import pandas as pd
import numpy as np
import torch

# The tokenizer and model are downloaded on the first run.
# Both are loaded from the same pretrained checkpoint.
tokenizer = BertJapaneseTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")
model = BertModel.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [104]:
# 文書ベクトル獲得メソッド
def get_bert_vectors_batch( model, sentences, max_length = 300) :
  """Encode a batch of sentences and return the first-token vector of each.

  Args:
    model: BERT model; its first output is indexed as (batch, token, hidden).
    sentences: Iterable of texts to encode together as one padded batch.
    max_length: Texts are truncated to at most this many tokens.

  Returns:
    NumPy array with one row per sentence, taken from position 0 of the
    model's first output.
  """
  encoded_data = tokenizer([str(sentence) for sentence in sentences], max_length=max_length,
                           padding=True, truncation=True, return_tensors="pt")
  # Run on whatever device the model lives on.
  device = next(model.parameters()).device
  input_ids = encoded_data["input_ids"].to(device)
  # Without the mask the padding tokens are attended to, so a sentence's vector
  # would depend on how much padding its batch happens to need.
  attention_mask = encoded_data["attention_mask"].to(device)

  with torch.no_grad():  # inference only, no gradients needed
    outputs = model( input_ids, attention_mask=attention_mask )
  sentence_vecs = outputs[0][:,0,:]  # position 0 of every sequence
  sentence_vecs = sentence_vecs.to('cpu').detach().numpy().copy()  # convert to a NumPy array
  return sentence_vecs

def get_sentence_vector( model, text, max_length = 300) :
  """Encode a single text and return the position-0 vector of the model's first output.

  The text is truncated to ``max_length`` tokens; the result is a NumPy array.
  """
  token_ids = tokenizer.encode(text, max_length=max_length, padding=True, return_tensors="pt", truncation=True)
  with torch.no_grad():  # inference only, no gradients needed
    hidden_states = model( token_ids )[0]
  # First (and only) sequence of the batch, token position 0.
  first_token_vec = hidden_states[:,0,:][0]
  return first_token_vec.to('cpu').detach().numpy().copy()

def get_bert_vectors(model, train, test, max_length = 300):
  """Return BERT vectors for both splits, encoding the texts one at a time.

  Each text goes through ``get_sentence_vector``; the results are stacked
  into one NumPy array per split.
  """
  def encode_all(texts):
    return np.array([get_sentence_vector(model, text, max_length) for text in texts])

  train_bert = encode_all(train)
  test_bert = encode_all(test)
  return train_bert, test_bert

# Build the BERT vectors. This is slow: about 50 minutes for n=1000 (5000 texts in total).
train_bert, test_bert = get_bert_vectors(model, X_train, X_test)

# Report the shape of each matrix.
bert_shapes = ( train_bert.shape, test_bert.shape )
print( "サイズ：訓練例X = {}，テスト例X = {}".format( *bert_shapes ) )


サイズ：訓練例X = (4000, 768)，テスト例X = (1000, 768)


## 分類による評価
例としてknn（K-近傍法）を用いて評価を行う

- k：近傍数
- explained_variance_ratio：PCAを適用する際の累積寄与率

In [98]:
#KNN実行 & 評価の関数
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

def run_knn(X_train, X_test, y_train, y_test, k = 5, explained_variance_ratio=1) :
  """Train a k-nearest-neighbours classifier and print its evaluation on the test split.

  Args:
    X_train, X_test: Feature matrices of the two splits.
    y_train, y_test: Labels of the two splits.
    k: Number of neighbours.
    explained_variance_ratio: When below 1, PCA is fitted on ``X_train`` with this
      value as ``n_components`` and applied to both splits; otherwise PCA is skipped.

  Prints the classification report, per-class precision/recall/F1 and the accuracy.
  """
  if explained_variance_ratio < 1:
    reducer = PCA(n_components = explained_variance_ratio)
    X_train, X_test = reducer.fit_transform(X_train), reducer.transform(X_test)

  classifier = KNeighborsClassifier(n_neighbors = k).fit(X_train, y_train)
  y_pred = classifier.predict(X_test)

  print(classification_report(y_test, y_pred))
  # average=None yields one score per class.
  per_class_metrics = (
      ('Precision:', precision_score),
      ('Recall:', recall_score),
      ('F1_score:', f1_score),
  )
  for label, metric in per_class_metrics:
    print(label, metric(y_test, y_pred, average=None) )
  print('Accuracy:', accuracy_score(y_test, y_pred) )


In [110]:
# Choose the representation to evaluate by changing these two assignments.
train = train_bert
test = test_bert
k = 20                          # number of neighbours
explained_variance_ratio = 0.8  # set to 1 to skip PCA

run_knn(train, test, y_train, y_test, k=k, explained_variance_ratio=explained_variance_ratio)

              precision    recall  f1-score   support

           1       0.43      0.57      0.49       203
           2       0.28      0.36      0.31       186
           3       0.26      0.21      0.23       204
           4       0.34      0.31      0.32       199
           5       0.52      0.35      0.42       208

    accuracy                           0.36      1000
   macro avg       0.36      0.36      0.36      1000
weighted avg       0.37      0.36      0.36      1000

Precision: [0.42592593 0.2780083  0.26060606 0.33695652 0.52142857]
Recall: [0.56650246 0.36021505 0.21078431 0.31155779 0.35096154]
F1_score: [0.48625793 0.31381733 0.23306233 0.32375979 0.41954023]
Accuracy: 0.36
