<a href="https://colab.research.google.com/github/AI-Tiger/ml-project/blob/main/09_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab line magic that selects the TensorFlow 2.x runtime for this session.
%tensorflow_version 2.x

In [2]:
!pip install janome beautifulsoup4

Collecting janome
[?25l  Downloading https://files.pythonhosted.org/packages/a8/63/98858cbead27df7536c7e300c169da0999e9704d02220dc6700b804eeff0/Janome-0.4.1-py2.py3-none-any.whl (19.7MB)
[K     |████████████████████████████████| 19.7MB 21.7MB/s 
Installing collected packages: janome
Successfully installed janome-0.4.1


In [3]:
!mkdir data
!mkdir models
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ja.300.vec.gz -P data/

--2021-05-10 08:45:51--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ja.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 172.67.9.4, 104.22.75.142, 104.22.74.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|172.67.9.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1279641604 (1.2G) [binary/octet-stream]
Saving to: ‘data/cc.ja.300.vec.gz’


2021-05-10 08:47:05 (16.5 MB/s) - ‘data/cc.ja.300.vec.gz’ saved [1279641604/1279641604]



In [4]:
import string

import gensim
import numpy as np
import pandas as pd
import tensorflow as tf
from bs4 import BeautifulSoup
from janome.tokenizer import Tokenizer
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Dense, Input, Embedding, SimpleRNN, LSTM, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
# Fixed sequence length: reviews are padded/truncated to this many token ids.
maxlen = 300
# Vocabulary size cap; also the number of rows of the embedding matrix.
num_words = 40000
# Number of output classes (binary sentiment).
num_label = 2

In [6]:
def filter_by_ascii_rate(text, threshold=0.9):
    """Tell whether a text is mostly non-ASCII (used here to keep Japanese text).

    Args:
        text: the text to inspect. Anything that is not a non-empty ``str``
            (e.g. ``''`` or a ``NaN`` coming from pandas) is rejected.
        threshold (float): maximum allowed share of ``string.printable``
            characters.

    Returns:
        bool: True when the share of printable-ASCII characters is at most
        ``threshold``; False otherwise, including for empty or non-string
        input.
    """
    # Guard: an empty string would divide by zero and a float NaN has no len().
    if not isinstance(text, str) or not text:
        return False
    ascii_chars = set(string.printable)
    rate = sum(c in ascii_chars for c in text) / len(text)
    return rate <= threshold

def load_dataset(filename, n=5000, state=6):
    """Load a review TSV and return a class-balanced binary sentiment sample.

    Args:
        filename: path or URL of a tab-separated file with ``star_rating``
            and ``review_body`` columns.
        n (int): maximum number of reviews kept per class.
        state (int): random seed used for shuffling.

    Returns:
        tuple: ``(review_body values, star_rating values)`` as numpy arrays,
        where ratings 1-2 are mapped to 0 and ratings 4-5 to 1.
    """
    df = pd.read_csv(filename, sep='\t')

    # Converts multi-class to binary-class (3-star reviews are discarded).
    mapping = {1: 0, 2: 0, 4: 1, 5: 1}
    # .copy() so the column assignment below acts on an independent frame
    # rather than on a slice of the original (avoids SettingWithCopyWarning).
    df = df[df.star_rating != 3].copy()
    df['star_rating'] = df.star_rating.map(mapping)

    # Extracts Japanese texts; rows without a review body are dropped first
    # because the ASCII-rate filter needs an actual string.
    df = df.dropna(subset=['review_body'])
    is_jp = df.review_body.apply(filter_by_ascii_rate)
    df = df[is_jp]

    # Sampling: shuffle reproducibly, then keep the first n rows per class.
    df = df.sample(frac=1, random_state=state)
    df = df.groupby('star_rating').head(n=n)
    return df.review_body.values, df.star_rating.values

In [7]:
# Japanese Amazon review dump (gzipped TSV), read directly from the URL.
url = 'https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_multilingual_JP_v1_00.tsv.gz'
# x: review bodies, y: binary labels (0 for 1-2 stars, 1 for 4-5 stars).
x, y = load_dataset(url)

In [8]:
def load_fasttext(filepath, binary=False):
    """Read word vectors stored in word2vec format.

    Args:
        filepath (str): location of the vector file.
        binary (bool): forwarded unchanged to gensim's
            ``load_word2vec_format``.

    Returns:
        KeyedVectors: the loaded word vectors.
    """
    loader = gensim.models.KeyedVectors.load_word2vec_format
    return loader(filepath, binary=binary)


# Relative path: the vectors were downloaded with `wget ... -P data/`, so this
# resolves against the working directory instead of assuming Colab's /content.
wv = load_fasttext('data/cc.ja.300.vec.gz')

In [9]:
# Shared Janome tokenizer used by tokenize(); created once at notebook level.
# wakati=True: tokens come back as plain strings, which preprocess_dataset joins.
t = Tokenizer(wakati=True)


def build_vocabulary(texts, num_words=None):
    """Fit a Keras text tokenizer on the given texts.

    Args:
        texts: iterable of strings to fit on.
        num_words: vocabulary size limit handed to the Keras tokenizer
            (None means no limit is passed).

    Returns:
        The fitted ``tf.keras.preprocessing.text.Tokenizer``; unknown words
        are represented by the ``'<UNK>'`` token.
    """
    vocab_tokenizer = tf.keras.preprocessing.text.Tokenizer(
        oov_token='<UNK>',
        num_words=num_words,
    )
    vocab_tokenizer.fit_on_texts(texts)
    return vocab_tokenizer


def clean_html(html, strip=False):
    """Return the text content of an HTML fragment with the markup removed.

    Args:
        html (str): markup to parse with BeautifulSoup's ``html.parser``.
        strip (bool): forwarded to ``get_text``.

    Returns:
        str: the extracted text.
    """
    return BeautifulSoup(html, 'html.parser').get_text(strip=strip)


def tokenize(text):
    """Tokenize ``text`` with the notebook-level Janome tokenizer ``t``.

    Returns whatever ``t.tokenize`` yields for ``text``.
    """
    return t.tokenize(text)


def preprocess_dataset(texts):
    """Strip HTML from each text and re-join its tokens with single spaces.

    Args:
        texts: iterable of raw review strings.

    Returns:
        list[str]: one space-separated token string per input text.
    """
    processed = []
    for raw in texts:
        plain = clean_html(raw)
        processed.append(' '.join(tokenize(plain)))
    return processed


def filter_embeddings(embeddings, vocab, num_words, dim=300):
    """Build an embedding matrix restricted to the vocabulary.

    Args:
        embeddings: mapping-like object supporting ``in`` and ``[]`` by word.
        vocab: word -> integer id lookup table.
        num_words: number of rows in the result; ids >= num_words are skipped.
        dim: vector dimension (number of columns).

    Returns:
        numpy array: shape ``(num_words, dim)``; row ``i`` holds the vector of
        the word with id ``i``, or zeros when that word has no vector.
    """
    matrix = np.zeros((num_words, dim))
    for token, token_id in vocab.items():
        # Only ids inside the matrix that also have a pretrained vector.
        if token_id < num_words and token in embeddings:
            matrix[token_id] = embeddings[token]
    return matrix

In [10]:
# Strip HTML and turn every review into a space-separated token string.
# NOTE(review): `x` is overwritten in place, so re-running this cell without
# reloading the dataset would preprocess already-preprocessed text.
x = preprocess_dataset(x)
# Hold out 20% of the reviews for the final evaluation.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# The vocabulary is fitted on the training split only.
vocab = build_vocabulary(x_train, num_words)
x_train = vocab.texts_to_sequences(x_train)
x_test = vocab.texts_to_sequences(x_test)
# Pad/truncate at the end so every sequence has exactly `maxlen` ids.
x_train = pad_sequences(x_train, maxlen=maxlen, truncating='post', padding='post')
x_test = pad_sequences(x_test, maxlen=maxlen, truncating='post', padding='post')

# NOTE(review): `wv` is rebound from the loaded vectors to the filtered
# (num_words, 300) matrix; re-running this cell alone would pass that matrix
# back in as `embeddings`.
wv = filter_embeddings(wv, vocab.word_index, num_words)

In [11]:
class RNNModel:
    """Embedding -> SimpleRNN -> softmax classifier, assembled via ``build``.

    Args:
        input_dim: vocabulary size (used when no pretrained matrix is given).
        output_dim: number of output classes.
        emb_dim: embedding width (used when no pretrained matrix is given).
        hid_dim: number of recurrent units.
        embeddings: optional 2-D array of pretrained vectors; its shape then
            determines the embedding layer's dimensions.
        trainable: whether the embedding weights are updated during training.
    """

    def __init__(self, input_dim, output_dim,
                 emb_dim=300, hid_dim=100,
                 embeddings=None, trainable=True):
        self.input = Input(shape=(None,), name='input')

        # Settings shared by both embedding variants.
        emb_kwargs = dict(mask_zero=True, trainable=trainable, name='embedding')
        if embeddings is not None:
            # Pretrained matrix: take the dimensions and initial weights from it.
            vocab_size, vector_size = embeddings.shape[0], embeddings.shape[1]
            emb_kwargs['weights'] = [embeddings]
        else:
            vocab_size, vector_size = input_dim, emb_dim
        self.embedding = Embedding(input_dim=vocab_size,
                                   output_dim=vector_size,
                                   **emb_kwargs)

        self.rnn = SimpleRNN(hid_dim, name='rnn')
        self.fc = Dense(output_dim, activation='softmax')

    def build(self):
        """Wire the layers together and return the Keras ``Model``."""
        inputs = self.input
        hidden = self.rnn(self.embedding(inputs))
        probs = self.fc(hidden)
        return Model(inputs=inputs, outputs=probs)

In [12]:
class LSTMModel:
    """Embedding -> LSTM -> softmax classifier, assembled via ``build``.

    Args:
        input_dim: vocabulary size (used when no pretrained matrix is given).
        output_dim: number of output classes.
        emb_dim: embedding width (used when no pretrained matrix is given).
        hid_dim: number of LSTM units.
        embeddings: optional 2-D array of pretrained vectors; its shape then
            determines the embedding layer's dimensions.
        trainable: whether the embedding weights are updated during training.
    """

    def __init__(self, input_dim, output_dim,
                 emb_dim=300, hid_dim=100,
                 embeddings=None, trainable=True):
        self.input = Input(shape=(None,), name='input')
        # Decide on the `embeddings` argument itself, not on some variable of
        # a similar name that may or may not exist at notebook level.
        if embeddings is None:
            self.embedding = Embedding(input_dim=input_dim,
                                       output_dim=emb_dim,
                                       mask_zero=True,
                                       trainable=trainable,
                                       name='embedding')
        else:
            # Pretrained matrix: take the dimensions and initial weights from it.
            self.embedding = Embedding(input_dim=embeddings.shape[0],
                                       output_dim=embeddings.shape[1],
                                       mask_zero=True,
                                       trainable=trainable,
                                       weights=[embeddings],
                                       name='embedding')
        self.lstm = LSTM(hid_dim, name='lstm')
        self.fc = Dense(output_dim, activation='softmax')

    def build(self):
        """Wire the layers together and return the Keras ``Model``."""
        x = self.input
        embedding = self.embedding(x)
        output = self.lstm(embedding)
        y = self.fc(output)
        return Model(inputs=x, outputs=y)

In [13]:
class CNNModel:
    """Embedding -> Conv1D -> global max pooling -> softmax classifier.

    Args:
        input_dim: vocabulary size (used when no pretrained matrix is given).
        output_dim: number of output classes.
        filters: number of convolution filters.
        kernel_size: width of the convolution window.
        emb_dim: embedding width (used when no pretrained matrix is given).
        embeddings: optional 2-D array of pretrained vectors; its shape then
            determines the embedding layer's dimensions.
        trainable: whether the embedding weights are updated during training.
    """

    def __init__(self, input_dim, output_dim,
                 filters=250, kernel_size=3,
                 emb_dim=300, embeddings=None, trainable=True):
        self.input = Input(shape=(None,), name='input')

        # Settings shared by both embedding variants (no masking here).
        emb_kwargs = dict(trainable=trainable, name='embedding')
        if embeddings is not None:
            # Pretrained matrix: take the dimensions and initial weights from it.
            vocab_size, vector_size = embeddings.shape[0], embeddings.shape[1]
            emb_kwargs['weights'] = [embeddings]
        else:
            vocab_size, vector_size = input_dim, emb_dim
        self.embedding = Embedding(input_dim=vocab_size,
                                   output_dim=vector_size,
                                   **emb_kwargs)

        self.conv = Conv1D(filters=filters,
                           kernel_size=kernel_size,
                           strides=1,
                           padding='valid',
                           activation='relu')
        self.pool = GlobalMaxPooling1D()
        self.fc = Dense(output_dim, activation='softmax')

    def build(self):
        """Wire the layers together and return the Keras ``Model``."""
        inputs = self.input
        features = self.conv(self.embedding(inputs))
        pooled = self.pool(features)
        probs = self.fc(pooled)
        return Model(inputs=inputs, outputs=probs)

In [14]:
# Model classes to train, in order. CNNModel is listed twice because the
# training cell pairs the last entry with the pretrained fastText matrix.
models = [
          RNNModel,
          LSTMModel,
          CNNModel,
          CNNModel
]

In [15]:
# Checkpoint location template; the slot is filled with the model's index.
model_path = 'models/model_{}'
# Initial embedding weights per model: only the second CNN gets pretrained vectors.
embeddings = [None, None, None, wv]
batch_size = 128
epochs = 100
# enumerate() supplies the checkpoint index instead of a hand-maintained counter.
for i, (model_cls, embedding) in enumerate(zip(models, embeddings)):
    # Drop graph state left over from the previously trained model.
    tf.keras.backend.clear_session()
    model = model_cls(num_words, num_label, embeddings=embedding).build()
    model.compile(
        optimizer='adam',
        # Labels are integer class ids, hence the sparse loss.
        loss='sparse_categorical_crossentropy',
        metrics=['acc']
    )

    callbacks = [
        # Stop once the monitored validation metric has not improved for 3 epochs.
        EarlyStopping(patience=3),
        # Keep only the best checkpoint seen so far for this model.
        ModelCheckpoint(model_path.format(i), save_best_only=True)
    ]

    model.fit(
        x=x_train, y=y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_split=0.2,
        callbacks=callbacks,
        shuffle=True
    )

Epoch 1/100
INFO:tensorflow:Assets written to: models/model_0/assets
Epoch 2/100
INFO:tensorflow:Assets written to: models/model_0/assets
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 1/100




INFO:tensorflow:Assets written to: models/model_1/assets


INFO:tensorflow:Assets written to: models/model_1/assets


Epoch 2/100




INFO:tensorflow:Assets written to: models/model_1/assets


INFO:tensorflow:Assets written to: models/model_1/assets


Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 1/100
INFO:tensorflow:Assets written to: models/model_2/assets


INFO:tensorflow:Assets written to: models/model_2/assets


Epoch 2/100
INFO:tensorflow:Assets written to: models/model_2/assets


INFO:tensorflow:Assets written to: models/model_2/assets


Epoch 3/100
INFO:tensorflow:Assets written to: models/model_2/assets


INFO:tensorflow:Assets written to: models/model_2/assets


Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 1/100
INFO:tensorflow:Assets written to: models/model_3/assets


INFO:tensorflow:Assets written to: models/model_3/assets


Epoch 2/100
INFO:tensorflow:Assets written to: models/model_3/assets


INFO:tensorflow:Assets written to: models/model_3/assets


Epoch 3/100
INFO:tensorflow:Assets written to: models/model_3/assets


INFO:tensorflow:Assets written to: models/model_3/assets


Epoch 4/100
INFO:tensorflow:Assets written to: models/model_3/assets


INFO:tensorflow:Assets written to: models/model_3/assets


Epoch 5/100
Epoch 6/100
Epoch 7/100


In [16]:
class InferenceAPI:
    """Predicts class labels from raw texts or from token-id sequences.

    Attributes:
        model: object exposing ``predict`` (e.g. a trained Keras model).
        vocab: object exposing ``texts_to_sequences``.
        preprocess: callable applied to the list of raw texts before they
            are converted to id sequences.
        maxlen: optional fixed length sequences are padded/truncated to;
            ``None`` pads to the longest sequence in the batch.
    """

    def __init__(self, model, vocab, preprocess, maxlen=None):
        self.model = model
        # Store the vocabulary that was passed in, not a notebook-level global.
        self.vocab = vocab
        self.preprocess = preprocess
        self.maxlen = maxlen

    def predict_from_texts(self, texts):
        """Preprocess and vectorize ``texts``, then predict their labels.

        Args:
            texts: list of raw text strings.

        Returns:
            The result of ``predict_from_sequences`` on the id sequences.
        """
        x = self.preprocess(texts)
        x = self.vocab.texts_to_sequences(x)
        return self.predict_from_sequences(x)

    def predict_from_sequences(self, sequences):
        """Pad ``sequences`` and return the arg-max class id per sequence.

        Args:
            sequences: list of token-id sequences (or an already padded array).

        Returns:
            numpy array: predicted class index for each sequence.
        """
        # Pad/truncate at the end, the same side used for the training data.
        sequences = pad_sequences(sequences, maxlen=self.maxlen,
                                  truncating='post', padding='post')
        y = self.model.predict(sequences)
        return np.argmax(y, -1)

In [17]:
# Display names, in the same order the checkpoints were written.
model_names = ['RNN', 'LSTM', 'CNN', 'CNN(wv)']
for i, model_name in enumerate(model_names):
    tf.keras.backend.clear_session()
    # Reload the best checkpoint saved for the i-th model.
    model = load_model(model_path.format(i))
    api = InferenceAPI(model, vocab, preprocess_dataset)
    y_pred = api.predict_from_sequences(x_test)

    # Score against the held-out labels, then report.
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')

    print(model_name)
    print('precision\t: {:.4f}'.format(precision))
    print('recall\t: {:.4f}'.format(recall))
    print('f1\t: {:.4f}'.format(f1))
    print()

RNN
precision	: 0.7312
recall	: 0.7146
f1	: 0.7228

LSTM
precision	: 0.8709
recall	: 0.7889
f1	: 0.8279

CNN
precision	: 0.8919
recall	: 0.8097
f1	: 0.8488

CNN(wv)
precision	: 0.8686
recall	: 0.8385
f1	: 0.8533

