# Hate Filter
I hope all SNS companies (Facebook, Twitter, LINE, etc.) implement hate filters in every language. I don't have enough training data, but I wanted to prove that this approach works in Japanese.

## Japanese word extractor using MeCab (Japanese morphological analysis)

In [106]:
import MeCab
from sklearn.feature_extraction.text import CountVectorizer

class WordDividor:
    """Extract content words from Japanese text with a MeCab tagger.

    Each token produced by MeCab carries a comma-separated feature string.
    Tokens whose first feature (the part-of-speech category) is listed in
    ``TARGET_CATEGORIES`` are kept; all other tokens are dropped.
    """

    # Positions inside the comma-separated MeCab feature string.
    # NOTE(review): index 6 as the root form matches the IPA dictionary
    # layout; confirm before using a dictionary with a different layout.
    INDEX_CATEGORY = 0
    INDEX_ROOT_FORM = 6
    # Noun, verb, adjective, adverb, adnominal, interjection.
    # Entries are compared exactly, so they must not contain stray spaces.
    TARGET_CATEGORIES = ["名詞", "動詞", "形容詞", "副詞", "連体詞", "感動詞"]

    def __init__(self, dictionary="mecabrc"):
        """Create the tagger.

        Parameters
        ----------
        dictionary : str
            Argument string passed unchanged to ``MeCab.Tagger``.
        """
        self.dictionary = dictionary
        self.tagger = MeCab.Tagger(self.dictionary)

    def extract_words(self, text):
        """Return the list of content words found in ``text``.

        Parameters
        ----------
        text : str
            Sentence to analyse. An empty or ``None`` value yields ``[]``.

        Returns
        -------
        list of str
            One entry per kept token, in order of appearance. The root form
            is used when the tagger provides one, otherwise the surface form.
        """
        if not text:
            return []

        words = []

        node = self.tagger.parseToNode(text)
        while node:
            features = node.feature.split(',')

            if features[self.INDEX_CATEGORY] in self.TARGET_CATEGORIES:
                # Some tokens carry fewer feature fields than
                # INDEX_ROOT_FORM; treat a missing root form like "*".
                if len(features) > self.INDEX_ROOT_FORM:
                    root_form = features[self.INDEX_ROOT_FORM]
                else:
                    root_form = "*"

                # Prefer the root form; "*" means the tagger has none.
                words.append(node.surface if root_form == "*" else root_form)

            node = node.next

        return words

## Loading sentences from files
Unlike typical sentiment-analysis datasets, where each file is one sample, here every line of a file is a separate one-sentence training sample.

In [107]:
import os
import csv
import sys
import shutil
from collections import namedtuple
from os import environ, listdir, makedirs
from os.path import dirname, exists, expanduser, isdir, join, splitext
import hashlib

from sklearn.datasets.base import Bunch

import numpy as np

def load_sentence_files(container_path, description=None, categories=None,
               encoding=None,
               decode_error='strict', random_state=0):
    """Load sentences, one sample per line, from files grouped by category.

    The files are expected in a two-level folder structure such as:

        container_folder/
            category_1_folder/
                file_1.txt
                file_2.txt
                ...
            category_2_folder/
                file_43.txt
                ...

    The folder names are used as label names; the individual file names are
    not important. Every non-blank line of every file becomes one sample
    labelled with the index of its folder. Trailing whitespace (including
    the line terminator) is removed from each sample, and lines that are
    empty after that are skipped.

    Parameters
    ----------
    container_path : string
        Path to the main folder holding one subfolder per category.
    description : string, optional (default=None)
        A paragraph describing the dataset; stored as ``DESCR``.
    categories : collection of strings or None, optional (default=None)
        If None, load all the categories. Otherwise only the subfolders
        whose names are in this collection are loaded.
    encoding : string or None (default is None)
        If None, samples are returned as bytes. Otherwise the encoding used
        to decode each sample to text.
    decode_error : {'strict', 'ignore', 'replace'}, optional
        Passed as the ``errors`` argument to ``bytes.decode`` when
        ``encoding`` is given.
    random_state : int, optional (default=0)
        Currently unused: samples are returned in sorted folder/file order
        and are never shuffled. Kept so existing callers keep working.

    Returns
    -------
    data : Bunch
        Dictionary-like object with the attributes ``data`` (the samples),
        ``filenames`` (the source file of each sample), ``target`` (the
        integer label of each sample), ``target_names`` (folder name for
        each label) and ``DESCR`` (the given description).
    """

    folders = [f for f in sorted(listdir(container_path))
               if isdir(join(container_path, f))]

    if categories is not None:
        folders = [f for f in folders if f in categories]

    target = []
    target_names = []
    data = []
    files = []
    for label, folder in enumerate(folders):
        target_names.append(folder)
        folder_path = join(container_path, folder)
        documents = [join(folder_path, d)
                     for d in sorted(listdir(folder_path))]

        for filename in documents:
            # Read as bytes; decoding happens once below if requested.
            with open(filename, 'rb') as f:
                for line in f:
                    # rstrip() returns a new object, so keep its result;
                    # otherwise every sample would retain its newline.
                    sentence = line.rstrip()
                    if not sentence:
                        # A blank line carries no words to learn from.
                        continue
                    data.append(sentence)
                    files.append(filename)
                    target.append(label)

    if encoding is not None:
        data = [d.decode(encoding, decode_error) for d in data]

    return Bunch(data=data,
                 filenames=files,
                 target_names=target_names,
                 target=target,
                 DESCR=description)

### Sentiment Analysis of Sentence in Japanese

In [108]:
import MeCab
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import sklearn
from sklearn.datasets import load_files
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

# Load every sentence under `data/` as a labelled sample; the label of a
# sample is the index of the category subfolder it came from.
datadir = r'data'
data_train = load_sentence_files(datadir, encoding='utf-8')

# Create a CountVectorizer that tokenizes with MeCab instead of the
# built-in analyzer.
wd = WordDividor('ipadic')
data_vec = CountVectorizer(min_df=1, analyzer=wd.extract_words)

# Split into train and test sets (fixed seed so the split is repeatable).
docs_train, docs_test, y_train, y_test = train_test_split(
    data_train.data, data_train.target, random_state = 12)

# Pipeline: word counts -> TF-IDF weighting -> multinomial Naive Bayes.
text_clf = Pipeline([('vect', data_vec),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())
                     ])
# Train the multinomial Naive Bayes classifier on the training split only.
text_clf = text_clf.fit(docs_train, y_train)
# NOTE(review): the held-out split (docs_test / y_test) is never scored.
# To measure accuracy on it, enable the two lines below.
#y_pred = text_clf.predict(docs_test)
#sklearn.metrics.accuracy_score(y_test, y_pred)
# Example sentences, roughly: "Japanese people, go back to your homeland."
# (three phrasings) and "I am indifferent."
pred_data = ['日本人は祖国に帰れ。', '日本人は祖国へ帰れ。','日本人は日本に帰れ。', '私は、無関心です。']
# Alternative: probability of label 0 for each sentence.
#pred = text_clf.predict_proba(pred_data)[:, 0]
# Predicted labels are indices into data_train.target_names.
pred = text_clf.predict(pred_data)
print(pred)

[0 0 0 1]
