日本語版Wikipediaのダンプからコーパスを作成する。

[gensim公式の英語版Wikipediaコーパス作成スクリプト](https://github.com/piskvorky/gensim/blob/develop/gensim/scripts/make_wikicorpus.py)


In [1]:
!curl -O https://dumps.wikimedia.org/jawiki/latest/jawiki-latest-pages-articles.xml.bz2

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0 4024M    0  207k    0     0   125k      0  9:08:11  0:00:01  9:08:10  125k^C


In [35]:
# coding: utf-8

"""USAGE: %(program)s WIKI_XML_DUMP OUTPUT_PREFIX
"""

import logging
import os.path
import sys

import gensim.corpora.wikicorpus as wikicorpus
from gensim.corpora import Dictionary, MmCorpus, WikiCorpus
from gensim.models import TfidfModel
from gensim.utils import to_unicode
from gensim.corpora.wikicorpus import tokenize

import MeCab


# The dump is first scanned for all distinct word types (the "~7M" figure in
# the upstream script refers to the English Wikipedia). Types that occur in
# more than 10% of the articles are dropped, and of the remainder only the
# DEFAULT_DICT_SIZE most frequent ones are kept.
DEFAULT_DICT_SIZE = 100000

# MeCab options: resource file (-r) and dictionary directory (-d). Both point
# into a Homebrew prefix, so they are specific to this machine.
MECAB_OPTIONS = '-r /opt/homebrew/etc/mecabrc -d /opt/homebrew/lib/mecab/dic/ipadic'

# Module-level tagger shared by tokenize_ja().
tagger = MeCab.Tagger(MECAB_OPTIONS)
# NOTE(review): presumably the customary warm-up parse used with the MeCab
# Python binding before calling parseToNode() -- confirm it is still needed.
tagger.parse('')


def tokenize_ja(text):
    """Yield the lower-cased surface form of every noun found in *text*.

    *text* is first passed through ``to_unicode`` as UTF-8 with
    ``errors='ignore'``, then handed to the module-level MeCab ``tagger``.
    The resulting node chain is walked from the first node until it ends.
    """
    decoded = to_unicode(text, encoding='utf8', errors='ignore')
    morpheme = tagger.parseToNode(decoded)
    while morpheme:
        # The first comma-separated field of the feature string is compared
        # against '名詞' (noun); every other node is skipped.
        part_of_speech = morpheme.feature.split(',', 1)[0]
        if part_of_speech == '名詞':
            yield morpheme.surface.lower()
        morpheme = morpheme.next


def tokenize(content, token_min_len=2, token_max_len=15, lower=True):
    """Tokenize *content* into a list of Japanese noun tokens.

    The signature follows the interface gensim's ``WikiCorpus`` requires of a
    custom ``tokenizer_func``: it is called positionally as
    ``tokenizer_func(text, token_min_len, token_max_len, lower)``. The
    previous one-argument signature could not accept that call. The defaults
    reproduce the limits that used to be hard-coded (2 and 15), so calling
    ``tokenize(content)`` still behaves as before.

    Args:
        content: Article text to tokenize.
        token_min_len: Minimum token length (inclusive) to keep.
        token_max_len: Maximum token length (inclusive) to keep.
        lower: Accepted for interface compatibility only. ``tokenize_ja``
            always lower-cases its output, so this flag has no effect here.

    Returns:
        list of str: Nouns produced by ``tokenize_ja`` whose length lies in
        ``[token_min_len, token_max_len]`` and which do not start with ``_``.
    """
    return [
        to_unicode(token) for token in tokenize_ja(content)
        if token_min_len <= len(token) <= token_max_len
        and not token.startswith('_')
    ]

wikicorpus.tokenize = tokenize


In [36]:
# NOTE: runs of this step have repeatedly sat idle for about two days, so a
# hand-written replacement for this processing may turn out to be necessary.
# NOTE(review): the captured output shows SpawnPoolWorker processes dying with
# tracebacks; possibly the spawned workers cannot use a tokenizer defined in
# the notebook itself -- confirm before rewriting the pipeline.
#
# Adapted from
# https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/scripts/make_wikicorpus.py

# Input dump and the prefix shared by every output file.
# NOTE(review): the download cell uses `curl -O`, which saves into the current
# directory, while `src` points at ../data/ -- verify the file location.
src = '../data/jawiki-latest-pages-articles.xml.bz2'
dst = 'test'

# The logger is named after the launching script (argv[0]).
program = os.path.basename(sys.argv[0])
logger = logging.getLogger(program)

# Configure the root logger's format, then force INFO level on it.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.getLogger().setLevel(logging.INFO)
logger.info("running %s", ' '.join(sys.argv))

# Build the corpus reader over the dump, using the MeCab-based tokenizer.
wiki = WikiCorpus(src, tokenizer_func=tokenize)


2024-09-17 10:32:30,768 : INFO : running /Users/mfujimak/Library/Python/3.9/lib/python/site-packages/ipykernel_launcher.py --f=/Users/mfujimak/Library/Jupyter/runtime/kernel-v37fda39f4835194ba42b0208c3774e1e3b7b57962.json
Process SpawnPoolWorker-108:
Process SpawnPoolWorker-109:
Process SpawnPoolWorker-105:
Process SpawnPoolWorker-104:
Process SpawnPoolWorker-107:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Applications/Xcode.app/Contents/Developer/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Applications/Xcode.app/Contents/Developer/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Applications/Xcode.app/Contents/Developer/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/pool.py", line 114, in worker
    task = g

In [None]:
# Output artefacts, all derived from the common `dst` prefix.
bow_path = dst + '_bow.mm'
wordids_path = dst + '_wordids.txt.bz2'
tfidf_model_path = dst + '.tfidf_model'
tfidf_vectors_path = dst + '_tfidf.mm'

# Prune the vocabulary: drop words seen in fewer than 20 documents or in more
# than 10% of them, then keep at most DEFAULT_DICT_SIZE of the rest.
wiki.dictionary.filter_extremes(
    no_below=20,
    no_above=0.1,
    keep_n=DEFAULT_DICT_SIZE,
)

# Write the bag-of-words matrix (Matrix Market format) and the dictionary.
MmCorpus.serialize(bow_path, wiki, progress_cnt=10000, metadata=True)
wiki.dictionary.save_as_text(wordids_path)

# Reload the id->word mapping from disk and release the corpus object; per the
# upstream script this appears to use less memory than keeping
# wiki.dictionary alive.
dictionary = Dictionary.load_from_text(wordids_path)

del wiki

# Re-open the serialized bag-of-words corpus as the TF-IDF input.
mm = MmCorpus(bow_path)

# Fit the TF-IDF model and persist it.
tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
tfidf.save(tfidf_model_path)

# Store the TF-IDF-weighted vectors in Matrix Market format as well.
MmCorpus.serialize(tfidf_vectors_path, tfidf[mm], progress_cnt=10000)

logger.info('finished running %s', program)