In [1]:
import itertools
import os
import time

import chainer.serializers
import IPython
import numpy
import six
import sklearn.neighbors

import illust2comment
import illust2comment.model
import illust2comment.utility

In [2]:
# Notebook settings.
# Replace the paths below with the actual locations on your machine.

# Device selector: the model-loading cell only moves things to the GPU when this is >= 0.
GPU = -1

# Weights of the illustration (image) model.
IMAGE_MODEL_PATH = 'data/nico_illust_tag_v2.hdf5'

# Weights of the comment model and the vocabulary file loaded alongside it.
COMMENT_MODEL_PATH = 'data/nico_comment_feature_v1.hdf5'
VOCABULARY_PATH = 'data/nico_comment_feature_v1_vocabulary.txt'

# TSV file the candidate comments are read from.
COMMENT_DATA = 'data/seiga_comment_random.tsv'

# Image used as the query.
QUERY_IMAGE_URL = 'http://3d.nicovideo.jp/alicia/img/profile_character.png'

# Number of candidate comments to collect, and the `midsize` handed to the comment model.
TARGET_COMMENTS_NUM = 1000
HIDDEN_UNIT = 1024

# Markers put in front of each comment and used to pad it to a fixed length.
CHARACTER_START = '\n'
CHARACTER_END = ''

In [3]:
# Load the vocabulary, one entry per line. The file is read as bytes and every
# line is decoded explicitly, which behaves the same on Python 2 and Python 3;
# the `with` block guarantees the file handle is closed afterwards.
with open(VOCABULARY_PATH, "rb") as vocabulary_file:
    vocabulary = [line.rstrip().decode("utf-8") for line in vocabulary_file]

# Comment model: built from the vocabulary-derived embedder, then filled with
# the trained weights stored in COMMENT_MODEL_PATH.
character_embedder = illust2comment.model.WordEmbedder(vocabulary)
comment_model = illust2comment.model.FeatureWordModel(
    vocab_size=character_embedder.vecsize,
    midsize=HIDDEN_UNIT,
    output_feature_size=4096,
)
chainer.serializers.load_hdf5(COMMENT_MODEL_PATH, comment_model)

# Image model. NOTE(review): 406 is presumably the number of output tags the
# model was trained with -- confirm against illust2comment.model.ImageModel.
image_model = illust2comment.model.ImageModel(406)
chainer.serializers.load_hdf5(IMAGE_MODEL_PATH, image_model.functions)

# Pick the array module (`xp`) matching the selected device and move both
# models to the GPU when one was requested.
if GPU >= 0:
    chainer.cuda.check_cuda_available()
    chainer.cuda.get_device(GPU).use()
    xp = chainer.cuda.cupy
    image_model.functions.to_gpu()
    comment_model.to_gpu()
else:
    xp = numpy

In [4]:
class ImageFeatureExtractor(object):
    """Computes a feature vector for an image with the wrapped image model.

    Args:
        image_model: object exposing ``feature(array)`` whose result carries
            the computed values in a ``.data`` attribute.
        xp: array module (``numpy`` or ``cupy``) used to build the model input.
    """

    def __init__(self, image_model, xp):
        self.xp = xp
        self.image_model = image_model

    def get_image_feature_from_url(self, image_url):
        """Fetch ``image_url`` and return the feature of the downloaded image.

        The HTTP response is closed once the feature has been computed.
        """
        response = six.moves.urllib.request.urlopen(image_url)
        try:
            return self.get_image_feature(response)
        finally:
            response.close()

    def get_image_feature(self, image_path):
        """Return the feature of one image as a CPU array.

        Args:
            image_path: whatever ``illust2comment.utility.load_image`` accepts;
                this class passes it either a caller-supplied value or an open
                HTTP response.

        Returns:
            The first row of the model's feature output, copied to the CPU.
        """
        # Note: because of a bug at training time the mean image (128) ends up
        # being subtracted twice. The extra "- 128" here is kept on purpose so
        # inference matches what the model saw during training. Sorry.
        pixels = illust2comment.utility.img2array(illust2comment.utility.load_image(image_path))
        img_array = self.xp.array(pixels - 128)
        # Use the model given to the constructor, not the notebook-global one.
        return chainer.cuda.to_cpu(self.image_model.feature(img_array).data)[0]
    
class CommentFeatureExtractor(object):
    """Computes a feature vector for a comment with the wrapped comment model.

    Args:
        comment_model: object exposing ``reset_state()`` and ``feature(variable)``.
        character_embedder: object exposing ``embed_id(character)``.
        xp: array module (``numpy`` or ``cupy``) used to build the model input.
    """

    # Number of characters fed to the model per comment, start marker included.
    SEQUENCE_LENGTH = 30

    def __init__(self, comment_model, character_embedder, xp):
        self.xp = xp
        self.comment_model = comment_model
        self.character_embedder = character_embedder

    def get_comment_feature(self, comment):
        """Return the feature of ``comment`` as a CPU array.

        The comment is prefixed with CHARACTER_START, padded with CHARACTER_END
        and cut to exactly SEQUENCE_LENGTH characters; the characters are fed
        to the model one at a time and the output for the last one is returned.
        """
        # Use the model given to the constructor, not the notebook-global one.
        self.comment_model.reset_state()
        length = self.SEQUENCE_LENGTH
        # Padding with `length` end markers before slicing guarantees the list
        # always holds exactly `length` items, even for an empty comment.
        character_list = ([CHARACTER_START] + list(comment) + [CHARACTER_END] * length)[:length]

        predicted = None
        for character in character_list:
            char_id = self.character_embedder.embed_id(character)
            predicted = self.comment_model.feature(
                chainer.Variable(self.xp.array([char_id], dtype=self.xp.int32), volatile=True),
            )
        return chainer.cuda.to_cpu(predicted.data)[0]
    

In [5]:
# Wrap the loaded models in the extractor helpers; both use the same array module.
image_feature_extractor = ImageFeatureExtractor(image_model=image_model, xp=xp)
comment_feature_extractor = CommentFeatureExtractor(
    comment_model=comment_model,
    character_embedder=character_embedder,
    xp=xp,
)

In [6]:
# Collect up to TARGET_COMMENTS_NUM distinct comments together with their
# features. `features[i]` is the feature of `comments[i]`.
features = []
comments = []
# Mirrors `comments` so the duplicate check is a set lookup, not a list scan.
# Assumes comments are hashable (strings) -- TODO confirm against load_id_comments.
seen_comments = set()
n = 0
# The timestamps printed before and after the loop show how long it took.
print(time.time())
for _content_id, comment in illust2comment.utility.load_id_comments(COMMENT_DATA):
    if comment in seen_comments:
        continue

    seen_comments.add(comment)
    features.append(comment_feature_extractor.get_comment_feature(comment))
    comments.append(comment)

    n = len(comments)
    if n % 100 == 0:
        print(n)
    # Stop at exactly TARGET_COMMENTS_NUM comments; the earlier `>` comparison
    # only broke out after one comment too many had been stored.
    if n >= TARGET_COMMENTS_NUM:
        break
print(time.time())


1458515970.65
100
200
300
400
500
600
700
800
900
1000
1458516206.46


  return 1 / (1 + numpy.exp(-x))


In [7]:
# Index the comment features for Euclidean nearest-neighbour lookup,
# returning 10 neighbours per query.
engine = sklearn.neighbors.NearestNeighbors(
    n_neighbors=10,
    algorithm='auto',
    metric='euclidean',
)
# Left as the final bare expression so the notebook displays the estimator repr.
engine.fit(features)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=1, n_neighbors=10, p=2, radius=1.0)

In [8]:
image_url = QUERY_IMAGE_URL
# Uncomment to show the query image inline:
# IPython.display.display(IPython.display.Image(url=image_url))
feature = image_feature_extractor.get_image_feature_from_url(image_url)
# kneighbors works on a batch of queries: wrap the single feature in a list
# and read row 0 of each returned array.
distances_batch, target_indices_batch = engine.kneighbors([feature])
# Built-in zip replaces itertools.izip, which exists only on Python 2.
for distance, target_index in zip(distances_batch[0], target_indices_batch[0]):
    print(distance)
    print(comments[target_index])



3791.24748021
ﾝｷﾞﾓﾁﾞｲｲｲｲ!!ｵｫﾝ!
3796.25526048
工藤艦長「」ｶﾞﾀｯ
3802.24265758
E-5は道中大破多すぎる・・・航巡入れて制空権とりつつ川内に夜偵積んで落としたなあ
3812.82832493
加賀さんが変態すぎるｗｗｗｗ
3813.08317757
ウチは扶桑姉妹で駄目だったので投入…楽々殴ってくれます
3814.73396231
↑まな板が二枚・・・くるぞ提督！
3817.62022437
加賀「ここは王道の肉まんと元祖のあんまんと魅惑のピザまんですね。ここは譲れません。」
3819.7090064
駆逐艦(蟹工船)
3820.50160312
提督「青葉よくやった。次は伊勢と古鷹だ。わかってるな」
3820.95136821
葛城の機関部は陽炎型のだから無問題(ｦｨ 龍鳳? 知らない子ですね


  self.y = 1 / (1 + numpy.exp(-x[0]))
