<a href="https://colab.research.google.com/github/Eihiro/roma/blob/master/NHK_blackout.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 環境構築

In [0]:
# Configure every package that dpkg has unpacked but not yet configured.
# Presumably run first so the apt/aptitude installs in the next cell are not blocked — TODO confirm.
!dpkg --configure -a

In [0]:
# NOTE(review): only mecab-python3 is version-pinned; pin the other packages
# once known-good versions are confirmed so the environment is reproducible.

# Crawler framework.
!pip install -q scrapy
# -y: the notebook shell is non-interactive, so apt must not wait for a
# confirmation prompt.
!apt install -q -y aptitude
# MeCab, its development headers, the UTF-8 IPA dictionary and helper tools.
!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y
!pip install -q mecab-python3==0.7
!pip install -q pydrive

# polyglot plus the packages installed alongside it.
!pip install -q polyglot
!pip install -q pyicu
!pip install -q pycld2
!pip install -q morfessor

### GCP接続

In [0]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the current Colab user with Google.
auth.authenticate_user()
# Build a PyDrive client that uses the application-default credentials.
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

### scrapy設定

In [0]:
# !scrapy startproject blackout

In [0]:
# %cd blackout
# !scrapy genspider google google.com -a "groovenauts"
# %cd ..

### クローラのリストア

In [4]:
# Copy the saved crawler project from the GCS bucket into the current
# directory (-m: parallel transfer, -r: recursive).
!gsutil -m cp -r gs://scrape-saishu/blackout  .

Copying gs://scrape-saishu/blackout/blackout/__init__.py...
/ [0/18 files][    0.0 B/914.5 KiB]   0% Done                                   / [1/18 files][    0.0 B/914.5 KiB]   0% Done                                   Copying gs://scrape-saishu/blackout/blackout/__pycache__/settings.cpython-36.pyc...
/ [1/18 files][    0.0 B/914.5 KiB]   0% Done                                   Copying gs://scrape-saishu/blackout/blackout/__pycache__/__init__.cpython-36.pyc...
/ [1/18 files][    0.0 B/914.5 KiB]   0% Done                                   Copying gs://scrape-saishu/blackout/blackout/items.py...
Copying gs://scrape-saishu/blackout/blackout/middlewares.py...
Copying gs://scrape-saishu/blackout/blackout/pipelines.py...
/ [1/18 files][    0.0 B/914.5 KiB]   0% Done                                   / [1/18 files][    0.0 B/914.5 KiB]   0% Done                                   / [1/18 files][    0.0 B/914.5 KiB]   0% Done                                   Copying gs://scrape-sais

### クロール実施

In [0]:
# Run the "google" spider from inside the Scrapy project directory.
# The cd happens only in the subshell, so the kernel's working directory is
# never changed: the relative paths used by later cells stay valid even if the
# crawl fails, is interrupted, or this cell is re-run.
!cd blackout && scrapy crawl google

### Word2Vec関連ライブラリ読み込み

In [0]:
from gensim.models import word2vec
import MeCab

### 形態素解析

In [0]:
from polyglot.text import Text

def split_into_words(text, tagger=None):
    """Tokenize ``text`` with MeCab and return the surface forms of selected nouns.

    The tagger output is read as tab-separated lines where column 0 is the
    surface form and column 3 is the part-of-speech string. Only tokens whose
    part of speech starts with one of the prefixes in ``keep_pos`` are kept.

    Args:
        text: The string to analyse.
        tagger: Optional object exposing ``parse(text) -> str`` in the same
            tab-separated layout. When omitted, a ``MeCab.Tagger("-Ochasen")``
            is created on first use and reused on later calls.

    Returns:
        A list of surface-form strings, in the order they occur in ``text``.
    """
    if tagger is None:
        # Constructing a Tagger on every call is wasteful when this function
        # is invoked once per input line, so keep one on the function object.
        tagger = getattr(split_into_words, "_tagger", None)
        if tagger is None:
            tagger = split_into_words._tagger = MeCab.Tagger("-Ochasen")

    # Part-of-speech prefixes to keep. An earlier, broader filter (verbs,
    # adjectives and every noun except numerals) was replaced by this one.
    keep_pos = ('名詞-一般', '名詞-サ変接続', '名詞-形容動詞語幹')

    words = []
    for line in tagger.parse(text).splitlines():
        chunks = line.split('\t')
        # Lines with fewer than four columns (e.g. the trailing "EOS") carry
        # no part-of-speech field and are skipped.
        if len(chunks) > 3 and chunks[3].startswith(keep_pos):
            words.append(chunks[0])
    return words

### Word2Vec学習

In [0]:
# Flat list of every kept token from all article lines, in reading order.
trainings = []

# The context manager closes the file even if tokenization raises. The
# explicit encoding removes the dependency on the platform default
# (assumes the crawler wrote UTF-8 — TODO confirm).
with open("./blackout/data/articles.txt", 'r', encoding='utf-8') as articles:
    for line in articles:
        # Skip empty surface forms.
        trainings.extend(word for word in split_into_words(line) if word != "")

# Print a short summary instead of dumping the whole corpus into the output.
print(len(trainings), trainings[:20])

In [0]:
# The whole token list is passed as one single "sentence".
# NOTE(review): context windows therefore span article boundaries, and gensim's
# optimized training path is reported to cap a sentence at 10,000 tokens —
# confirm the corpus stays below that or pass one token list per article.
model = word2vec.Word2Vec(
    [trainings],
    size=100,      # vector dimensionality (gensim 3.x parameter name)
    window=5,      # context window
    min_count=1,   # keep every word, even those seen once
    iter=100,      # training epochs (gensim 3.x parameter name)
)

### modelの保存

In [9]:
import os

# The target directory only exists if an earlier run uploaded a model and the
# restore step copied it back; create it so a first run does not fail here.
os.makedirs("./blackout/model", exist_ok=True)
model.save("./blackout/model/article_model")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


### 可視化用データ作成

In [0]:
def save_embedding_projector_files(model, vector_file, metadata_file):
    """Export the model's word vectors as two line-aligned text files.

    Line *i* of ``vector_file`` holds the tab-separated components of a word's
    vector (each formatted with ``%f``), and line *i* of ``metadata_file``
    holds that word. Both files are written as UTF-8 and overwritten if they
    already exist.

    Args:
        model: Object whose ``wv`` attribute supports ``wv[word]`` and exposes
            the vocabulary either as a ``vocab`` mapping (gensim < 4) or as an
            ``index_to_key`` sequence (gensim >= 4).
        vector_file: Path of the vector TSV file to write.
        metadata_file: Path of the metadata (one word per line) file to write.

    Returns:
        None.
    """
    keyed_vectors = model.wv
    try:
        # gensim < 4: the vocabulary is a dict keyed by word.
        words = list(keyed_vectors.vocab.keys())
    except AttributeError:
        # gensim >= 4 removed `vocab`; the word list is `index_to_key`.
        words = list(keyed_vectors.index_to_key)

    with open(vector_file, 'w', encoding='utf-8') as f, \
         open(metadata_file, 'w', encoding='utf-8') as g:

        # No header row is written to the metadata file.
        # NOTE(review): presumably because a single-column metadata file is
        # expected without a header by the embedding projector — confirm.
        for word in words:
            embedding = keyed_vectors[word]

            # One vector per line, components separated by tabs.
            f.write('\t'.join([('%f' % x) for x in embedding]) + '\n')

            # Matching word on the same line number.
            g.write(word + '\n')

In [0]:
# Write the vector and metadata files into the same directory as the saved model.
save_embedding_projector_files(
    model,
    vector_file="./blackout/model/vector",
    metadata_file="./blackout/model/metadata",
)

In [43]:
# Upload every file in ./blackout/model to the model folder of the GCS bucket.
!gsutil cp ./blackout/model/* gs://scrape-saishu/blackout/model/

Copying file://./blackout/model/article_model [Content-Type=application/octet-stream]...
Copying file://./blackout/model/metadata [Content-Type=application/octet-stream]...
Copying file://./blackout/model/vector [Content-Type=application/octet-stream]...
-
Operation completed over 3 objects/878.6 KiB.                                    


### カウント

In [13]:
import collections

# (word, count) pairs ordered from most to least frequent; words with equal
# counts keep the order in which they were first seen.
c = collections.Counter(trainings).most_common()

print(c)
# Prints a list of tuples, e.g. [('a', 4), ('c', 2), ('b', 1)]


[('停電', 79), ('設備', 50), ('電源', 38), ('使用', 26), ('機能', 25), ('停止', 24), ('利用', 24), ('確認', 23), ('消防', 22), ('発電', 20), ('機器', 18), ('危険', 17), ('自家', 17), ('コンセント', 17), ('発生', 16), ('事項', 16), ('確保', 16), ('電気', 16), ('プラグ', 16), ('防止', 15), ('データ', 15), ('防火', 14), ('施設', 14), ('非常', 14), ('燃料', 14), ('必要', 13), ('センター', 13), ('作動', 12), ('計画', 12), ('火災', 11), ('避難', 11), ('システム', 11), ('対象', 10), ('対応', 10), ('常用', 10), ('消火', 9), ('留意', 9), ('家庭', 9), ('お願い', 9), ('ベッド', 9), ('取扱', 9), ('体制', 8), ('点検', 8), ('関係', 8), ('通電', 8), ('製品', 8), ('終了', 8), ('コンピューター', 8), ('管理', 7), ('方法', 7), ('経路', 7), ('自動', 7), ('予想', 7), ('スイッチ', 7), ('電化', 7), ('恐れ', 7), ('対策', 6), ('次', 6), ('ガス', 6), ('事前', 6), ('復旧', 6), ('相談', 6), ('運転', 6), ('補給', 6), ('実施', 6), ('注意', 6), ('電話', 6), ('上げ', 6), ('センサー', 6), ('供給', 6), ('規模', 5), ('設置', 5), ('操作', 5), ('徹底', 5), ('一般', 5), ('設定', 5), ('措置', 5), ('サービス', 5), ('不能', 5), ('通報', 5), ('介護', 5), ('皆様', 5), ('背', 5), ('手元', 5), ('充電', 5), ('サーバ', 5

### 類似度計算

In [14]:
# Words whose vectors are closest to '停電', printed with their similarity score.
ret = model.wv.most_similar(positive=['停電'])
for word, similarity in ret:
    print(word, similarity)


措置 0.9791111946105957
防止 0.9735440015792847
設定 0.971636950969696
無理 0.9711635112762451
注意 0.9705602526664734
確実 0.9702361822128296
レンジ 0.966916561126709
おすすめ 0.9657365083694458
逆 0.9656738042831421
出火 0.9638320207595825


  if np.issubdtype(vec.dtype, np.int):


In [15]:
# Analogy-style query: words close to '停電' + 'システム' and far from 'コンピュータ'.
# Query through model.wv, as the previous similarity query does: calling
# most_similar on the model object itself is deprecated in gensim 3 (it emits
# a DeprecationWarning) and no longer exists in gensim 4.
out = model.wv.most_similar(positive=["停電", "システム"], negative=["コンピュータ"])
for x in out:
    print(x[0])

出火
無理
注意
おすすめ
V
レンジ
逆
部品
自転車
位置


  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):
