In [None]:
# Running this cell prompts for an authorization code.
# Open the printed link, sign in to your Google account,
# copy the authorization code, and paste it into the prompt.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os

# The working directory on the mounted Drive is built from the project
# folder name and the chapter number.
project = 'sample_data'
chapter = 4
chapter_dir = f'/content/drive/MyDrive/{project}/chapter-{chapter}/'
os.chdir(chapter_dir)

# ４章 言語データの加工・可視化を行う１０本ノック

## ノック５１：テキストファイルを読み込もう

In [None]:
# List the contents of data/ (explicit magic form of the automagic `ls`).
%ls data/

In [None]:
# Read the whole Shift-JIS encoded text file into a single string and show it.
with open('data/hashire_merosu.txt', encoding='shift-jis', mode='r') as src:
    content = src.read()
print(content)

## ノック５２：本文を抽出して１つに纏めよう

In [None]:
# Collapse every run of whitespace (newlines included) into a single space.
tokens = content.split()
content = ' '.join(tokens)
content

In [None]:
import unicodedata

# Apply Unicode NFKC normalization and rebind `content` to the result.
normalized = unicodedata.normalize('NFKC', content)
content = normalized
content

In [None]:
import re

# Capture the span that begins at one '#地から1字上げ]' marker and ends at a
# later one; group 1 of the match is kept as the body text.
pattern = re.compile(r'^.+(#地から1字上げ].+#地から1字上げ]).+$')
matched = pattern.match(content)
body = matched.group(1)
print(body)

In [None]:
# Remove the marker + dashed-separator text and the ' [#地から1字上げ]' marker
# text from the extracted body.
body = (
    body
    .replace('#地から1字上げ] ------------------------------------------------------- ', '')
    .replace(' [#地から1字上げ]', '')
)
body

## ノック５３：本文以外の項目を取り出そう

In [None]:
# The first two lines of the file are kept as title and author
# (each still carries whatever line ending readline() returned).
with open('data/hashire_merosu.txt', encoding='shift-jis', mode='r') as src:
    title, author = src.readline(), src.readline()
print(title)
print(author)

In [None]:
# Remove newline characters from both strings, then show them in the same order.
title = title.replace('\n', '')
author = author.replace('\n', '')
print(title)
print(author)

In [None]:
# Re-read the file, this time as a list with one element per line.
# Note: `content` is rebound from a string to a list here.
with open('data/hashire_merosu.txt', encoding='shift-jis', mode='r') as src:
    content = list(src)
content

In [None]:
import pandas as pd

# One row per line of the file in a single 'text' column; newline characters
# are then removed from every row.
df = pd.DataFrame({'text': content})
df['text'] = df['text'].str.replace('\n', '')
df

In [None]:
# Keep only the rows that mention a publication ('日公開') or revision ('日修正') date.
is_date_row = df['text'].str.contains('日公開') | df['text'].str.contains('日修正')
date = df[is_date_row].copy()
print(date)

# Drop the trailing '公開' / '修正' words.
for suffix in ('公開', '修正'):
    date['text'] = date['text'].str.replace(suffix, '')
print(date)

# Turn the year/month/day kanji into '/' separators (the day kanji is removed).
for kanji, separator in (('年', '/'), ('月', '/'), ('日', '')):
    date['text'] = date['text'].str.replace(kanji, separator)
print(date)

# Convert the cleaned strings to datetimes and confirm the resulting dtype.
date['text'] = pd.to_datetime(date['text'])
print(date)
date.dtypes

In [None]:
# Row 0 and row 1 of the single-column `date` frame are taken as the
# release date and the update date respectively.
release_date = date.iat[0, 0]
update_date = date.iat[1, 0]
print(release_date)
print(update_date)

# Store the difference under its own name. Rebinding `date` to the result
# would replace the DataFrame, so re-running this cell would fail on
# `date.iat[...]` and the name would silently change type.
elapsed = update_date - release_date
print(elapsed)

In [None]:
# Assemble a one-row table holding the metadata and body extracted so far.
booklist_columns = ['title', 'author', 'release_date', 'update_date', 'body']
booklist_row = [title, author, release_date, update_date, body]
booklist = pd.DataFrame([booklist_row], columns=booklist_columns)
booklist

## ノック５４：形態素解析で単語に分割しよう

In [None]:
%%bash

# Install MeCab, the UTF-8 IPA dictionary and the development headers.
# apt-get is used instead of apt because apt's CLI is not meant for scripts.
apt-get install -yq \
  mecab \
  mecab-ipadic-utf8 \
  libmecab-dev
pip install -q mecab-python3

# Expose the system mecabrc under /usr/local/etc
# (presumably where mecab-python3 looks for it; confirm).
# -f replaces an existing link, so re-running the cell does not end with
# "File exists" and a non-zero exit status.
mkdir -p /usr/local/etc
ln -sf /etc/mecabrc /usr/local/etc/mecabrc

In [None]:
# Show the installed packages whose name contains "mecab"
# (explicit magic form of the automagic `pip`).
%pip list | grep mecab

In [None]:
import MeCab

# Run MeCab over the body text stored in the first booklist row and split
# its textual output on newlines.
tagger = MeCab.Tagger()
body = booklist.loc[0, 'body']
mecab_output = tagger.parse(body)
parsed = mecab_output.split('\n')
parsed[:4]

In [None]:
# Show the last four entries of the parsed output.
parsed[-4:]

In [None]:
# Drop MeCab's 'EOS' terminator line and the empty string left by the final
# newline. Filtering by value instead of slicing off the last two items keeps
# this cell idempotent: re-running it no longer discards two more real tokens.
parsed = [line for line in parsed if line not in ('EOS', '')]
parsed[-4:]

## ノック５５：分割した単語をデータフレームで保持しよう


In [None]:
def _split_mecab_line(line):
    """Split one MeCab output line into [surface, feature1, feature2, ...].

    The surface form is separated from the feature string by the first tab;
    only the feature string is split on commas. Splitting the whole line on
    both delimiters would break rows whose surface form is itself a comma.
    A line without a tab is returned as a one-element list.
    """
    surface, _, feature = line.partition('\t')
    return [surface, *(feature.split(',') if feature else [])]


values = [_split_mecab_line(line) for line in parsed]
values[:4]

In [None]:
import pandas as pd

# Column labels: the surface form followed by the feature fields.
columns = [
    '表層形',
    '品詞', '品詞細分類1', '品詞細分類2', '品詞細分類3',
    '活用型', '活用形',
    '原形', '読み', '発音',
]
mecab_df = pd.DataFrame(values, columns=columns)
print(len(mecab_df))
mecab_df.head(4)

## ノック５６：名詞と動詞を取り出そう

In [None]:
# Count rows per (base form, part of speech) pair and print them, largest first.
pair_counts = mecab_df.groupby(['原形', '品詞']).size()
print(pair_counts.sort_values(ascending=False))

In [None]:
# Rows whose part-of-speech column is exactly '名詞' (noun).
is_noun = mecab_df['品詞'] == '名詞'
noun = mecab_df.loc[is_noun]
noun

In [None]:
# NOTE: despite its name, `verb` keeps both nouns ('名詞') and verbs ('動詞').
noun_or_verb = mecab_df['品詞'].isin(['名詞', '動詞'])
verb = mecab_df.loc[noun_or_verb]
verb

## ノック５７：不要な単語を除外しよう

In [None]:
# Load the whitespace-separated stop-word list.
# The encoding is stated explicitly, like the other open() calls in this
# notebook, so the result does not depend on the platform's default locale.
# NOTE(review): assumes the file is UTF-8 (the Colab default) — confirm.
with open('data/stop_words.txt', mode='r', encoding='utf-8') as f:
    stop_words = f.read().split()
stop_words

In [None]:
# Drop rows whose base form is a stop word; the row count is printed before
# and after filtering so the effect is visible.
print(len(noun))
keep_noun = ~noun['原形'].isin(stop_words)
noun = noun.loc[keep_noun]
print(len(noun))
display(noun.head())

print(len(verb))
keep_verb = ~verb['原形'].isin(stop_words)
verb = verb.loc[keep_verb]
print(len(verb))
display(verb.head())

## ノック５８：単語の使用状況をグラフで可視化しよう

In [None]:
# Frequency of each noun base form, largest first, reduced to the top ten
# rows with the frequency stored in a 'count' column.
count = (
    noun.groupby('原形')
    .size()
    .sort_values(ascending=False)
    .rename('count')
    .reset_index()
    .head(10)
)
count

In [None]:
# %pip installs into the environment of the running kernel, whereas !pip
# runs whichever pip happens to be first on the shell PATH.
%pip install -q japanize-matplotlib

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import japanize_matplotlib  # imported for its side effect (presumably Japanese font setup — confirm)

# Horizontal bar chart: frequency on the x axis, base form on the y axis.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=count['count'], y=count['原形'], ax=ax)

## ノック５９：単語の傾向をWordCloudで可視化してみよう

In [None]:
# Install the fonts-ipafont-gothic package non-interactively (-y) and quietly (-q).
!apt-get -yq install fonts-ipafont-gothic

In [None]:
# List the font directory (explicit magic form of the automagic `ls`).
%ls /usr/share/fonts/opentype/ipafont-gothic

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import japanize_matplotlib

# The font path must be absolute. The notebook's working directory is the
# chapter folder on Drive, so a relative 'usr/share/...' path would be looked
# up there and the font file could not be opened.
font_path = '/usr/share/fonts/opentype/ipafont-gothic/ipagp.ttf'

# Word cloud of noun base forms; the regexp keeps only tokens of two or more
# word characters. The figure is saved as a PNG and then shown.
noun_base_text = ' '.join(noun['原形'].values)
cloud = WordCloud(background_color='white', font_path=font_path, regexp=r"\w{2,}").generate(noun_base_text)
plt.figure(figsize=(10, 5))
plt.imshow(cloud)
plt.axis("off")
plt.savefig('data/wc_noun_base_2.png')
plt.show()

In [None]:
# Word cloud of noun base forms with no custom regexp passed to WordCloud;
# the figure is saved as a PNG and then shown.
noun_base_text = ' '.join(noun['原形'].values)
cloud = WordCloud(background_color='white', font_path=font_path).generate(noun_base_text)
plt.figure(figsize=(10, 5))
plt.imshow(cloud)
plt.axis("off")
plt.savefig('data/wc_noun_base_1.png')
plt.show()

In [None]:
# Word cloud of noun surface forms (tokens of two or more word characters);
# the figure is saved as a PNG and then shown.
noun_surface_text = ' '.join(noun['表層形'].values)
cloud = WordCloud(background_color='white', font_path=font_path, regexp=r"\w{2,}").generate(noun_surface_text)
plt.figure(figsize=(10, 5))
plt.imshow(cloud)
plt.axis("off")
plt.savefig('data/wc_noun_surface.png')
plt.show()

In [None]:
# Word cloud of the base forms held in `verb` (tokens of two or more word
# characters); the figure is saved as a PNG and then shown.
noun_verb_text = ' '.join(verb['原形'].values)
cloud = WordCloud(background_color='white', font_path=font_path, regexp=r"\w{2,}").generate(noun_verb_text)
plt.figure(figsize=(10, 5))
plt.imshow(cloud)
plt.axis("off")
plt.savefig('data/wc_noun-verb_base.png')
plt.show()

## ノック６０：n-gramを作ってみよう

In [None]:
# Surface forms of all tokens, in order, as a plain Python list.
target = mecab_df['表層形'].tolist()
len(target)

In [None]:
from nltk import ngrams

# ngrams() returns a one-shot iterator. Materializing it as a list means the
# bigrams can be consumed more than once, e.g. when a later cell is re-run,
# instead of silently yielding nothing the second time.
bigram = list(ngrams(target, 2))

In [None]:
import collections

# Tally how often each bigram occurs and print the complete tally.
counter = collections.Counter()
counter.update(bigram)
print(counter)