# karuuta_corpus

##  カラオケ人気曲の取得

In [1]:
!pip install pandas beautifulsoup4 lxml html5lib tqdm



In [2]:
!mkdir -p ./csv ./lyrics

In [3]:
import pandas as pd

In [4]:
# カラ鉄のWebサイトからカラオケランキング表を取得
karaoke_ranking_url = 'https://www.karatetsu.com/ranking/total/2019'
karaoke_ranking_dfs = pd.read_html(karaoke_ranking_url, displayed_only=False)

In [5]:
# 100位ごと表が分割されているので結合
karaoke_ranking_df = pd.concat(karaoke_ranking_dfs)
# 途中の見出し行を削除
karaoke_ranking_df = karaoke_ranking_df[karaoke_ranking_df['順位'] != '順位']
len(karaoke_ranking_df)

10000

In [6]:
karaoke_ranking_df.head()

Unnamed: 0,順位,曲名,歌手名,共通番号
0,﻿1位,Ｌｅｍｏｎ,米津　玄師,725614
1,2位,マリーゴールド,あいみょん,750270
2,3位,Ｐｒｅｔｅｎｄｅｒ,Ｏｆｆｉｃｉａｌ髭男ｄｉｓｍ,765858
3,4位,小さな恋のうた,ＭＯＮＧＯＬ８００,56612
4,5位,さよならエレジー,菅田　将暉,724828


In [7]:
# normalize
karaoke_ranking_df['曲名'] = karaoke_ranking_df['曲名'].str.normalize('NFKC')
karaoke_ranking_df['曲名'] = karaoke_ranking_df['曲名'].str.replace('’', '\'')
karaoke_ranking_df['歌手名'] = karaoke_ranking_df['歌手名'].str.normalize('NFKC')
karaoke_ranking_df['歌手名'] = karaoke_ranking_df['歌手名'].str.replace('’', '\'')
karaoke_ranking_df.head()

Unnamed: 0,順位,曲名,歌手名,共通番号
0,﻿1位,Lemon,米津 玄師,725614
1,2位,マリーゴールド,あいみょん,750270
2,3位,Pretender,Official髭男dism,765858
3,4位,小さな恋のうた,MONGOL800,56612
4,5位,さよならエレジー,菅田 将暉,724828


In [8]:
# CSVに保存
karaoke_ranking_df.to_csv('csv/karaoke_ranking.csv', encoding = "utf_8_sig")

##  歌詞の取得

In [9]:
import pandas as pd
import urllib.request
import time
import re
import csv
import os
from bs4 import BeautifulSoup
from tqdm import tqdm

In [10]:
def get_lyric_filename(title, artist):
    return title.replace('/', '') + '_' + artist.replace('/','') + '.txt'

In [11]:
# 歌詞をJ-Lyricから取得
def fetch_lyric(url):
    response = urllib.request.urlopen(url)
    time.sleep(1)
    bs = BeautifulSoup(response.read(), 'lxml')
    element = bs.find('div', id='mnb').find('div', class_='lbdy').find('p', id='Lyric')
    lyric = str(element).replace('<p id="Lyric">', '').replace('</p>', '').replace('<br/>', '\n')
    return lyric

In [12]:
def filter_for_keyword(str):
    str = re.sub(r'[\(\-〜].+?[\)\-〜]', '', str)
    str = str.replace('･', ' ').replace('…', ' ').replace('＆', ' ')
    return str

def fetch_lyric_url(title, artist):
    payload = {
        'kt': filter_for_keyword(title),
        'ct': 2,
        'ka': filter_for_keyword(artist),
        'ca': 2,
        'kl': '',
        'cl': 2
    }
    query = urllib.parse.urlencode(payload)
    search_url = 'http://search2.j-lyric.net/index.php?' + query
    response = urllib.request.urlopen(search_url)
    time.sleep(1)
    bs = BeautifulSoup(response.read(), 'lxml')
    element = bs.find('div', id='mnb').find('div', class_='bdy').find('p', class_='mid').find('a')
    lyric_url = element.get('href')
    return lyric_url

In [13]:
def store_lyric(title, artist, lyric_url=None):
    lyric_filename = get_lyric_filename(title, artist)
    if os.path.exists('lyrics/' + lyric_filename):
        # do nothing
        return
    try:
        if lyric_url is None:
            lyric_url = fetch_lyric_url(title, artist)
        lyric = fetch_lyric(lyric_url)
    except AttributeError:
        with open('csv/fetch_lyric_errors.csv', 'a') as f:
            writer = csv.writer(f, lineterminator='\n')
            writer.writerow([title, artist])
            return
    with open('lyrics/' + lyric_filename, 'w') as f:
        f.write(lyric)

In [14]:
karaoke_ranking_df = pd.read_csv('csv/karaoke_ranking.csv')
# 時間がかかるので上位1000件に絞る
karaoke_ranking_df = karaoke_ranking_df.head(1000)
karaoke_ranking_df.tail()

Unnamed: 0.1,Unnamed: 0,順位,曲名,歌手名,共通番号
995,96,996位,憂、燦々(ゆう、さんさん),クリープハイプ,609360
996,97,997位,ロードトゥ饅頭マスター!,チャン&ユン[濱 健人/小西 成弥],749181
997,98,998位,イマジネーション,SPYAIR,595699
998,99,999位,世界はそれを愛と呼ぶんだぜ,サンボマスター,158705
999,100,1000位,センセンフコク,石谷 春貴,723913


In [15]:
for title, artist in tqdm(zip(karaoke_ranking_df['曲名'], karaoke_ranking_df['歌手名'])):
    store_lyric(title=title, artist=artist)

1000it [00:33, 29.86it/s]


In [16]:
# 取得エラーになった楽曲は手動で検索して取得
# csv/fetch_lyrics_manually.csv
# ```
# title,artist,lyric_url
# シャルル,バルーン feat.flower,https://j-lyric.net/artist/a05ccf6/l041b1b.html
# ...
# ```

fetch_manually_df = pd.read_csv('csv/fetch_lyrics_manually.csv')
for title, artist, lyric_url in tqdm(zip(fetch_manually_df['title'], fetch_manually_df['artist'], fetch_manually_df['lyric_url'])):
    store_lyric(title=title, artist=artist, lyric_url=lyric_url)

70it [00:00, 61941.20it/s]


## 狩歌の語句を含むかチェック

In [17]:
!pip install pandas mecab-python3 tqdm



In [18]:
!brew install mecab mecab-ipadic

To reinstall 0.996, run:
  brew reinstall mecab
To reinstall 2.7.0-20070801, run:
  brew reinstall mecab-ipadic


In [19]:
import MeCab
from tqdm import tqdm
import numpy as np
import pandas as pd

In [20]:
tagger = MeCab.Tagger('-Ochasen')
tagger.parse('おはようございます。')

'おはよう\tオハヨウ\tおはよう\t感動詞\t\t\nござい\tゴザイ\tござる\t助動詞\t五段・ラ行特殊\t連用形\nます\tマス\tます\t助動詞\t特殊・マス\t基本形\n。\t。\t。\t記号-句点\t\t\nEOS\n'

In [21]:
karaoke_ranking_df = pd.read_csv('csv/karaoke_ranking.csv')
# 時間がかかるので上位1000件に絞る
karaoke_ranking_df = karaoke_ranking_df.head(1000)
karuuta_terms_df = pd.read_csv('karuuta_terms.csv')
karuuta_terms_df.head()

Unnamed: 0,card_id,word,ruby,point
0,1,君,キミ,1
1,2,あなた,アナタ,1
2,3,人,ヒト,1
3,4,僕,ボク,1
4,5,今,イマ,1


In [22]:
# カードごとのカラムを持つ行の雛形を生成
keys = []
keys.append('title')
keys.append('artist')
types_dict = {
    'title': 'string',
    'artist': 'string',
}
for card_id in karuuta_terms_df['card_id']:
    types_dict[card_id] = 'int64'
    keys.append(card_id)

In [23]:
karuuta_corpus_df = pd.DataFrame(index=[], columns=keys)

for title, artist in tqdm(zip(karaoke_ranking_df['曲名'], karaoke_ranking_df['歌手名'])):
    lyric_filename = get_lyric_filename(title, artist)
    if not os.path.exists('lyrics/' + lyric_filename):
        continue
    row = np.concatenate([np.array([title, artist]), np.zeros(100, dtype='int')])
    karuuta_corpus_df = pd.concat([karuuta_corpus_df, pd.DataFrame([row], columns=keys)])

karuuta_corpus_df = karuuta_corpus_df.reset_index(drop=True)
for card_id in karuuta_terms_df['card_id']:
    karuuta_corpus_df[card_id] = pd.to_numeric(karuuta_corpus_df[card_id], errors='coerce')
karuuta_corpus_df.head()

1000it [00:04, 241.47it/s]


Unnamed: 0,title,artist,1,2,3,4,5,6,7,8,...,91,92,93,94,95,96,97,98,99,100
0,Lemon,米津 玄師,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,マリーゴールド,あいみょん,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Pretender,Official髭男dism,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,小さな恋のうた,MONGOL800,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,さよならエレジー,菅田 将暉,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
for index, row in tqdm(karuuta_corpus_df.iterrows()):
    with open('lyrics/' + get_lyric_filename(row['title'], row['artist']), mode='r') as f_lyric:
        for line in f_lyric:
            node = tagger.parseToNode(line)
            while node:
                wclass = node.feature.split(',')
                for card_id, word, ruby in zip(karuuta_terms_df['card_id'], karuuta_terms_df['word'], karuuta_terms_df['ruby']):
                    if karuuta_corpus_df.loc[index, card_id] == 0 and word in wclass[6] and ruby in wclass[7]:
                        karuuta_corpus_df.loc[index, card_id] = 1
                node = node.next
karuuta_corpus_df.head()

970it [06:18,  2.56it/s]


Unnamed: 0,title,artist,1,2,3,4,5,6,7,8,...,91,92,93,94,95,96,97,98,99,100
0,Lemon,米津 玄師,0,1,0,0,1,1,1,1,...,0,0,0,0,1,0,0,0,0,0
1,マリーゴールド,あいみょん,1,0,0,0,0,0,1,0,...,0,1,0,1,0,0,0,0,0,1
2,Pretender,Official髭男dism,1,0,0,1,0,0,1,1,...,0,0,0,0,0,1,0,0,0,0
3,小さな恋のうた,MONGOL800,0,1,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,さよならエレジー,菅田 将暉,1,0,0,1,0,0,0,1,...,0,1,0,0,0,1,0,0,0,0


In [25]:
karuuta_corpus_df.to_csv('csv/karuuta_corpus.csv')