# Описание ноутбука
Задача: объединить несколько датасетов и создать единую базу данных песен и их транскрипций

In [1]:
import os
import shutil
from tqdm import tqdm_notebook
import re
import pandas as pd

from chord_scripts import format_fname, secs_to_string, string_to_secs, time_delta

In [2]:
# Project root: two directory levels above the notebook's working directory.
# The previous expression concatenated os.getcwd() and '../../..' without a
# path separator, so the first '..' was glued onto the last directory name and
# only two levels were actually climbed. os.path.join makes that explicit and
# resolves to the same location.
BASE_DIR = os.path.realpath(os.path.join(os.getcwd(), '..', '..'))
# Root folders for the audio files and for the parsed chord transcriptions.
AUDIO_DIR = os.path.realpath(os.path.join(BASE_DIR, 'audio'))
PARSED_DIR = os.path.realpath(os.path.join(BASE_DIR, 'parsed'))

# Приводим названия файлов к унифицированному виду

## Перемещаем музыку

Создаём единый датасет загруженных песен

billboard датасет

In [3]:
# Load the billboard song table (first CSV column is the index) and tag every
# row with the name of its source dataset.
bb_csv_path = BASE_DIR+'/dsets/billboard_songs_ds.csv'
bb_ds = pd.read_csv(bb_csv_path, index_col=0).assign(ds='billboard')
bb_ds.head()

Unnamed: 0,title,artist,number,duration,t_eps,href,status,ds
0,i_dont_mind,james_brown,3,02:31,1.0,,Downloaded,billboard
1,youve_got_a_friend,roberta_flack_and_donny_hathaway,4,03:27,0.0,,Downloaded,billboard
2,the_rose,bette_midler,6,03:41,-1.0,,Downloaded,billboard
3,an_innocent_man,billy_joel,10,05:18,0.0,,Downloaded,billboard
4,lookin_for_love,johnny_lee,12,03:32,0.0,,Downloaded,billboard


Датасет Isophonics (далее в ноутбуке он называется chordlab)

In [4]:
# Load the chordlab song table (first CSV column is the index) and tag every
# row with the name of its source dataset.
cl_csv_path = BASE_DIR+'/dsets/chordlab_songs_ds.csv'
cl_ds = pd.read_csv(cl_csv_path, index_col=0).assign(ds='chordlab')
cl_ds.head()

Unnamed: 0,title,artist,number,duration,t_eps,href,status,ds
0,your_mother_should_know,the_beatles,905,02:30,0.0,,Downloaded,chordlab
1,baby_youre_a_rich_man,the_beatles,910,03:04,0.0,,Downloaded,chordlab
2,flying,the_beatles,903,02:17,0.0,,Downloaded,chordlab
3,hello_goodbye,the_beatles,907,03:31,0.0,,Downloaded,chordlab
4,i_am_the_walrus,the_beatles,906,04:37,0.0,,Downloaded,chordlab


Общий датасет

In [5]:
# Build the united table from the songs that were actually downloaded in each
# source dataset. pd.concat replaces DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0.
un_ds = pd.concat(
    [bb_ds[bb_ds['status']=='Downloaded'], cl_ds[cl_ds['status']=='Downloaded']],
    ignore_index=True,  # renumber the rows 0..N-1 across both datasets
)
# After filtering, 'status' equals 'Downloaded' in every row, so it carries no information.
un_ds = un_ds.drop('status', axis=1)
un_ds.shape

(1029, 7)

Сохраняем общий датасет

In [6]:
# Write the merged song table to the same folder the source CSVs were read from.
united_csv_path = BASE_DIR+'/dsets/united_songs_ds.csv'
un_ds.to_csv(united_csv_path)

Копируем все песни в общую папку (исходные файлы остаются на месте)

In [7]:
# Copy every downloaded song into one shared folder. The new file name gets a
# short dataset prefix: 'bb' for billboard rows, 'cl' for all other rows.
united_audio_dir = '{}/{}'.format(AUDIO_DIR, 'united')
# shutil.copy does not create missing directories, so create the destination
# up front; this keeps the cell runnable on a fresh checkout.
os.makedirs(united_audio_dir, exist_ok=True)
for ind, row in tqdm_notebook(un_ds.iterrows(), total=len(un_ds)):
    prename = 'bb' if row['ds'] == 'billboard' else 'cl'
    fname = '{}_{}-{}.mp3'.format(row['number'], row['artist'], row['title'])
    # Source files live in a per-dataset sub-folder named after the 'ds' value.
    src_name = '{}/{}/{}'.format(AUDIO_DIR, row['ds'], fname)
    dst_name = '{}/{}_{}'.format(united_audio_dir, prename, fname)
    shutil.copy(src_name, dst_name)

A Jupyter Widget




## Перемещаем транскрипции

### chordlab

Копируем транскрипции chordlab в единую папку

Транскрипции the beatles

In [8]:
# Copy the Beatles transcriptions into the shared folder under unified names.
beatles_dir = PARSED_DIR+'/chordlab/The_Beatles/'
united_lab_dir = PARSED_DIR+'/united/'
# shutil.copy does not create missing directories; make sure the destination
# exists before the first transcription is written.
os.makedirs(united_lab_dir, exist_ok=True)

cl_beatles = un_ds[(un_ds['ds']=='chordlab')&(un_ds['artist']=='the_beatles')]
beatles_albums = os.listdir(beatles_dir)
for ind, row in cl_beatles.iterrows():
    # The last two digits of 'number' are used as the track number, everything
    # before them as the album number (left-padded to two digits).
    alb_num = str(row['number'])[:-2]
    alb_num = '0'+alb_num if len(alb_num)==1 else alb_num
    # Album folders are matched by prefix after stripping 'CD' from their names
    # (presumably so that multi-disc folders match too -- verify against the data).
    # The first matching folder wins.
    alb = [name for name in beatles_albums if name.replace('CD','').startswith(alb_num)][0]

    song_num = str(row['number'])[-2:]
    fname = '{}_-_{}.lab'.format(song_num, row['title'])
    nfname = 'cl_{}_{}-{}.lab'.format(row['number'], row['artist'], row['title'])
    shutil.copy(beatles_dir+alb+'/'+fname, united_lab_dir+nfname)

Транскрипции carol king

In [9]:
# Carol King: source files sit in one flat folder and are named
# '0<number>_<title>.lab'; copy them to the shared folder under unified names.
cking_mask = (un_ds['ds']=='chordlab')&(un_ds['artist']=='carol_king')
cl_cking = un_ds[cking_mask]
for idx, song in cl_cking.iterrows():
    src_path = PARSED_DIR+'/chordlab/Carol_King/'+'0{}_{}.lab'.format(song['number'], song['title'])
    dst_path = PARSED_DIR+'/united/'+'cl_{}_{}-{}.lab'.format(song['number'], song['artist'], song['title'])
    shutil.copy(src_path, dst_path)

Транскрипции queen

In [10]:
# Queen: the first character of 'number' selects the album folder (the first
# folder whose name ends with it), the remaining characters prefix the file name.
queen_mask = (un_ds['ds']=='chordlab')&(un_ds['artist']=='queen')
cl_queen = un_ds[queen_mask]
queen_albums = os.listdir(PARSED_DIR+'/chordlab/Queen/')
for idx, song in cl_queen.iterrows():
    number_str = str(song['number'])
    album_key = number_str[0]
    album = [name for name in queen_albums if name.endswith(album_key)][0]

    track_part = number_str[1:]
    src_fname = '{}_{}.lab'.format(track_part, song['title'])
    dst_fname = 'cl_{}_{}-{}.lab'.format(song['number'], song['artist'], song['title'])
    shutil.copy(PARSED_DIR+'/chordlab/Queen/'+album+'/'+src_fname, PARSED_DIR+'/united/'+dst_fname)

### billboard

In [11]:
# Billboard: each song's transcription is stored as
# '<PARSED_DIR>/billboard/<number left-padded with zeros to 4 chars>/full.lab'.
bb_dled = un_ds[un_ds['ds']=='billboard']
for ind, row in tqdm_notebook(bb_dled.iterrows(), total=len(bb_dled), desc='Walking rows'):
    padded_number = str(row['number']).rjust(4, '0')
    src_path = '{}/billboard/{}/full.lab'.format(PARSED_DIR, padded_number)
    # The destination name keeps the unpadded number.
    dst_path = '{}/united/bb_{}_{}-{}.lab'.format(PARSED_DIR, row['number'], row['artist'], row['title'])
    shutil.copy(src_path, dst_path)

A Jupyter Widget




In [12]:
# Sanity check: the shared folder should hold as many entries as there are songs.
united_lab_count = len(os.listdir(PARSED_DIR+'/united/'))
print('Все транскрипции перемещены:', united_lab_count==len(un_ds))

Все транскрипции перемещены: True


# Проверка наличия всех пар песен и транскрипций

In [13]:
# Report every audio file without a matching transcription and every
# transcription without a matching audio file. Names are paired by replacing
# the last three characters of the file name with the other extension.
united_audio_dir = AUDIO_DIR+'/united'
united_lab_dir = PARSED_DIR+'/united'
for audio_name in os.listdir(united_audio_dir):
    if os.path.isfile(united_lab_dir+'/'+audio_name[:-3]+'lab'):
        continue
    print('Аудио',audio_name,'нет транскрипции')
for lab_name in os.listdir(united_lab_dir):
    if os.path.isfile(united_audio_dir+'/'+lab_name[:-3]+'mp3'):
        continue
    print('Транскрипция',lab_name,'нет аудио')

Ничего не вывелось, значит названия всех файлов совпадают