# Описание ноутбука
Парсинг песен из датасета Billboard

In [177]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from pydub import AudioSegment

from chord_scripts import format_fname, secs_to_string, string_to_secs, time_delta

from tqdm import tqdm_notebook
import re
import os

In [178]:
BASE_DIR = os.path.realpath(os.getcwd()+'../../..')

# Функции для парсинга

Во избежание скачивания каверов и т.п.

In [179]:
not_include = [r'\bcover\b', r'\bremix\b', r'\blive\b', 'исполнен', r'\bkaraoke\b'] # Запрещённые слова для названий песен todo: 'Live'?

Максимальное расхождение загруженной и распарсенной песни во времени в секундах

In [4]:
max_eps = 2

Загрузка записи в определённый файл

In [10]:
def download_file(artist, title, number, href, ds_name):
    dname = '/audio/'+ds_name+'/'
    fname = format_fname('{}_{}-{}.mp3'.format(number, artist,title))
    try:
        with open(BASE_DIR+dname+fname, "wb") as file:
            response = requests.get(href)
            file.write(response.content)
    except:
        raise Exception('Found but not downloaded')

Проверка правильности загрузки файла

In [11]:
def check_correct_dl(artist, title, number, duration, eps, ds_name):
    fname = BASE_DIR+'/audio/'+ds_name+format_fname('/{}_{}-{}.mp3'.format(number,artist,title))
    audio = AudioSegment.from_mp3(fname)
    dur_real = round(audio.duration_seconds)
    del audio
    return True if dur_real == string_to_secs(duration)+eps else False

In [12]:
check_correct_dl('25_or_6_to_4','chicago',176,'4:50',-1, ds_name='billboard')

True

Проверка аттрибутов песни: автора 'a', названия 't', длительности 'd'

In [13]:
def check_attributes(a,t,d,artist, title, duration, to_print = False):
    a,t,artist,title = map(lambda x: format_fname(x.lower(), ' '), [a,t,artist,title])
    # Если кавер или ремикс (автор и название могут быть перепутаны местами)
    if len(list(filter(lambda x: x in t, not_include)))>0\
     or len(list(filter(lambda x: x in a, not_include)))>0:
        if to_print: print('is cover')        
        return False
    # Если автор не совпадает с заданным (автор и название могут быть перепутаны местами)
    if len(list(filter(lambda x: x in a, artist.split()))) != len(artist.split())\
     and len(list(filter(lambda x: x in t, artist.split()))) != len(artist.split()):
        if to_print: print('wrong artist: {} {}'.format(artist,a))
        return False
    # Если название не совпадает с заданным (автор и название могут быть перепутаны местами)
    if len(list(filter(lambda x: x in t, title.split()))) != len(title.split())\
     and len(list(filter(lambda x: x in a, title.split()))) != len(title.split()):
        if to_print: print('wrong title: {} {}'.format(title, t))
        return False
    try:
        d = time_delta(needed=duration, real=d)
    except:
        return False
    if abs(d) > max_eps:
        return False
    return True

## Парсинг сайтов

### ipleer.fm

Нахождение информации о длительности музыки на сайте ipleer

In [8]:
def parse_duration_ipleer(track_html):
    d = track_html.find('em')
    while True:
        try:
            d.div.decompose()
        except:
            break
    while True:
        try:
            d.a.decompose()
        except:
            break
    return d.text.split()[0]

Парсинг самого сайта

In [14]:
def parse_ipleer(artist, title, duration):
    SEARCH_URL = 'https://ipleer.fm/search/q/'
    SONGS_URL = 'https://ipleer.fm'
    
    url = format_fname('{}{}+{}/'.format(SEARCH_URL, artist, title), space_replacer='+')
    list_page = requests.get(url=url)
    
    if list_page.status_code != 200:
        raise Exception('List_page_not_found')
    
    list_soup = BeautifulSoup(list_page.text, 'html.parser')
    tracks = list_soup.find_all('li',class_='track')
    ntracks = []
    for track_id,track in enumerate(tracks):
        try:
            a = track.find('span',class_='cartist').text
            t = track.find('span',class_='ctitle').text
            d = parse_duration_ipleer(track)
            h = track.find('a',class_='playlist-down').get('href')
        except:
            a,t,d, h = None, None, None, None
        if a and t and d and h:
            if check_attributes(a,t,d,artist,title,duration):
                d = time_delta(d,duration)
                ntracks.append({'artist':a, 'title':t, 'time_delta':d, 'href':h, 'track':track})
    
    if len(ntracks)==0:
        raise Exception('No_correct_songs_found',url)
    
    ntracks = sorted(ntracks, key= lambda x: abs(x['time_delta']))
    
    dur_eps = 0
    for track in ntracks:
        download_page = requests.get(SONGS_URL+track['href'])
        if download_page.status_code == 200:
            dur_eps = track['time_delta']
            break
    if download_page.status_code != 200:
        raise Exception('Download_page_not_found')
        
    download_soup = BeautifulSoup(download_page.text, 'html.parser')
    download_href = download_soup.find('a',class_='onesongblock-download').get('href')
    
    return download_href, download_page.url, dur_eps

In [15]:
parse_ipleer('ABBA','Take a chance on me','4:04')

('http://data.iplayer.info/file/1822708/VFV5dVRTS2RoS2orL1FZNkQ3dENIRG9vTUpJaGZPNThHaXQxNUorMXBzWFhhMW9MUy9NK3Nza1BXNUdoVzlHdENSNmVKSmdlTW1BOU9XRkUvb2NvTjllblh4akkyUlFNOG9lVEoxdXNucGpUT2s5czlTMm9XVm81b0ZpM1R5UTU/ABBA_-_Take_A_Chance_On_Me_(iPlayer.fm).mp3',
 'https://ru123.iplayer.info/song/1822708/ABBA_-_Take_A_Chance_On_Me/',
 0)

### mp3co.ooo

In [16]:
def parse_mp3co(artist, title, duration):
    SEARCH_URL = 'https://mp3co.ooo/s/'
    SONGS_URL = 'https://mp3co.ooo'
    
    url = format_fname('{}{}%20{}/'.format(SEARCH_URL, artist, title), space_replacer='%20')
    list_page = requests.get(url=url)
    
    if list_page.status_code != 200:
        raise Exception('List_page_not_found')
    
    list_soup = BeautifulSoup(list_page.text, 'html.parser')
    tracks = list_soup.find_all('tr')
    ntracks = []
    for track_id,track in enumerate(tracks):
        try:
            a,t = map(lambda x: x.text, track.find_all('a', class_='item'))
            d = track.find('td', class_='time').text
            h = track.find('td', class_='download').a.get('href')
        except:
            a,t,d, h = None, None, None, None
        if a and t and d and h:
            if check_attributes(a,t,d,artist,title,duration):
                d = time_delta(d,duration)
                ntracks.append({'artist':a, 'title':t, 'time_delta':d, 'href':h, 'track':track})
    
    if len(ntracks)==0:
        raise Exception('No_correct_songs_found',url)
    
    ntracks = sorted(ntracks, key= lambda x: abs(x['time_delta']))
    
    download_href = ntracks[0]['href']
    dur_eps = ntracks[0]['time_delta']
    return download_href, url, dur_eps

In [17]:
parse_mp3co('abba', 'take a chance on me', '4:02')

('https://cs1.mp3co.ooo/download/73932420/NEtsZ1NSQ2IycGtkRUtON0VoMnBkRmt5SnNIMi96a2hhcFNRY3g1ZlFWQWVKZ1l1cDFGci9TY1RzVEgzTFFWZEJRcHJLK3ZkdytrYkpYTHRkQW1zTWxaVFRmVC9udGFPbDN3SUx1UStDems9/ABBA_v_ispolenii_Julie_Walters_Stellan_Skarsgard_Take_A_Chance_On_Me_(mp3co.ooo).mp3',
 'https://mp3co.ooo/s/abba%20take%20a%20chance%20on%20me/',
 0)

### patefon.net

In [18]:
def parse_patefon(artist, title, duration):
    SEARCH_URL = 'https://patefon.net/'

    url = format_fname('{}{}-{}/'.format(SEARCH_URL, artist, title), space_replacer='-')
    list_page = requests.get(url=url)
    
    if list_page.status_code != 200:
        raise Exception('List_page_not_found')
    
    list_soup = BeautifulSoup(list_page.text, 'html.parser')
    tracks = list_soup.find_all('li', class_='item')
    ntracks = []
    for track_id,track in enumerate(tracks):
        try:
            a = track.find('span', class_='artist').text
            t = track.find('span', class_='track').text
            d = track.find('div', class_='duration').text
            h = track.find('a', class_='dl').get('href')
        except:
            a,t,d, h = None, None, None, None
        if a and t and d and h:
            if check_attributes(a,t,d,artist,title,duration):
                d = time_delta(d,duration)
                ntracks.append({'artist':a, 'title':t, 'time_delta':d, 'href':h, 'track':track})
    
    if len(ntracks)==0:
        raise Exception('No_correct_songs_found',url)
    
    ntracks = sorted(ntracks, key= lambda x: abs(x['time_delta']))
    
    download_href = ntracks[0]['href']
    dur_eps = ntracks[0]['time_delta']
    return download_href, url, dur_eps

In [19]:
parse_patefon('abba', 'take a chance on me', '4:02')

('http://dl3.patefon.net/aHR0cDovL2YubXAzcG9pc2submV0L21wMy8wMDAvNjQ4LzQzNy82NDg0MzcubXAzP3RpdGxlPWFiYmEtdGFrZS1hLWNoYW5jZS1vbi1tZS5tcDM=',
 'https://patefon.net/abba-take-a-chance-on-me/',
 2)

### xgetmus.ru

In [20]:
def parse_xgetmus(artist, title, duration):
    SEARCH_URL = 'http://xgetmus.ru/s/'

    url = format_fname('{}{}+{}/'.format(SEARCH_URL, artist, title), space_replacer='+')
    list_page = requests.get(url=url)
    
    if list_page.status_code != 200:
        raise Exception('List_page_not_found')
    
    list_soup = BeautifulSoup(list_page.text, 'html.parser')
    tracks = list_soup.find_all('li', class_='item x-track track')
    ntracks = []
    for track_id,track in enumerate(tracks):
        try:
            a = track.find('span', class_='artist').text
            t = track.find('span', class_='track').text
            d = track.find('div', class_='duration').text
            h = track.find('a', class_='dl download').get('href')
        except:
            a,t,d, h = None, None, None, None
        if a and t and d and h:
            if check_attributes(a,t,d,artist,title,duration):
                d = time_delta(d,duration)
                ntracks.append({'artist':a, 'title':t, 'time_delta':d, 'href':h, 'track':track})
    
    if len(ntracks)==0:
        raise Exception('No_correct_songs_found',url)
    
    ntracks = sorted(ntracks, key= lambda x: abs(x['time_delta']))
        
    dur_eps = 0
    for track in ntracks:
        download_page = requests.get(track['href'])
        if download_page.status_code == 200:
            dur_eps = track['time_delta']
            break
    if download_page.status_code != 200:
        raise Exception('Download_page_not_found')
        
    download_soup = BeautifulSoup(download_page.text, 'html.parser')
    download_href = download_soup.find('a',class_='knopka download').get('href')
    return download_href, download_page.url, dur_eps

In [21]:
parse_xgetmus('Alan O\'Day', 'Undercover Angel', '3:33')

('http://xgetmus.ru/download/YFlpmcXkrkyG3eMrHeQ1FtlJdj2hPrIAKelFy43nwrGapL6H5BD46KEeCv6QMTtq6HnXFHolZ9WRkx56l_krBm40kmc7vAYaSIN6-jGuofY/The+Karaoke+Channel+Undercover+Angel+Originally+Performed+by+Alan+O+day+Karaoke+Version(xgetmus.ru).mp3',
 'http://xgetmus.ru/song/27614959.433004601-the-karaoke-channel-undercover-angel-originally-performed-by-alan-oday-karaoke-version',
 -1)

### vkmusic.me

In [22]:
def parse_vkmus(artist, title, duration):
    SEARCH_URL = 'http://vkmusic.me/'
    
    url = format_fname('{}{}-{}/'.format(SEARCH_URL, artist, title), space_replacer='-')
    list_page = requests.get(url=url)
    
    if list_page.status_code != 200:
        raise Exception('List_page_not_found')
    
    list_soup = BeautifulSoup(list_page.text, 'html.parser')
    tracks = list_soup.find_all('li', class_='item')
    ntracks = []
    for track_id,track in enumerate(tracks):
        try:
            a = track.find('span', class_='artist').text
            t = track.find('span', class_='track').text
            d = track.find('div', class_='duration').text
            h = track.find('a', class_='dl').get('href')
        except:
            a,t,d, h = None, None, None, None
        if a and t and d and h:
            if check_attributes(a,t,d,artist,title,duration):
                d = time_delta(d,duration)
                ntracks.append({'artist':a, 'title':t, 'time_delta':d, 'href':h, 'track':track})
    
    if len(ntracks)==0:
        raise Exception('No_correct_songs_found',url)
    
    ntracks = sorted(ntracks, key= lambda x: abs(x['time_delta']))
        
    download_href = ntracks[0]['href']
    dur_eps = ntracks[0]['time_delta']
    return download_href, url, dur_eps

In [23]:
parse_vkmus('alice cooper', 'school\'s out', '4:23')

('http://vkdlmsk2.vkmusic.me/aHR0cHM6Ly9wc3Y0LnVzZXJhcGkuY29tL2M2MTU4L3U1ODAzNjkxNi9hdWRpb3MvZDMzMzEwMTEyOTNiLm1wMz9leHRyYT12d2sxVk5wZ2JNekhhclpGeEk4M2U5RHhKR3E1YWhzUWR0UUc2YzNXcjFVU1NTYU8xNnhRUWpBNUZJU1I4RmVKRmx0QXdxcjVpaVdSdGZRYnZoWThIYjEtcE5Ra05VbmFSMHp5WTg5UFFVYVh3LWRQQ015SklHQ0owVmJGMHdDWWNRNWE4NDNXbHF4X2o4RSZ0aXRsZT1BbGljZStDb29wZXIrLStTY2hvb2xzK091dCslMjh6dnVrb2ZmLnJ1JTI5JmlkPTU4MDM2OTE2XzE2NTI3NDM3NQ==',
 'http://vkmusic.me/alice-cooper-schools-out/',
 0)

# Парсинг сайтов

Составляем список песен, которые надо загрузить

In [24]:
bb_ds = pd.DataFrame(columns=['title', 'artist', 'number', 'duration', 't_eps', 'href', 'status'])

Собираем информацию о песнях из распарсенных файлов

In [25]:
with open(BASE_DIR+'/dsets/all_billboard_songs.txt') as file:
    lines = ''.join(file.readlines())
    
    artists = re.findall(r'# artist:.+\n', lines)    
    titles = re.findall(r'# title:.+\n', lines)
    durations = re.findall(r'\d+\.\d+\send', lines)
    nums = sorted(os.listdir(BASE_DIR+'/parsed/billboard'))
    
    for a,t,d,n in tqdm_notebook(zip(artists,titles, durations, nums), desc='Adding rows', total=len(artists)):
        a = format_fname(a.split(':')[1].strip(), space_replacer='_')
        t = format_fname(t.split(':')[1].strip(), space_replacer='_')
        d = secs_to_string(d.split('\t')[0])
        n = int(n)
        bb_ds = bb_ds.append({'artist':a, 'title':t, 'duration':d, 'number':n}, ignore_index=True)

A Jupyter Widget




Проверяем,какие песни уже загружены

In [26]:
for row in tqdm_notebook(bb_ds.iterrows(), total=len(bb_ds), desc='Walking rows'):
    ind, row = row
    fname = BASE_DIR+'/audio/billboard/'
    fname += '{}_{}-{}.mp3'.format(row['number'], row['artist'], row['title'], space_replacer='_')

    if not os.path.isfile(fname):
        bb_ds.at[ind, 'status'] = 'Not loaded'
        continue
    
    try:
        audio = AudioSegment.from_mp3(fname)
        dur_real = round(audio.duration_seconds)
        del audio
        
        eps = string_to_secs(row['duration'])-dur_real
        if abs(eps)>max_eps:
            bb_ds.at[ind, 'status'] = 'Big time_delta'
        else:
            bb_ds.at[ind, 'status'] = 'Downloaded'
        bb_ds.at[ind, 't_eps'] = eps
    except Exception as e:
        #print(e)
        bb_ds.at[ind, 'status'] = 'Incorrect load'

A Jupyter Widget




Сохраняем полученную таблицу

In [27]:
songs_bb_ds_fname = BASE_DIR+'/dsets/billboard_songs_ds.csv'

In [159]:
bb_ds.to_csv(songs_bb_ds_fname)

Загрузка БД песен

In [160]:
bb_ds = pd.read_csv(songs_bb_ds_fname, index_col=0)
bb_ds.head()

Unnamed: 0,title,artist,number,duration,t_eps,href,status
0,i_dont_mind,james_brown,3,02:31,1.0,,Downloaded
1,youve_got_a_friend,roberta_flack_and_donny_hathaway,4,03:27,0.0,,Downloaded
2,the_rose,bette_midler,6,03:41,-1.0,,Downloaded
3,an_innocent_man,billy_joel,10,05:18,0.0,,Downloaded
4,lookin_for_love,johnny_lee,12,03:32,0.0,,Downloaded


In [163]:
dled_num = len(bb_ds[bb_ds['status']=='Downloaded'])
print('Загружено:\t{}\t{:.2%}'.format(dled_num, dled_num/len(bb_ds)))
print('Не загружено:\t{}\t{:.2%}'.format(len(bb_ds)-dled_num, 1-dled_num/len(bb_ds)))

Загружено:	814	91.46%
Не загружено:	76	8.54%


Считаем, что песни с погрешностью в длительности больше max_eps, не загружены

In [164]:
bb_ds.loc[bb_ds[bb_ds['t_eps']>max_eps].index,'status'] = 'Big time_delta'

Вынесение всех незагруженных песен в отдельный датасет

In [165]:
not_load = bb_ds[bb_ds['status']!='Downloaded']
not_load.shape

(76, 7)

Парсинг песен из всех записанных источников

In [167]:
%%time

not_load = bb_ds[bb_ds['status']!='Downloaded']
for row in tqdm_notebook(not_load.iterrows(), total=len(not_load), desc='Walking rows'):
    i,row = row
    # Обновляем файл датасета
    if i%5==0: bb_ds.to_csv(songs_bb_ds_fname)
    
    download_data = []
    exceptions = []
    # Загружаем ссылки на песни со всех сайтов
    for parser in [parse_ipleer, parse_mp3co, parse_patefon, parse_xgetmus, parse_vkmus]:
        try:
            download_data.append(parser(artist=row.artist, title=row.title, duration=row.duration))
        except Exception as e:
            exceptions.append(e)
    # Сортируем песни по совпадению длительностей
    if len(download_data) > 0:
        download_data = sorted(download_data, key=lambda x: abs(x[2]))
        download_href, page_url, dur_eps = download_data[0]
    else:
        bb_ds.at[i,'status'] = exceptions
        continue
    # Загружаем файл
    try:
        is_loaded_correct = False
        pbar = tqdm_notebook(desc='Trying to load', total=len(download_data), leave=False)
        for idx, d_data in enumerate(download_data):
            download_href, page_url, dur_eps = d_data
            for ind_trying in range(2):
                download_file(artist=row.artist, title=row.title, number=row.number, href=download_href,\
                             ds_name = 'billboard')
                try:
                    is_loaded_correct = check_correct_dl(artist=row.artist, title=row.title,\
                                                     number=row.number, duration=row.duration, eps=dur_eps,\
                                                        ds_name='billboard')
                except:
                    is_loaded_correct = False
                if is_loaded_correct:
                    break
            if is_loaded_correct:
                pbar.update(len(download_data)-idx)
                pbar.close()
                break
            pbar.update(1)
        pbar.close()
        bb_ds.at[i,'status']='Downloaded'
        bb_ds.at[i,'href']=page_url
        bb_ds.at[i,'t_eps']=dur_eps
    except Exception as e:
        bb_ds.at[i, 'status'] = e.args[0]

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget


CPU times: user 1min 28s, sys: 15.3 s, total: 1min 43s
Wall time: 16min 15s


Сохраняем данные в файл

In [168]:
bb_ds.to_csv(songs_bb_ds_fname)

In [175]:
dled_num = len(bb_ds[bb_ds['status']=='Downloaded'])
print('Загружено:\t{}\t{:.2%}'.format(dled_num, dled_num/len(bb_ds)))
print('Не загружено:\t{}\t{:.2%}'.format(len(bb_ds)-dled_num, 1-dled_num/len(bb_ds)))

Загружено:	822	92.36%
Не загружено:	68	7.64%


# Парсим песни из 'не billboard' датасета

Создаём датасет

In [124]:
cl_ds = pd.DataFrame(columns=['title', 'artist', 'number', 'duration', 't_eps', 'href', 'status'])

Добавляем все песни TheBeatles

In [125]:
for album in os.listdir(BASE_DIR+'/parsed/chordlab/The_Beatles'):
    alb_num = album.split('_')[0].replace('CD','')
    dname = BASE_DIR+'/parsed/chordlab/The_Beatles/'+album+'/'
    for fname in os.listdir(dname):
        with open(dname+fname) as f:
            duration = secs_to_string(f.readlines()[-1].split(' ')[1])            
        #print(fname)
        song_num = fname[:2]
        title = fname[5:-4]
        cl_ds = cl_ds.append({'title':title, 'artist':'the_beatles',
                              'number':int(alb_num+song_num), 'duration':duration}, ignore_index=True)

Добавляем все песни Queen

In [127]:
for dname in [BASE_DIR+'/parsed/chordlab/Queen_Greatest_Hits_1/',
              BASE_DIR+'/parsed/chordlab/Queen_Greatest_Hits_2/']:
    for fname in os.listdir(dname):
        with open(dname+fname) as f:
            duration = secs_to_string(f.readlines()[-1].split('\t')[1])
        song_num = dname[-2]+fname[:2]
        title = fname[3:-4]
        cl_ds = cl_ds.append({'title':title, 'artist':'queen',
                                  'number':int(song_num), 'duration':duration}, ignore_index=True)

Добавляем все песни Carol King

In [128]:
dname = BASE_DIR+'/parsed/chordlab/Carol_King/'
for fname in os.listdir(dname):
    with open(dname+fname) as f:
        duration = secs_to_string(f.readlines()[-1].split('\t')[1])
    song_num = fname[:2]
    title = fname[3:-4]
    cl_ds = cl_ds.append({'title':title, 'artist':'carol_king',
                          'number':int(song_num), 'duration':duration}, ignore_index=True)

Проверяем,какие песни уже загружены

In [130]:
for row in tqdm_notebook(cl_ds.iterrows(), total=len(cl_ds), desc='Walking rows'):
    ind, row = row
    fname = BASE_DIR+'/audio/chordlab/'
    fname += format_fname('{}_{}-{}.mp3'.format(row['number'], row['artist'], row['title']))

    if not os.path.isfile(fname):
        cl_ds.at[ind, 'status'] = 'Not loaded'
        continue
    
    try:
        audio = AudioSegment.from_mp3(fname)
        dur_real = round(audio.duration_seconds)
        del audio
        
        eps = string_to_secs(row['duration'])-dur_real
        if abs(eps)>max_eps:
            cl_ds.at[ind, 'status'] = 'Big time_delta'
        else:
            cl_ds.at[ind, 'status'] = 'Downloaded'
        cl_ds.at[ind, 't_eps'] = eps
    except Exception as e:
        #print(e)
        cl_ds.at[ind, 'status'] = 'Incorrect load'

A Jupyter Widget

In [131]:
print(cl_ds[cl_ds['status']=='Downloaded'].shape)
print(cl_ds[cl_ds['status']!='Downloaded'].shape)

(200, 7)
(7, 7)


Сохраняем полученную таблицу

In [134]:
cl_ds[cl_ds['status']!='Downloaded']

Unnamed: 0,title,artist,number,duration,t_eps,href,status
47,ob-la-di_ob-la-da,the_beatles,10104,03:09,,,Not loaded
51,im_so_tired,the_beatles,10110,02:03,,,Incorrect load
71,ill_be_back,the_beatles,313,02:20,,,Incorrect load
86,im_looking_through_you,the_beatles,610,02:28,,,Incorrect load
100,kansas_city-_hey_hey_hey_hey,the_beatles,407,02:33,,,Not loaded
128,when_im_sixty-four,the_beatles,809,02:38,,,Not loaded
136,theres_a_place,the_beatles,113,01:53,,,Incorrect load


In [135]:
songs_cl_ds_fname = BASE_DIR+'/dsets/chordlab_songs_ds.csv'

In [136]:
cl_ds.to_csv(songs_cl_ds_fname)

Загрузка БД песен

In [137]:
cl_ds = pd.read_csv(songs_cl_ds_fname, index_col=0)
cl_ds.head()

Unnamed: 0,title,artist,number,duration,t_eps,href,status
0,your_mother_should_know,the_beatles,905,02:30,0.0,,Downloaded
1,baby_youre_a_rich_man,the_beatles,910,03:04,0.0,,Downloaded
2,flying,the_beatles,903,02:17,0.0,,Downloaded
3,hello_goodbye,the_beatles,907,03:31,0.0,,Downloaded
4,i_am_the_walrus,the_beatles,906,04:37,0.0,,Downloaded


In [138]:
dled_num = len(cl_ds[cl_ds['status']=='Downloaded'])
print('Загружено:\t{}\t{:.2%}'.format(dled_num, dled_num/len(cl_ds)))
print('Не загружено:\t{}\t{:.2%}'.format(len(cl_ds)-dled_num, 1-dled_num/len(cl_ds)))

Загружено:	200	96.62%
Не загружено:	7	3.38%


Считаем, что песни с погрешностью в длительности больше max_eps, не загружены

In [139]:
cl_ds.loc[cl_ds[cl_ds['t_eps']>max_eps].index,'status'] = 'Big time_delta'

Вынесение всех незагруженных песен в отдельный датасет

In [140]:
not_load = cl_ds[cl_ds['status']!='Downloaded']
not_load.shape

(7, 7)

Парсинг песен из всех записанных источников

In [148]:
%%time

not_load = cl_ds[cl_ds['status']!='Downloaded']
for row in tqdm_notebook(not_load.iterrows(), total=len(not_load), desc='Walking rows'):
    i,row = row
    # Обновляем файл датасета
    if i%5==0: cl_ds.to_csv(songs_cl_ds_fname)
    
    download_data = []
    exceptions = []
    # Загружаем ссылки на песни со всех сайтов
    for parser in [parse_ipleer, parse_mp3co, parse_patefon, parse_xgetmus, parse_vkmus]:
        try:
            download_data.append(parser(artist=row.artist, title=row.title, duration=row.duration))
        except Exception as e:
            exceptions.append(e)
    # Сортируем песни по совпадению длительностей
    if len(download_data) > 0:
        download_data = sorted(download_data, key=lambda x: abs(x[2]))
        download_href, page_url, dur_eps = download_data[0]
    else:
        cl_ds.at[i,'status'] = exceptions
        continue
    # Загружаем файл
    try:
        is_loaded_correct = False
        pbar = tqdm_notebook(desc='Trying to load', total=len(download_data), leave=False)
        for idx, d_data in enumerate(download_data):
            download_href, page_url, dur_eps = d_data
            for ind_trying in range(2):
                download_file(artist=row.artist, title=row.title, number=row.number, href=download_href,\
                             ds_name = 'chordlab')
                try:
                    is_loaded_correct = check_correct_dl(artist=row.artist, title=row.title,\
                                                     number=row.number, duration=row.duration, eps=dur_eps,\
                                                        ds_name='chordlab')
                except:
                    is_loaded_correct = False
                if is_loaded_correct:
                    break
            if is_loaded_correct:
                pbar.update(len(download_data)-idx)
                pbar.close()
                break
            pbar.update(1)
        pbar.close()
        cl_ds.at[i,'status']='Downloaded'
        cl_ds.at[i,'href']=page_url
        cl_ds.at[i,'t_eps']=dur_eps
    except Exception as e:
        cl_ds.at[i, 'status'] = e.args[0]

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget


CPU times: user 13.7 s, sys: 3.21 s, total: 16.9 s
Wall time: 2min 25s


Сохраняем данные в файл

In [149]:
cl_ds.to_csv(songs_cl_ds_fname)

In [150]:
dled_num = len(cl_ds[cl_ds['status']=='Downloaded'])
print('Загружено:\t{}\t{:.2%}'.format(dled_num, dled_num/len(cl_ds)))
print('Не загружено:\t{}\t{:.2%}'.format(len(cl_ds)-dled_num, 1-dled_num/len(cl_ds)))

Загружено:	207	100.00%
Не загружено:	0	0.00%


In [151]:
cl_ds[cl_ds['status']!='Downloaded'].head()

Unnamed: 0,title,artist,number,duration,t_eps,href,status
