# Описание ноутбука
Парсинг сайтов и скачивание mp3-аудиофайлов из интернета

In [1]:
import os
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook

In [2]:
from chord_rec_lib import format_name, find_files

In [3]:
import importlib

In [4]:
from chord_rec_lib import dnames

# Directory that every entry of `dnames` is resolved against.
HEAD_DIR = '..'

# Add (or override) the entry for the raw songs directory.
dnames['RAW_SONGS_DIR'] = 'raw_songs'

# Prefix every registered directory name with HEAD_DIR.
# NOTE(review): `dnames` is mutated in place, so re-running this cell in the same
# kernel would presumably prefix the other entries a second time — confirm.
for key in dnames:
    dnames[key] = os.path.join(HEAD_DIR, dnames[key])

In [5]:
not_include = ['cover', 'remix', 'live', 'mix'] # Forbidden words for song names; todo: 'Live'?
max_eps = 3 # Maximum allowed deviation from the requested track length (in seconds)

In [6]:
def secs_to_string(secs):
    """Format a duration in seconds as a zero-padded ``'MM:SS'`` string.

    Minutes are not wrapped into hours, so 3725 seconds gives ``'62:05'``.

    Args:
        secs: Duration in seconds. Any value accepted by ``int()`` works
            (int, float, numpy scalar); a fractional part is truncated.
            Without the conversion a float such as ``151.0`` would render as
            ``'2.0:31.0'``, which ``string_to_secs`` cannot parse back.

    Returns:
        The duration as ``'MM:SS'`` with both fields at least two characters wide.
    """
    minutes, seconds = divmod(int(secs), 60)
    return '{:02d}:{:02d}'.format(minutes, seconds)

In [7]:
def string_to_secs(string):
    """Convert a ``'minutes:seconds'`` string into a total number of seconds.

    Args:
        string: Text with exactly two integer fields separated by ``':'``.

    Returns:
        ``minutes * 60 + seconds`` as an int.

    Raises:
        ValueError: If a field is not an integer or there are not exactly two fields.
    """
    minutes, seconds = (int(field) for field in string.split(':'))
    return 60 * minutes + seconds

In [8]:
def parse_duration(track_html, verbose=False):
    """Extract the duration text from a track element of the search results.

    Looks up the first ``<em>`` element inside ``track_html``, removes every
    nested ``<div>`` and ``<a>`` element from it (modifying the parsed tree in
    place) and returns the first whitespace-separated token of what is left.

    Args:
        track_html: Parsed element offering ``find``; the ``<em>`` it returns
            must expose ``div``, ``a`` and ``text`` (BeautifulSoup tag API).
        verbose: When true, print progress markers.

    Returns:
        The first token of the cleaned ``<em>`` text.

    Raises:
        AttributeError: If the track has no ``<em>`` element.
        IndexError: If the cleaned ``<em>`` text is empty.
    """
    if verbose: print('parsing_duration')
    d = track_html.find('em')
    if d is None:
        # Fail with a clear message instead of a confusing error on None further down.
        raise AttributeError('track has no <em> element holding the duration')
    # Explicit loop conditions replace the former `while True` / bare `except`,
    # which used an exception as the exit condition and hid every other error.
    while d.div is not None:
        d.div.decompose()
    if verbose: print('parsing_duration point1')
    while d.a is not None:
        d.a.decompose()
    if verbose: print('parsing_duration point2')
    return d.text.split()[0]

In [9]:
def download_file(id, href):
    """Download ``href`` into ``<RAW_SONGS_DIR>/<id>.mp3`` using ``wget``.

    An existing target file is left untouched (same intent as wget's ``-nc``).

    The previous implementation used the IPython ``!wget`` shell escape, which
    does not raise when the command fails, so its ``except`` branch could never
    run. It also passed the file name to the shell unquoted. Running wget
    through ``subprocess`` with an argument list fixes both and keeps ``href``
    away from shell interpretation.

    Args:
        id: Identifier used as the file name stem (parameter name kept for
            existing callers although it shadows the builtin).
        href: URL of the file to download.

    Raises:
        Exception: ``'Found but not downloaded'`` if wget cannot be started or
            exits with a non-zero status.
    """
    import subprocess  # local import: not part of the notebook's top-level imports

    fname = os.path.join(dnames['RAW_SONGS_DIR'], '{}.mp3'.format(id))
    if os.path.isfile(fname):
        # No-clobber: never re-download or overwrite an existing file.
        return
    try:
        # -q quiet, -nc no-clobber, -t 1 single attempt, -O explicit output file.
        subprocess.run(['wget', '-q', '-nc', '-t', '1', '-O', fname, href], check=True)
    except (subprocess.CalledProcessError, OSError) as err:
        raise Exception('Found but not downloaded') from err

In [10]:
def parse_ipleer(artist, title, duration, verbose=False):
    """Search ipleer.fm for a song and return the download link of the best match.

    A search result is accepted only if:
      * artist, title, duration and the link to its page could all be parsed;
      * none of the words in the global ``not_include`` list occurs among the
        words of its "artist title" string;
      * every ``'_'``-separated word of both ``artist`` and ``title`` occurs
        among those words (artist and title may be swapped on the site);
      * its duration differs from ``duration`` by at most ``max_eps`` seconds.

    Accepted results are tried in order of increasing duration difference
    until one of their pages responds with HTTP 200.

    Args:
        artist: Artist name with words separated by ``'_'``; lower-cased here.
        title: Song title with words separated by ``'_'``; lower-cased here.
        duration: Expected duration in a form ``string_to_secs`` can parse.
        verbose: When true, print why candidates are rejected.

    Returns:
        Tuple ``(download_href, song_page_url, dur_eps)`` where ``dur_eps`` is
        the duration difference (seconds) of the chosen result.

    Raises:
        Exception: ``'List_page_not_found'`` if the search page is not HTTP 200;
            ``('No_correct_songs_found', url)`` if no result passes the filters;
            ``'Download_page_not_found'`` if no accepted result's page loads;
            ``'Download_link_not_found'`` if the page has no download anchor.
    """
    artist, title = artist.lower(), title.lower()

    SEARCH_URL = 'https://ipleer.fm/search/q/'
    SONGS_URL = 'https://ipleer.fm'

    url = SEARCH_URL + format_name('{}+{}/'.format(artist, title), space_replacer='+')
    list_page = requests.get(url=url)

    if list_page.status_code != 200:
        raise Exception('List_page_not_found')

    list_soup = BeautifulSoup(list_page.text, 'html.parser')
    # Words that must all be present in a candidate's "artist title" string.
    required_words = title.split('_') + artist.split('_')

    ntracks = []
    for track in list_soup.find_all('li', class_='track'):
        try:
            a = format_name(track.find('span', class_='cartist').text, space_replacer=' ')
            t = format_name(track.find('span', class_='ctitle').text, space_replacer=' ')
            d = parse_duration(track)
            h = track.find('a', class_='playlist-down').get('href')
        except Exception:
            # Best effort: skip malformed entries (but let Ctrl-C through).
            continue
        if not (a and t and d and h):
            continue

        a_t = '{a} {t}'.format(a=a, t=t)
        a_t_words = a_t.split()

        # Reject covers, remixes, live versions and mixes.
        if any(word in a_t_words for word in not_include):
            if verbose:
                print('checking 1')
                print(a_t)
                print(artist, title)
            continue

        # Reject entries that lack any requested artist/title word
        # (artist and title may be swapped on the site).
        if not all(word in a_t_words for word in required_words):
            if verbose:
                print('checking 2')
                print('a t:', a_t, a_t_words)
                print(artist.split('_'), title.split('_'))
            continue

        try:
            time_delta = abs(string_to_secs(d) - string_to_secs(duration))
        except Exception:
            # Duration could not be parsed: skip this entry.
            continue
        if time_delta > max_eps:
            continue

        ntracks.append({'artist': a, 'title': t, 'time_delta': time_delta,
                        'href': h, 'track': track})

    if len(ntracks) == 0:
        raise Exception('No_correct_songs_found', url)

    # Closest duration first; the sort is stable, so ties keep site order.
    ntracks = sorted(ntracks, key=lambda x: x['time_delta'])

    for track in ntracks:
        download_page = requests.get(SONGS_URL + track['href'])
        if download_page.status_code == 200:
            dur_eps = track['time_delta']
            break
    else:
        # No candidate page answered with HTTP 200.
        raise Exception('Download_page_not_found')

    download_soup = BeautifulSoup(download_page.text, 'html.parser')
    download_link = download_soup.find('a', class_='onesongblock-download')
    if download_link is None:
        # Previously surfaced as an opaque AttributeError on None.
        raise Exception('Download_link_not_found')

    return download_link.get('href'), download_page.url, dur_eps

In [11]:
# Load the united song list and add the href / status / t_eps columns,
# initialised to the placeholders 'nan', 'nan' and None.
songlist_path = os.path.join(dnames['CSVS_DIR'], 'united_ds_songlist.csv')
ds = pd.read_csv(songlist_path, index_col=0).assign(href='nan', status='nan', t_eps=None)

In [12]:
# Save the song list with its placeholder href/status/t_eps columns.
# NOTE(review): this overwrites any existing downloaded_songlist.csv — confirm
# that resetting previously recorded statuses on a re-run is intended.
ds.to_csv(os.path.join(dnames['CSVS_DIR'], 'downloaded_songlist.csv'))

In [13]:
# Preview the first rows of the song list.
ds.head()

Unnamed: 0,artist,duration,number,orig_ds,orig_fname,title,id,href,status,t_eps
0,james_brown,151,3.0,Billboard,0003\salami_chords.txt,i_dont_mind,0,,,
1,roberta_flack_and_donny_hathaway,207,4.0,Billboard,0004\salami_chords.txt,youve_got__friend,1,,,
2,bette_midler,221,6.0,Billboard,0006\salami_chords.txt,rose,2,,,
3,billy_joel,318,10.0,Billboard,0010\salami_chords.txt,innocent_man,3,,,
4,johnny_lee,212,12.0,Billboard,0012\salami_chords.txt,lookin_for_love,4,,,


# Checking songs correctness

In [14]:
# Presumably the paths of all mp3 files found under RAW_SONGS_DIR
# (behaviour of chord_rec_lib.find_files is not shown here — verify).
files_downld = find_files(dname=dnames['RAW_SONGS_DIR'], frmt='mp3')

In [15]:
# Reload the song list saved in downloaded_songlist.csv and preview its first rows.
ds_downld = pd.read_csv(os.path.join(dnames['CSVS_DIR'], 'downloaded_songlist.csv'), index_col=0)
ds_downld.head()

Unnamed: 0,artist,duration,number,orig_ds,orig_fname,title,id,href,status,t_eps
0,james_brown,151,3.0,Billboard,0003\salami_chords.txt,i_dont_mind,0,,,
1,roberta_flack_and_donny_hathaway,207,4.0,Billboard,0004\salami_chords.txt,youve_got__friend,1,,,
2,bette_midler,221,6.0,Billboard,0006\salami_chords.txt,rose,2,,,
3,billy_joel,318,10.0,Billboard,0010\salami_chords.txt,innocent_man,3,,,
4,johnny_lee,212,12.0,Billboard,0012\salami_chords.txt,lookin_for_love,4,,,


## Подход к сверке файлов

In [16]:
import re

In [17]:
# Preview the first rows of the reloaded song list again.
ds_downld.head()

Unnamed: 0,artist,duration,number,orig_ds,orig_fname,title,id,href,status,t_eps
0,james_brown,151,3.0,Billboard,0003\salami_chords.txt,i_dont_mind,0,,,
1,roberta_flack_and_donny_hathaway,207,4.0,Billboard,0004\salami_chords.txt,youve_got__friend,1,,,
2,bette_midler,221,6.0,Billboard,0006\salami_chords.txt,rose,2,,,
3,billy_joel,318,10.0,Billboard,0010\salami_chords.txt,innocent_man,3,,,
4,johnny_lee,212,12.0,Billboard,0012\salami_chords.txt,lookin_for_love,4,,,


In [18]:
# Match every mp3 file found on disk to a row of the song list: the formatted
# file path must contain the row's artist, and its last backslash-separated
# component must contain the row's title.
ds_downld['raw_song_fname'] = None
matched_rows = []  # collected and turned into a DataFrame once, after the loop
for fname in tqdm_notebook(files_downld):
    formd_fname = format_name(fname)
    formd_basename = formd_fname.split('\\')[-1]
    is_candidate = [
        (row['artist'] in formd_fname) and (row['title'] in formd_basename)
        for _, row in ds_downld.iterrows()
    ]
    candidates = ds_downld[is_candidate]
    if candidates.shape[0] > 0:
        # Several rows can match because one title may be part of another
        # title; the correct choice is the row with the longest title.
        # argmax gets a real list: in Python 3 a bare `map` object is an
        # iterator, so np.argmax treated it as one scalar and always returned 0.
        longest = int(np.argmax([len(song_title) for song_title in candidates['title']]))
        best = candidates.iloc[longest]
        # NOTE(review): uses the 'id' column as an index label, which assumes
        # the index of ds_downld equals 'id' — confirm.
        ds_downld.loc[best['id'], 'raw_song_fname'] = fname[len(dnames['RAW_SONGS_DIR'])+1:]
        matched_rows.append(ds_downld.loc[best['id']])
matched_songs = pd.DataFrame(matched_rows)

A Jupyter Widget




# ПРОВЕРЯТЬ ДЛИТЕЛЬНОСТЬ ФАЙЛОВ В СКРИПТЕ ВЫШЕ

In [19]:
from pydub import AudioSegment



In [20]:
# Take the file name recorded for the first matched song for a manual check.
song_fname = matched_songs.iloc[0]['raw_song_fname']

In [21]:
# Check that the matched file exists under RAW_SONGS_DIR.
os.path.isfile(os.path.join(dnames['RAW_SONGS_DIR'], song_fname))

True

In [29]:
# Show the full path that was just checked.
os.path.join(dnames['RAW_SONGS_DIR'], song_fname)

'..\\raw_songs\\Queen\\Queen  Greatest Hits 1 (2011 Remastered)\\01 Bohemian Rhapsody.mp3'

In [31]:
# Check for a file with the same name in the current working directory.
os.path.isfile(r'.\\01 Bohemian Rhapsody.mp3')

True

In [32]:
# NOTE(review): this raises FileNotFoundError although os.path.isfile() above
# reports that the mp3 exists. Presumably the file Windows cannot find is the
# ffmpeg/avconv executable that pydub launches to decode mp3 — confirm that
# ffmpeg is installed and on PATH.
AudioSegment.from_mp3("01 Bohemian Rhapsody.mp3")



FileNotFoundError: [WinError 2] Не удается найти указанный файл

In [30]:
# NOTE(review): same FileNotFoundError as for the local copy, so the mp3 path
# is probably not the cause; presumably pydub cannot find the ffmpeg/avconv
# executable — confirm that it is installed and on PATH.
# The raw string keeps every `\\` as two backslash characters, unlike the
# os.path.join result shown above.
AudioSegment.from_mp3(file=r'..\\raw_songs\\Queen\\Queen  Greatest Hits 1 (2011 Remastered)\\01 Bohemian Rhapsody.mp3')



FileNotFoundError: [WinError 2] Не удается найти указанный файл