In [1]:
import os
import subprocess

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def secs_to_string(secs):
    """Format a duration given in seconds as a zero-padded 'MM:SS' string.

    Parameters
    ----------
    secs : int or float
        Duration in seconds. Fractional seconds are truncated.

    Returns
    -------
    str
        Minutes and seconds joined by ':', each padded to at least two
        digits (e.g. 290 -> '04:50', 6000 -> '100:00').
    """
    # divmod uses floor division, so the result does not depend on whether
    # `/` is integer division (Python 2 ints) or true division (floats, Py3).
    minutes, seconds = divmod(int(secs), 60)
    return '{:02d}:{:02d}'.format(minutes, seconds)

In [3]:
def string_to_secs(string):
    """Convert a 'minutes:seconds' string into a total number of seconds.

    The string must contain exactly two ':'-separated integer fields;
    anything else raises ValueError.
    """
    minutes, seconds = [int(field) for field in string.split(':')]
    return seconds + 60 * minutes

In [4]:
def parse_duration(track_html):
    """Extract the duration token from a track element.

    Finds the first ``em`` element inside ``track_html``, removes every
    nested ``div`` and ``a`` element from it, and returns the first
    whitespace-separated token of the text that is left.

    Parameters
    ----------
    track_html : element exposing ``find(name)`` (e.g. a BeautifulSoup tag)
        The markup of a single track entry.

    Returns
    -------
    str
        First token of the cleaned ``em`` text; presumably 'M:SS', since
        callers feed it to ``string_to_secs`` -- confirm against the site markup.

    Raises
    ------
    AttributeError
        If the track has no ``em`` element.
    IndexError
        If the ``em`` element has no text once nested tags are removed.
    """
    # Use the argument, not a global named `track`, so the function works
    # regardless of where it is called from.
    em = track_html.find('em')
    if em is None:
        raise AttributeError('track element has no <em> tag')

    # Strip nested tags so that only the bare text of the <em> remains.
    for tag_name in ('div', 'a'):
        nested = em.find(tag_name)
        while nested is not None:
            nested.decompose()
            nested = em.find(tag_name)

    return em.text.split()[0]

In [5]:
def format_fname(s, space_replacer='_'):
    """Normalise a string for use in a file name or URL fragment.

    Leading/trailing spaces are trimmed, parentheses and apostrophes are
    dropped, and every remaining space becomes ``space_replacer``.
    """
    trimmed = s.strip(' ')
    kept = ''.join(ch for ch in trimmed if ch not in "()'")
    return kept.replace(' ', space_replacer)

In [6]:
# Load the original song list (first CSV column becomes the index) and add two
# bookkeeping columns. Both start as the literal string 'nan', not a float NaN.
ds = pd.read_csv('McGill-Billboard (2)/songs.csv', index_col=0)
ds['href']='nan'
ds['status']='nan'

In [7]:
# Load the progress file written by the download loop; this rebinds `ds`.
ds = pd.read_csv('McGill-Billboard (2)/songs_downloaded.csv',index_col=0)

In [14]:
# Write the current `ds` to songs.csv, the same path the song list is read from.
# NOTE(review): this cell's execution count (14) is out of sequence; running the
# notebook top to bottom overwrites the input file here -- confirm this is intended.
ds.to_csv('McGill-Billboard (2)/songs.csv')

In [8]:
# Preview the first rows of the loaded table.
ds.head()

Unnamed: 0,title,artist,number,duration,t_eps,href,status
0,Chicago,25 or 6 to 4,176,4:50,0.0,https://ipleer.fm/song/101272384/Chicago_Chica...,Downloaded
1,Sukiyaki,A Taste Of Honey,1286,3:43,1.0,https://ipleer.fm/song/2787937/A_Taste_Of_Hone...,Downloaded
2,Chiquitita,ABBA,183,5:25,0.0,https://ipleer.fm/song/1958012/ABBA_-_Chiquitita/,Downloaded
3,"Knowing Me, Knowing You",ABBA,231,4:04,1.0,https://ipleer.fm/song/1823039/ABBA_-_Knowing_...,Downloaded
4,Honey Honey,ABBA,902,2:54,0.0,https://ipleer.fm/song/7201410/ABBA_-_Honey_Ho...,Downloaded


In [9]:
# Rows still to be processed: every row whose status is not 'Downloaded'.
not_load = ds[ds['status']!='Downloaded']

In [11]:
# Select rows whose index label is greater than 500.
# NOTE(review): this rebinds `not_load`, so any earlier status-based selection
# is discarded when the cells run in order -- confirm which one is intended.
not_load = ds[ds.index > 500]

In [12]:
# Show up to the first 20 rows selected for processing.
not_load[:20]

Unnamed: 0,title,artist,number,duration,t_eps,href,status
501,Come And Get Your Love,Redbone,315,3:34,,,
502,"Rock and Roll, Hoochie Koo",Rick Derringer,1267,8:26,,,
503,Give It To Me Baby,Rick James,480,3:54,,,
504,Super Freak Part One,Rick James,812,3:22,,,
505,Jessie's Girl,Rick Springfield,217,3:17,,,
506,Don't Talk To Strangers,Rick Springfield,865,3:01,,,
507,The Way You Do The Things You Do,Rita Coolidge,291,3:39,,,
508,Your Love Has Lifted Me Higher,Rita Coolidge,336,4:03,,,
509,Fever,Rita Coolidge,692,3:30,,,
510,We're All Alone,Rita Coolidge,806,3:40,,,


In [13]:
# Prefix of the search URL; the download loop appends '<artist>+<title>/' to it.
SEARCH_URL = 'https://ipleer.fm/search/q/'
# Site root; prepended to the relative hrefs taken from search results.
SONGS_URL = 'https://ipleer.fm'

In [14]:
not_include = ['cover', 'remix', 'live'] # Forbidden words for song titles todo: 'Live'?
max_eps = 10 # Maximum allowed deviation from the expected track length (in seconds)

In [15]:
%%time
#not_load = ds[ds['status']!='Downloaded']

for row in not_load.iterrows():
    i,row = row
    print i,
    if i%5==0:
        ds.to_csv('McGill-Billboard (2)/songs_downloaded.csv')
    url = format_fname('{}{}+{}/'.format(SEARCH_URL, row.artist, row.title), space_replacer='+')
    list_page = requests.get(url=url)
    
    if list_page.status_code != 200:
        ds.at[i,'status']='List_page_not_found'
        print 'List_page_not_found'
        continue
    
    list_soup = BeautifulSoup(list_page.text, 'html.parser')
    tracks = list_soup.find_all('li',class_='track')
    ntracks = []
    for track_id,track in enumerate(tracks):
        try:
            a = track.find('span',class_='cartist').text
            t = track.find('span',class_='ctitle').text
            d = parse_duration(track)
            h = track.find('a',class_='playlist-down').get('href')
        except Exception as e:
            a,t,d, h = None, None, None, None
        if a and t and d and h:
            # Если кавер или ремикс (автор и название могут быть перепутаны местами)
            if len(filter(lambda x: x in t.lower(), not_include))>0\
             or len(filter(lambda x: x in a.lower(), not_include))>0:
                continue
        
            # Если автор не совпадает с заданным (автор и название могут быть перепутаны местами)
            if len(filter(lambda x: x in a, row.artist.split())) != len(row.artist.split())\
             and len(filter(lambda x: x in t, row.artist.split())) != len(row.artist.split()):
                continue

            # Если название не совпадает с заданным (автор и название могут быть перепутаны местами)
            if len(filter(lambda x: x in t, row.title.split())) != len(row.title.split())\
             and len(filter(lambda x: x in a, row.title.split())) != len(row.title.split()):
                continue
            
            try:
                d = abs(string_to_secs(d)-string_to_secs(row.duration))
            except:
                continue
            if d > max_eps:
                continue
           
            ntracks.append({'artist':a, 'title':t, 'time_delta':d, 'href':h, 'track':track})
    
    if len(ntracks)==0:
        ds.at[i,'status']='No_correct_songs_found'
        ds.at[i,'href']=url
        print 'No_correct_songs_found'
        continue
    
    ntracks = sorted(ntracks, key= lambda x: x['time_delta'])
    
    time_eps = 0
    for track in ntracks:
        download_page = requests.get(SONGS_URL+track['href'])
        if download_page.status_code == 200:
            time_eps = track['time_delta']
            break
    if download_page.status_code != 200:
        ds.at[i,'status']='Download_page_not_found'
        print 'Download_page_not_found'
        continue
        
    download_soup = BeautifulSoup(download_page.text, 'html.parser')
    download_href = download_soup.find('a',class_='onesongblock-download').get('href')
    dname = 'audio/billboard/'+'0'*(4-len(str(row.number)))+str(row.number)+'/'
    fname = format_fname('{}-{}.mp3'.format(row.artist,row.title))
    
    try:
        !mkdir -p {dname}
    except:
        pass
    try:
        !wget -q -nc -t 1 -O {dname+fname} '{download_href}'
    except:
        ds.at[i,'status']='Found but not downloaded'
        print 'Found but not downloaded'
        continue
    ds.at[i,'status']='Downloaded'
    ds.at[i,'href']=download_page.url
    ds.at[i,'t_eps']=time_eps
    print '+'

501 +
502 No_correct_songs_found
503 +
504 No_correct_songs_found
505 +
506 +
507 +
508 +
509 +
510 +
511 +
512 +
513 +
514 +
515 List_page_not_found
516 No_correct_songs_found
517 No_correct_songs_found
518 +
519 +
520 +
521 List_page_not_found
522 +
523 No_correct_songs_found
524 +
525 +
526 +
527 List_page_not_found
528 +
529 No_correct_songs_found
530 List_page_not_found
531 List_page_not_found
532 +
533 +
534 +
535 +
536 +
537 +
538 List_page_not_found
539 List_page_not_found
540 +
541 +
542/bin/sh: 1: _Johnny-Sleep_Walk.mp3: not found
 +
543 +
544 +
545 +
546/bin/sh: 1: _Garfunkel-Cecilia.mp3: not found
 +
547/bin/sh: 1: _Garfunkel-Fakin_it.mp3: not found
 +
548/bin/sh: 1: _Garfunkel-The_Sounds_Of_Silence.mp3: not found
 +
549/bin/sh: 1: _Garfunkel-El_Condor_Pasa.mp3: not found
 +
550 No_correct_songs_found
551 +
552 List_page_not_found
553 List_page_not_found
554 +
555 +
556 +
557 +
558/bin/sh: 1: _Cher-All_I_Ever_Need_Is_You.mp3: not found
 +
559/bin/sh: 1: _Cher-A_Cowboys_Work

In [16]:
# Persist the final state of `ds` to the progress file.
ds.to_csv('McGill-Billboard (2)/songs_downloaded.csv')