In [1]:
#import necessary libraries
import requests
from requests_html import HTMLSession
from requests_html import AsyncHTMLSession
from bs4 import BeautifulSoup
import pprint
import numpy as np
import pandas as pd
import time
import pickle

In [2]:
async def getPatterns(url): #function to parse for strum patterns at given url
    start_time = time.time()
    asession = AsyncHTMLSession() #creates Async object
    
    #print("Requesting HTML")
    r = await asession.get(url,timeout=30) #requests website from server and waits for response
    #r.html.render(timeout=20)
    
    #print("Get Website --- %s seconds ---" % (time.time() - start_time))
    start_time = time.time()
    
    #print("Rendering JavaScript")
    site = await r.html.arender(timeout=30) #renders the javascript, NOTE: Currently the speed bottleneck as the javascript is taking too long to render
    #site = r
    #print("Render Website --- %s seconds ---" % (time.time() - start_time))
    start_time = time.time()
    
    soup = BeautifulSoup(r.html.html, features='lxml') #creates BeautifulSoup object which will parse the html
    body = soup.find(class_='js-page js-global-wrapper') #finds main body of website which contains the strum patterns
    patterns = body.find_all(class_='_2u0o0') #every strum pattern is within a class called '_2u0o0'
    
    #print("Parse With Soup --- %s seconds ---" % (time.time() - start_time))
    
    strumPatterns = [] #will hold all strum patterns for song with 1=downstrum, 2=upstrum, 0=skip
    
    start_time = time.time()
    
    for pattern in patterns: #iterate through all strum patterns scraped
        currentPattern = [] #will hold current pattern
        count = 0
        for child in pattern.children: #iterate through all strums within the pattern
            className = child['class'] #get class name of the strum to determine if its upstrum, downstrum, or neither
            if '_1yupz' in className: #if it contains this string, it is an downstrum so append a 1
                currentPattern.append(1)
            elif '_3N-at' in className: #if it contains this string, it is an upstrum so append a 2
                currentPattern.append(2)
            else:   #if it contains this string, it is neither so append a 0
                currentPattern.append(0)

        strumPatterns.append(currentPattern) #add current pattern to list of all patterns in song
    
    #print("Parse Strum Patterns --- %s seconds ---" % (time.time() - start_time)) 
    await r.session.close() #closes chromium process
    await asession.close() #CHANGE MADE
    return strumPatterns #returns array of strum patterns, with each row being a strum pattern

In [3]:
async def scrapePatterns(numSongs): 
    asession = AsyncHTMLSession() #creates Async object
    page = 1 #starts on page 1 of songs website
    URL = 'https://www.ultimate-guitar.com/explore?type[]=Chords&page='
    count = 0 #will hold # of songs checked
    data = []
    while count < numSongs and page <= 20: #each page holds 50 songs so sets limits on number of pages checked
        r = await asession.get(URL+str(page),timeout=30) #requests website from server and waits for response
        
        site = await r.html.arender(timeout=30) #renders javascript on page

        soup = BeautifulSoup(r.html.html, features='lxml') #creates BeautifulSoup object which will parse the html
        songs = soup.find_all(class_='_2KJtL _1mes3 kWOod') #'_1HdUE' is the class that holds the url for each song
        for song in songs: #iterates through all songs on page
            try: #try except statements in case javascript rendering times out
                #print(song['href'])
                patterns = await getPatterns(song['href']) #call getPatterns using url of this song to get strum patterns
                data.append((song.text,song['href'],patterns)) #add tuple containing song name, url, and strum patterns to list
            except:
                print('Rendering Timeout after',count) 
                
            count += 1 #increase count on # of songs scraped
            if count > numSongs: #if we've scraped enough songs, break and return
                break
            print('Scraping Song #',count)
        #await r.session.close() #closes chromium process for page
        page += 1 #after all songs scraped from page, increment page count to access next page of songs
    await r.session.close()
    await asession.close() #closes AnsyncHTMLSession
    return data

In [4]:
data = await scrapePatterns(960)

Scraping Song # 1
Scraping Song # 2
Scraping Song # 3
Scraping Song # 4
Scraping Song # 5
Scraping Song # 6
Scraping Song # 7
Scraping Song # 8
Scraping Song # 9
Scraping Song # 10
Scraping Song # 11
Scraping Song # 12
Scraping Song # 13
Scraping Song # 14
Scraping Song # 15
Scraping Song # 16
Scraping Song # 17
Scraping Song # 18
Scraping Song # 19
Scraping Song # 20
Scraping Song # 21
Scraping Song # 22
Scraping Song # 23
Scraping Song # 24
Scraping Song # 25
Scraping Song # 26
Scraping Song # 27
Scraping Song # 28
Scraping Song # 29
Scraping Song # 30
Scraping Song # 31
Scraping Song # 32
Scraping Song # 33
Scraping Song # 34
Scraping Song # 35
Scraping Song # 36
Scraping Song # 37
Scraping Song # 38
Scraping Song # 39
Scraping Song # 40
Scraping Song # 41
Scraping Song # 42
Scraping Song # 43
Scraping Song # 44
Scraping Song # 45
Scraping Song # 46
Scraping Song # 47
Scraping Song # 48
Scraping Song # 49
Scraping Song # 50
Scraping Song # 51
Scraping Song # 52
Scraping Song # 53
Sc

Scraping Song # 417
Scraping Song # 418
Scraping Song # 419
Scraping Song # 420
Scraping Song # 421
Scraping Song # 422
Scraping Song # 423
Scraping Song # 424
Scraping Song # 425
Scraping Song # 426
Scraping Song # 427
Scraping Song # 428
Scraping Song # 429
Scraping Song # 430
Scraping Song # 431
Scraping Song # 432
Scraping Song # 433
Scraping Song # 434
Scraping Song # 435
Scraping Song # 436
Scraping Song # 437
Scraping Song # 438
Scraping Song # 439
Scraping Song # 440
Scraping Song # 441
Scraping Song # 442
Scraping Song # 443
Scraping Song # 444
Scraping Song # 445
Scraping Song # 446
Scraping Song # 447
Scraping Song # 448
Scraping Song # 449
Scraping Song # 450
Scraping Song # 451
Scraping Song # 452
Scraping Song # 453
Scraping Song # 454
Scraping Song # 455
Scraping Song # 456
Scraping Song # 457
Scraping Song # 458
Scraping Song # 459
Scraping Song # 460
Scraping Song # 461
Scraping Song # 462
Scraping Song # 463
Scraping Song # 464
Scraping Song # 465
Scraping Song # 466


Scraping Song # 826
Scraping Song # 827
Scraping Song # 828
Scraping Song # 829
Scraping Song # 830
Scraping Song # 831
Scraping Song # 832
Scraping Song # 833
Scraping Song # 834
Scraping Song # 835
Scraping Song # 836
Scraping Song # 837
Scraping Song # 838
Scraping Song # 839
Scraping Song # 840
Scraping Song # 841
Scraping Song # 842
Scraping Song # 843
Scraping Song # 844
Scraping Song # 845
Scraping Song # 846
Scraping Song # 847
Scraping Song # 848
Scraping Song # 849
Scraping Song # 850
Scraping Song # 851
Scraping Song # 852
Scraping Song # 853
Scraping Song # 854
Scraping Song # 855
Scraping Song # 856
Scraping Song # 857
Scraping Song # 858
Scraping Song # 859
Scraping Song # 860
Scraping Song # 861
Scraping Song # 862
Scraping Song # 863
Scraping Song # 864
Scraping Song # 865
Scraping Song # 866
Scraping Song # 867
Scraping Song # 868
Scraping Song # 869
Scraping Song # 870
Scraping Song # 871
Scraping Song # 872
Scraping Song # 873
Scraping Song # 874
Scraping Song # 875


In [5]:
dataframe = pd.DataFrame(data,columns=['Song','URL','StrumPattern'])
pickle.dump(dataframe, open( "ScrapedSongs2.p", "wb" ) )
dataframe

Unnamed: 0,Song,URL,StrumPattern
0,Perfect,https://tabs.ultimate-guitar.com/tab/ed-sheera...,"[[1, 1, 1, 1, 1, 1], [1, 0, 0, 1, 0, 0, 1, 0, ..."
1,Cant Help Falling In Love,https://tabs.ultimate-guitar.com/tab/elvis-pre...,"[[1, 1, 1, 1, 1, 1]]"
2,Hallelujah (ver 2),https://tabs.ultimate-guitar.com/tab/jeff-buck...,"[[1, 0, 1, 2, 1, 2, 1, 0, 1, 2, 1, 2], [1, 0, ..."
3,Snowman (ver 3),https://tabs.ultimate-guitar.com/tab/sia/snowm...,"[[1, 0, 2, 1, 1, 1, 1, 0, 2, 1, 1, 1]]"
4,Let Her Go,https://tabs.ultimate-guitar.com/tab/passenger...,"[[1, 0, 0, 0, 1, 2, 1, 2, 1, 0, 0, 0, 1, 2, 1,..."
...,...,...,...
955,Brazil,https://tabs.ultimate-guitar.com/tab/declan-mc...,"[[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 1,..."
956,8,https://tabs.ultimate-guitar.com/tab/billie-ei...,"[[1, 0, 0, 1, 0, 2, 0, 0, 2, 1, 0, 2, 1, 0, 0,..."
957,Champion Live,https://tabs.ultimate-guitar.com/tab/bethel-li...,"[[1, 0, 1, 2, 1, 2, 1, 0, 1, 2, 1, 2]]"
958,Amar Dehokhan,https://tabs.ultimate-guitar.com/tab/odd-signa...,"[[1, 0, 1, 2, 1, 2, 1, 2]]"


In [15]:
def matchPatterns(dataframe, query):
    matches = []
    for num, entries in dataframe.iterrows():
    #print(entries[1],'\n')
        for pattern in entries[2]:
            if pattern == query:
                matches.append((entries[0],entries[1]))
    result = pd.DataFrame(matches,columns=['Song','URL'])
    return result

In [18]:
query1 = [1, 0, 0, 0, 1, 0, 1, 2, 1, 0, 0, 0, 1, 0, 1, 2]
query2 = [1,0,1,2,0,2,1,2]
query3 = [1,1,1,0,1,2]
test = pickle.load( open( "ScrapedSongs2.p", "rb" ) )
matches = matchPatterns(test,query1)
matches

Unnamed: 0,Song,URL
0,Dont Look Back In Anger,https://tabs.ultimate-guitar.com/tab/oasis/don...
1,Wish You Were Here,https://tabs.ultimate-guitar.com/tab/pink-floy...
2,Let It Be (ver 2),https://tabs.ultimate-guitar.com/tab/the-beatl...
3,Boulevard Of Broken Dreams (ver 3),https://tabs.ultimate-guitar.com/tab/green-day...
4,Boulevard Of Broken Dreams (ver 3),https://tabs.ultimate-guitar.com/tab/green-day...
5,Father And Son,https://tabs.ultimate-guitar.com/tab/cat-steve...
6,7 Years (ver 2),https://tabs.ultimate-guitar.com/tab/lukas-gra...
7,Simple Man (ver 3),https://tabs.ultimate-guitar.com/tab/lynyrd-sk...
8,Dream On (ver 2),https://tabs.ultimate-guitar.com/tab/aerosmith...
9,Breezeblocks (ver 2),https://tabs.ultimate-guitar.com/tab/alt-j/bre...
