In [1]:
import requests
from requests_html import HTMLSession
from requests_html import AsyncHTMLSession
from bs4 import BeautifulSoup
import pprint
import numpy as np
import pandas as pd
import time
import pickle

In [2]:
#source: https://stackoverflow.com/questions/1987694/how-to-print-the-full-numpy-array-without-truncation
#function to print out full numpy array instead of excerpt
def fullprint(*args, **kwargs):
    from pprint import pprint
    import numpy
    opt = numpy.get_printoptions()
    numpy.set_printoptions(threshold=numpy.inf)
    pprint(*args, **kwargs)
    numpy.set_printoptions(**opt)
    

#parse HTML text to clean up tabs and return array of tabs (get rid of all the spaces)
def parseTab(TAB):
    lines = [] #will hold all lines of the tab
    for line in TAB.children: #iterate through lines of tab
        #only parse lines with notes (ignores lyric lines and chord lines)
        if line.text[0] == 'e' or line.text[0] == 'B' or line.text[0] == 'G' or line.text[0] == 'D' or line.text[0] == 'A' or line.text[0] == 'E':
            #print(line.text)
            val = list(line.text)
            if(lines):
                val = val[0:len(lines[0])] #some lines have additional text at the end so removes that
            lines.append(val) #adds to the lines array
    tabs = np.array(lines) #creates numpy array out of the lines
    tabsTranspose = tabs.T #takes transpose of array so we can easily iterate through the columns by making them the rows
    #trans[tabsTranspose=='-']= np.NaN
    mask_array = [any(char.isdigit() for char in row) for row in tabsTranspose] #creates mask that finds the lines that have notes (will be used to filter out empty lines)
    #print(mask_array)
    array = tabsTranspose[mask_array].T #filters out all lines without notes to get array with all notes
    #print()
    return array
    #print(array)
    #[''.join(row) for row in array]
    
#input HTML beautiful soup and get array of tabs
def getTab(soup):
    body = soup.find(class_='js-page js-global-wrapper') #finds main body of website which contains the strum patterns
    print(body)
    patterns = body.find_all(class_='_2o1rM') #every tabs line is within a class called '_2o1rM'
    print('Patterns:',patterns)
    allTabs = [] #will hold each row of tabs
    allTabs= [parseTab(pattern) for pattern in patterns] #parses each row of tab and stores in allTabs
    print('AllTabs:',allTabs)
    combinedTab = allTabs[0] 
    for i in range(1,len(allTabs)): #combine all the rows of tabs into one long tab
        combinedTab = np.append(combinedTab,allTabs[i],axis = 1)
    return combinedTab

#input a song's ultimate guitar url to get a numpy array of the tabs of that song
async def getData(url): #function to parse for strum patterns at given url
    asession = AsyncHTMLSession() #creates Async object
    
    r = await asession.get(url ,timeout=30) #requests website from server and waits for response

    #print("Rendering JavaScript")
    site = await r.html.arender(timeout=30) #renders the javascript, NOTE: Currently the speed bottleneck as the javascript is taking too long to render

    soup = BeautifulSoup(r.html.html, features='lxml') #creates BeautifulSoup object which will parse the html
    #print(soup.prettify())
    tab = getTab(soup) #calls function that will parse tabs
  
    await r.session.close() #closes chromium process
    await asession.close() #close session
    return tab #returns array of tabs

In [3]:
url2 = 'https://tabs.ultimate-guitar.com/tab/the-beatles/here-comes-the-sun-tabs-201130'
url1 = 'https://tabs.ultimate-guitar.com/tab/led-zeppelin/stairway-to-heaven-tabs-9488'
check = await getData(url2)  
fullprint(check.T) #print the transpose because the normal array would be too wide to print cleanly

<div class="js-page js-global-wrapper"><div class="_1Dqy7"></div><div class="_2Ms10"><div class="_1jLF0 _3URRI SiteWideBanner"><a href="https://www.ultimate-guitar.com/pro/?utm_source=UltimateGuitar&amp;utm_medium=TopBanner&amp;utm_campaign=PermanentBanner&amp;song=Here%20Comes%20The%20Sun&amp;artist=The%20Beatles&amp;utm_content" target="_blank"><div class="_2rTWY"><div class="_3hCtw _2RPTX _3Ydpc _3NuJB"><div class="A97aq"><span class="st_Jr">New Year Sale: <span class="_29x7d">Pro Access 80% OFF</span></span><div class="duF9V"><div class="_3E82k"><div>0</div><div>days</div></div><div class="knbpR">:</div><div class="_3E82k"><div>09</div><div>hrs</div></div><div class="knbpR">:</div><div class="_3E82k"><div>23</div><div>min</div></div><div class="knbpR">:</div><div class="_3E82k"><div>12</div><div>sec</div></div></div><button class="_3My1Y _3uYku _1yFTF vYOkd jumYi _2i99Q _2Kl2c" type="button"><span class="_16-Wy _3OIBi">GET SPECIAL OFFER</span></button></div></div></div></a><button 

In [4]:
#TESTING

asession = AsyncHTMLSession() #creates Async object
url2 = 'https://tabs.ultimate-guitar.com/tab/the-beatles/here-comes-the-sun-tabs-201130'
url1 = 'https://tabs.ultimate-guitar.com/tab/led-zeppelin/stairway-to-heaven-tabs-9488'
r = await asession.get(url1 ,timeout=30) #requests website from server and waits for response

#print("Rendering JavaScript")
site = await r.html.arender(timeout=30) #renders the javascript, NOTE: Currently the speed bottleneck as the javascript is taking too long to render

soup = BeautifulSoup(r.html.html, features='lxml') #creates BeautifulSoup object which will parse the html
body = soup.find(class_='js-page js-global-wrapper') #finds main body of website which contains the strum patterns
patterns = body.find_all(class_='_2J-ci') #every tabs line is within a class called '_2J-ci'

print('Patterns length:',len(patterns))
#print(patterns[0].text)
allTabs = []
allTabs= [parseTab(pattern) for pattern in patterns]
print('Alltabs length:',len(allTabs))
x = allTabs[0]
for i in range(1,len(allTabs)):
    #print('1st:',allTabs[i-1])
    print(allTabs[i].shape)
    print(i)
    print('2nd:',allTabs[i],'\n')
    x = np.append(x,allTabs[i],axis = 1)
print(x.shape)
fullprint(x[:,0:10])
fullprint(x.T)
#print(x[:,0:10])
#np.append(array,array2,axis=1).shape



Patterns length: 0
Alltabs length: 0


IndexError: list index out of range

In [None]:
#TESTING

# for line in patterns[8].children:
#     print(line.text)
    
x = patterns[17].find_all(class_='_1zlI0')
for line in x:
    if line.text[0] == 'e' or line.text[0] == 'B' or line.text[0] == 'G' or line.text[0] == 'D' or line.text[0] == 'A' or line.text[0] == 'E':
        print(line.text)

for line in x:
    print(line.text)
x[2]

In [None]:
#TESTING

lines = []
for line in patterns[0].children:
    lines.append(list(line.text))
tabs = np.array(lines)
tabsTranspose = z.T
#trans[tabsTranspose=='-']= np.NaN
mask_array = [any(char.isdigit() for char in row) for row in tabsTranspose]
#print(mask_array)
array = tabsTranspose[mask_array].T
#print(array)
#[''.join(row) for row in array]

lines = []
for line in patterns[1].children:
    lines.append(list(line.text))
tabs = np.array(lines)
tabsTranspose = z.T
#trans[tabsTranspose=='-']= np.NaN
mask_array = [any(char.isdigit() for char in row) for row in tabsTranspose]
#print(mask_array)
array2 = tabsTranspose[mask_array].T
#print(array)
[''.join(row) for row in array]
np.append(array,array2,axis=1).shape


In [None]:
array = [[1,2,3,4,5],[6,7,8,9,10]]
print([[num > 4 for num in row] for row in array])
print([any(num >4 for num in row) for row in array])