In [20]:
import requests
from requests_html import HTMLSession
from requests_html import AsyncHTMLSession
from bs4 import BeautifulSoup
import pprint
import numpy as np
import pandas as pd
import time
import pickle
import re

In [29]:
#source: https://stackoverflow.com/questions/1987694/how-to-print-the-full-numpy-array-without-truncation
#function to print out full numpy array instead of excerpt
def fullprint(*args, **kwargs):
    from pprint import pprint
    import numpy
    opt = numpy.get_printoptions()
    numpy.set_printoptions(threshold=numpy.inf)
    pprint(*args, **kwargs)
    numpy.set_printoptions(**opt)
    

#parse HTML text to clean up tabs and return array of tabs (get rid of all the spaces)
def parseTab(TAB):
    lines = [] #will hold all lines of the tab
    for line in TAB.children: #iterate through lines of tab
        #only parse lines with notes (ignores lyric lines and chord lines)
        if line.text[0] == 'e' or line.text[0] == 'B' or line.text[0] == 'G' or line.text[0] == 'D' or line.text[0] == 'A' or line.text[0] == 'E':
            #print(line.text)
            val = list(line.text)
            if(lines):
                val = val[0:len(lines[0])] #some lines have additional text at the end so removes that
            lines.append(val) #adds to the lines array
    tabs = np.array(lines) #creates numpy array out of the lines
    tabsTranspose = tabs.T #takes transpose of array so we can easily iterate through the columns by making them the rows
    #trans[tabsTranspose=='-']= np.NaN
    mask_array = [any(char.isdigit() for char in row) for row in tabsTranspose] #creates mask that finds the lines that have notes (will be used to filter out empty lines)
    #print(mask_array)
    array = tabsTranspose[mask_array].T #filters out all lines without notes to get array with all notes
    #print()
    return array
    #print(array)
    #[''.join(row) for row in array]
    
#input HTML beautiful soup and get array of tabs
def getTab(soup):
    body = soup.find(class_='js-page js-global-wrapper') #finds main body of website which contains the strum patterns
    #print(body)
    patterns = body.find_all(class_='_2o1rM') #every tabs line is within a class called '_2o1rM'
    #print('Patterns:',patterns)
    allTabs = [] #will hold each row of tabs
    allTabs= [parseTab(pattern) for pattern in patterns] #parses each row of tab and stores in allTabs
    #print('AllTabs:',allTabs)
    combinedTab = allTabs[0] 
    for i in range(1,len(allTabs)): #combine all the rows of tabs into one long tab
        combinedTab = np.append(combinedTab,allTabs[i],axis = 1)
    return combinedTab


def getAdditionalInfo(soup):
    body = soup.find(class_='js-page js-global-wrapper') #finds main body of website which contains the strum patterns
    info = soup.find(class_='_2I_M-')
    capo = 0
    key = ''
    tuning = ''
    for line in info:
        if 'Capo' in line.text:
            #https://stackoverflow.com/questions/4289331/how-to-extract-numbers-from-a-string-in-python
            capo = [int(val) for val in re.findall(r'\d+', 'Capo: 7th')][0]
        if 'Key' in line.text:
            key = line.text.split()[1]
        if 'Tuning' in line.text:
            tuning = line.text.split(': ')[1]
    return capo,key,tuning


#input a song's ultimate guitar url to get a numpy array of the tabs of that song
async def getSongData(url): #function to parse for strum patterns at given url
    start_time = time.time()
    asession = AsyncHTMLSession() #creates Async object
    
    r = await asession.get(url ,timeout=30) #requests website from server and waits for response

    #print("Rendering JavaScript")
    site = await r.html.arender(timeout=30) #renders the javascript, NOTE: Currently the speed bottleneck as the javascript is taking too long to render
    
    print("Render Website --- %s seconds ---" % (time.time() - start_time))
    start_time = time.time()
    
    soup = BeautifulSoup(r.html.html, features='lxml') #creates BeautifulSoup object which will parse the html
    #print(soup.prettify())
    capo,key,tuning = getAdditionalInfo(soup)
    print('Capo:',capo,"Key:",key,"Tuning:",tuning)
    tab = getTab(soup) #calls function that will parse tabs
      
    print("Process Data --- %s seconds ---" % (time.time() - start_time))
    
    await r.session.close() #closes chromium process
    await asession.close() #close session
    return capo,key,tuning,tab #returns array of tabs

async def scrapeSongs(numSongs):
    asession = AsyncHTMLSession() #creates Async object
    page = 1 #starts on page 1 of songs website
    URL = 'https://www.ultimate-guitar.com/explore?type[]=Tabs&page='
    count = 0 #will hold # of songs checked
    data = []
    while count < numSongs and page <= 20: #each page holds 50 songs so sets limits on number of pages checked
        r = await asession.get(URL+str(page),timeout=30) #requests website from server and waits for response
        
        site = await r.html.arender(timeout=30) #renders javascript on page

        soup = BeautifulSoup(r.html.html, features='lxml') #creates BeautifulSoup object which will parse the html
        songs = soup.find_all(class_='_36xEX _3_qAd _17l1x') #the class that holds the url for each song
        for song in songs: #iterates through all songs on page
            try: #try except statements in case javascript rendering times out
                #print(song['href'])
                capo,key,tuning,tab = await getData(song['href']) #call getData using url of this song to get song data
                data.append((song.text,song['href'],key,tuning,capo,tab)) #add tuple containing song name, url, and strum patterns to list
            except:
                print('Rendering Timeout after',count) 
                
            count += 1 #increase count on # of songs scraped
            if count > numSongs: #if we've scraped enough songs, break and return
                break
            print('Scraping Song #',count)
        #await r.session.close() #closes chromium process for page
        page += 1 #after all songs scraped from page, increment page count to access next page of songs
    await r.session.close()
    await asession.close() #closes AnsyncHTMLSession
    return data

In [30]:
url2 = 'https://tabs.ultimate-guitar.com/tab/the-beatles/here-comes-the-sun-tabs-201130'
url1 = 'https://tabs.ultimate-guitar.com/tab/led-zeppelin/stairway-to-heaven-tabs-9488'
check = await getData(url2)
print(check)
#fullprint(check.T) #print the transpose because the normal array would be too wide to print cleanly

Render Website --- 2.910392999649048 seconds ---
Capo: 7 Key: D Tuning: E A D G B E
Process Data --- 0.045372962951660156 seconds ---
[7, 'D', 'E A D G B E', array([['2', '-', '0', ..., '-', '-', '2'],
       ['3', '3', '-', ..., '-', '-', '3'],
       ['2', '-', '-', ..., '-', '-', '2'],
       ['0', '-', '-', ..., '-', '-', '0'],
       ['-', '-', '-', ..., '-', '2', '-'],
       ['-', '-', '-', ..., '3', '-', '-']], dtype='<U1')]


In [31]:
test = await scrapeSongs(10)
dataframe = pd.DataFrame(test)
dataframe

Render Website --- 3.080052137374878 seconds ---
Capo: 0 Key: Am Tuning: E A D G B E
Process Data --- 0.0586240291595459 seconds ---
Scraping Song # 1
Render Website --- 4.204690933227539 seconds ---
Capo: 0 Key: Em Tuning: E A D G B E
Process Data --- 0.04242706298828125 seconds ---
Scraping Song # 2
Render Website --- 2.6182219982147217 seconds ---
Capo: 0 Key: Gm Tuning: E A D G B E
Rendering Timeout after 2
Scraping Song # 3
Render Website --- 2.8985490798950195 seconds ---
Capo: 0 Key: Bm Tuning: E A D G B E
Process Data --- 0.06677412986755371 seconds ---
Scraping Song # 4
Render Website --- 2.696859836578369 seconds ---
Capo: 7 Key: D Tuning: E A D G B E
Process Data --- 0.04446291923522949 seconds ---
Scraping Song # 5
Render Website --- 2.6497890949249268 seconds ---
Capo: 0 Key: E Tuning: D G C F A D
Process Data --- 0.035333871841430664 seconds ---
Scraping Song # 6
Render Website --- 2.95793080329895 seconds ---
Capo: 0 Key: C Tuning: E A D G B E
Process Data --- 0.05065512

Unnamed: 0,0,1,2,3,4,5
0,Stairway To Heaven,https://tabs.ultimate-guitar.com/tab/led-zeppe...,Am,E A D G B E,0,"[[-, -, -, 5, 7, -, -, 7, 8, -, -, 8, 2, -, -,..."
1,Nothing Else Matters,https://tabs.ultimate-guitar.com/tab/metallica...,Em,E A D G B E,0,"[[-, -, -, 0, -, -, -, -, -, 0, -, -, -, -, -,..."
2,Hotel California (ver 2),https://tabs.ultimate-guitar.com/tab/eagles/ho...,Bm,E A D G B E,0,"[[-, -, -, -, -, -, 7, 1, 0, 1, 2, -, -, 9, -,..."
3,Here Comes The Sun (ver 3),https://tabs.ultimate-guitar.com/tab/the-beatl...,D,E A D G B E,7,"[[2, -, 0, 2, -, 2, 0, -, -, -, 0, -, -, -, -,..."
4,Come As You Are,https://tabs.ultimate-guitar.com/tab/nirvana/c...,E,D G C F A D,0,"[[-, -, -, -, -, -, -, -, -, -, -, -, -, -, -,..."
5,Wish You Were Here (ver 5),https://tabs.ultimate-guitar.com/tab/pink-floy...,C,E A D G B E,0,"[[-, -, -, -, 3, 3, 3, 3, 3, 3, -, -, -, 3, 3,..."
6,Star Shopping Intro,https://tabs.ultimate-guitar.com/tab/lil-peep/...,G,E A D G B E,0,"[[-, -, -, -, -, -, -, -, -, -, -, -, -, -, -,..."


In [4]:
# #TESTING

# asession = AsyncHTMLSession() #creates Async object
# url2 = 'https://tabs.ultimate-guitar.com/tab/the-beatles/here-comes-the-sun-tabs-201130'
# url1 = 'https://tabs.ultimate-guitar.com/tab/led-zeppelin/stairway-to-heaven-tabs-9488'
# r = await asession.get(url1 ,timeout=30) #requests website from server and waits for response

# #print("Rendering JavaScript")
# site = await r.html.arender(timeout=30) #renders the javascript, NOTE: Currently the speed bottleneck as the javascript is taking too long to render

# soup = BeautifulSoup(r.html.html, features='lxml') #creates BeautifulSoup object which will parse the html
# body = soup.find(class_='js-page js-global-wrapper') #finds main body of website which contains the strum patterns
# patterns = body.find_all(class_='_2J-ci') #every tabs line is within a class called '_2J-ci'

# print('Patterns length:',len(patterns))
# #print(patterns[0].text)
# allTabs = []
# allTabs= [parseTab(pattern) for pattern in patterns]
# print('Alltabs length:',len(allTabs))
# x = allTabs[0]
# for i in range(1,len(allTabs)):
#     #print('1st:',allTabs[i-1])
#     print(allTabs[i].shape)
#     print(i)
#     print('2nd:',allTabs[i],'\n')
#     x = np.append(x,allTabs[i],axis = 1)
# print(x.shape)
# fullprint(x[:,0:10])
# fullprint(x.T)
# #print(x[:,0:10])
# #np.append(array,array2,axis=1).shape



Patterns length: 0
Alltabs length: 0


IndexError: list index out of range

In [None]:
# #TESTING

# # for line in patterns[8].children:
# #     print(line.text)
    
# x = patterns[17].find_all(class_='_1zlI0')
# for line in x:
#     if line.text[0] == 'e' or line.text[0] == 'B' or line.text[0] == 'G' or line.text[0] == 'D' or line.text[0] == 'A' or line.text[0] == 'E':
#         print(line.text)

# for line in x:
#     print(line.text)
# x[2]

In [None]:
# #TESTING

# lines = []
# for line in patterns[0].children:
#     lines.append(list(line.text))
# tabs = np.array(lines)
# tabsTranspose = z.T
# #trans[tabsTranspose=='-']= np.NaN
# mask_array = [any(char.isdigit() for char in row) for row in tabsTranspose]
# #print(mask_array)
# array = tabsTranspose[mask_array].T
# #print(array)
# #[''.join(row) for row in array]

# lines = []
# for line in patterns[1].children:
#     lines.append(list(line.text))
# tabs = np.array(lines)
# tabsTranspose = z.T
# #trans[tabsTranspose=='-']= np.NaN
# mask_array = [any(char.isdigit() for char in row) for row in tabsTranspose]
# #print(mask_array)
# array2 = tabsTranspose[mask_array].T
# #print(array)
# [''.join(row) for row in array]
# np.append(array,array2,axis=1).shape


In [19]:
#just learning about list comprehension and in line loops
# array = [[1,2,3,4,5],[6,7,8,9,10]]
# print([[num > 4 for num in row] for row in array])
# print([any(num >4 for num in row) for row in array])
string = 'testing string split function'
string.split()
import re
capo = [int(val) for val in re.findall(r'\d+', 'Capo: 7th')][0]
capo

7