## MuseScore Scraper

Tool: https://github.com/frankye8998/MusicalMusic  

[Reference 2](https://github.com/codeandproduce/music_research_dataset_midi/blob/9a5afe86f1f2d3fdbfa24de99122140e7a7cd60c/MuseScore/MuseScore%20Scraping%20Main.ipynb)

In [1]:
from lxml import html
import urllib
import requests
import shutil

In [2]:
from random import randint
from time import sleep

In [3]:
import json
from pathlib import Path
from tqdm import tqdm
import time

In [4]:
# links = []
# url = "https://musescore.com/hub/piano/solo-piano?sort=view_count"
# # url = "https://musescore.com/hub/piano/piano-duet?sort=view_count"
# page = requests.get(url)
# tree = html.fromstring(page.content)
# print(tree.xpath("//title/text()"))

In [5]:
# articles = tree.xpath("//article[@role='article']")

In [6]:
def get_data(a):
    """Collect the listing fields of one score from its article element.

    ``a`` is an element offering ``find``; the nodes it returns must offer
    ``attrib`` and ``text_content()``. If the bookmark link is not found
    directly under ``a``, the lookup is repeated inside its
    ``div[@class='col-right']`` child.

    The text of the ``meta`` div is split on a bullet followed by a newline
    and must produce exactly five fields (parts, pages, duration, a fourth
    field that is not returned, views); any other count raises ``ValueError``
    from the unpacking.

    Returns a dict with the keys ``link``, ``score_id``, ``title``,
    ``author``, ``parts``, ``pages``, ``duration`` and ``views``.
    """
    bookmark_path = "h2//a[@rel='bookmark']"
    score = a.find(bookmark_path)
    if score is None:
        # Alternative layout: the score details sit in the right-hand column.
        a = a.find("div[@class='col-right']")
        score = a.find(bookmark_path)

    href = score.attrib['href']
    heading = score.text_content().strip()
    uploader = a.find("div[@class='user']//a").text_content()

    meta_text = a.find("div[@class='meta']").text_content()
    fields = [piece.strip() for piece in meta_text.split('•\n')]
    parts, pages, duration, _age, views = fields

    return dict(
        link="https://musescore.com" + href,
        score_id=Path(href).name,  # last path component of the score URL
        title=heading,
        author=uploader,
        parts=parts,
        pages=pages,
        duration=duration,
        views=views,
    )

## That only covers page one — the listing continues on pages 2, 3, 4, 5, ... 100!
### So let's build the URL in a general form that takes a page number:

In [7]:
# Listing URL prefix: the page number is appended to it when a page is requested.
# Alternative listings, currently disabled:
# url_default = "https://musescore.com/hub/video_games/movie?sort=view_count&page=" # + the page number
# url_default = 'https://musescore.com/hub/piano/easy-piano?sort=view_count&page='
# url_default = "https://musescore.com/hub/piano/solo-piano?sort=view_count&page=" # + the page number
# url_default = "https://musescore.com/hub/piano/voice-piano?sort=view_count&page=" # + the page number
# url_default = "https://musescore.com/hub/piano?sort=view_count&page=" # + the page number
url_default = "https://musescore.com/groups/piano/sheetmusic?page="

In [8]:
# Fetch listing page 1 and collect its <article role="article"> elements.
# The scraping loop further down repeats these same steps for every page.
page = requests.get(url_default+str(1)) # url_default already ends in "page=", so this requests page 1
tree = html.fromstring(page.content)
articles = tree.xpath("//article[@role='article']")


In [9]:
# Cache of the scraped score metadata: loaded when it exists, written after scraping otherwise.
json_file = Path('musescore_groups.json')

In [10]:
if json_file.exists():
    # Reuse the score metadata saved by an earlier scraping run.
    links = json.loads(json_file.read_text())
        

In [11]:
def get_number(num_str):
    """Return the leading integer of a string such as ``'1,234 views'``.

    Only the text before the first space is used, and thousands separators
    (commas) are dropped before conversion. Raises ``ValueError`` when that
    leading token is not an integer.
    """
    leading_token, _, _ = num_str.partition(' ')
    return int(leading_token.replace(',', ''))

In [12]:
def is_popular_solo(d):
    """Tell whether a score record is a single-part score with over 100 views.

    ``d`` is a record carrying ``'views'`` and ``'parts'`` strings that
    ``get_number`` can parse (e.g. ``'431 views'``, ``'1 part'``).

    Returns ``False`` when either field is missing or cannot be parsed as a
    number, instead of raising.
    """
    try:
        views = get_number(d['views'])
        parts = get_number(d['parts'])
    # Only data problems count as "not popular"; a bare ``except`` would also
    # swallow KeyboardInterrupt/SystemExit while filtering a long scrape.
    except (KeyError, TypeError, AttributeError, ValueError):
        return False
    return views > 100 and parts == 1

In [13]:
if not json_file.exists():
    # No cached metadata yet: walk listing pages 1..402, keep the popular
    # solo scores, then save them to the cache file.
    links = []
    # tqdm takes the total from the range itself; the former hard-coded
    # total=403 was one more than the 402 pages actually visited, so the
    # progress bar could never reach 100%.
    for page_number in tqdm(range(1, 403)):
        # url_default ends in "page=", so appending the number selects the page.
        response = requests.get(url_default + str(page_number))
        tree = html.fromstring(response.content)

        articles = tree.xpath("//article[@role='article']")
        links.extend(d for d in map(get_data, articles) if is_popular_solo(d))

#         sleep(randint(1,4))
    with open(json_file, 'w') as fp:
        json.dump(links, fp)
print(links[:100])

[{'link': 'https://musescore.com/user/30284641/scores/5462906', 'score_id': '5462906', 'title': 'The Best Inspiration (50 Follower Special)', 'author': 'Bearsim', 'parts': '1 part', 'pages': '3 pages', 'duration': '02:59', 'views': '431 views'}, {'link': 'https://musescore.com/user/1089721/scores/5521557', 'score_id': '5521557', 'title': 'Sonate in G-moll "Ungestüm" (Impetuous)', 'author': 'Clemenzart ', 'parts': '1 part', 'pages': '8 pages', 'duration': '06:21', 'views': '216 views'}, {'link': 'https://musescore.com/user/1914361/scores/5391567', 'score_id': '5391567', 'title': 'Ice Caves - Original Piano Composition', 'author': 'AnonymousAlchemist', 'parts': '1 part', 'pages': '5 pages', 'duration': '03:53', 'views': '218 views'}, {'link': 'https://musescore.com/user/15026136/scores/5470220', 'score_id': '5470220', 'title': 'Background Music', 'author': 'CountryHomes', 'parts': '1 part', 'pages': '2 pages', 'duration': '02:05', 'views': '471 views'}, {'link': 'https://musescore.com/us

# Scrape links found

### Musical Music Lib

In [14]:

class MuseScoreException(Exception):
    """Common base class of the MuseScore-specific errors defined here."""


class InvalidFileExtension(MuseScoreException):
    """The requested download format is not one of the accepted extensions."""


class InvalidScoreID(MuseScoreException):
    """A score download request did not come back with HTTP status 200."""


class InvalidCredentials(MuseScoreException):
    """The login response carried no session cookie for the user."""


class InvalidSearchSort(MuseScoreException):
    """Presumably signals an unsupported search sort option.

    NOTE(review): nothing in the visible code raises this -- confirm usage.
    """

In [15]:
# import requests

# def urlretrieve(url: str, fn: Path, timeout=10):
#     with fn.open('wb') as f:
#         f.write(requests.get(url, allow_redirects=True, timeout=timeout).content)

In [16]:
import urllib
import urllib.request


import bs4
import requests

class MusicalMusic:
    """Musescore actions requiring an account.

    The constructor logs in and keeps the two session cookies
    (``mu_browser_uni`` and ``mu_user_new``); ``retrieve`` and ``download``
    send them with every score request.
    """

    # Extensions accepted by ``retrieve`` and ``download``.
    _FORMATS = ("mp3", "pdf", "mid", "mxl", "mscz")

    def __init__(self, username, password):
        """Log in as ``username`` and store the resulting session cookies.

        Raises:
            InvalidCredentials: the login response did not set the
                ``mu_user_new`` cookie.
        """
        self.username = username

        # The login page provides the CSRF token and the browser cookies that
        # must accompany the credentials.
        login_page = requests.get("https://musescore.com/user/login")
        soup = bs4.BeautifulSoup(login_page.text, "html.parser")
        csrf = soup.find("meta", {"name": "csrf-token"})["content"]
        mu_browser_uni = login_page.cookies['mu_browser_uni']
        cookies = {
            "mu_browser_uni": mu_browser_uni,
            "_csrf": login_page.cookies["_csrf"],
        }
        data = {
            "username": username,
            "password": password,
            "_csrf": csrf,
            "op": "Log in",
        }
        try:
            mu_user = requests.post(
                "https://musescore.com/user/auth/login/process",
                data=data,
                cookies=cookies,
                allow_redirects=False,
            ).cookies["mu_user_new"]
        except KeyError as e:
            raise InvalidCredentials(
                "Please check your username and password!") from e

        self.mu_browser_uni = mu_browser_uni
        self.mu_user = mu_user

    @classmethod
    def _check_format(cls, format):
        """Raise InvalidFileExtension unless ``format`` is an accepted extension."""
        if format not in cls._FORMATS:
            raise InvalidFileExtension("Must be mp3, pdf, mid, mxl, or mscz.")

    def retrieve(self, id, format="pdf"):
        """Retrieves Musescore data in bytes

        Args:
            id: score id used in the download URL.
            format: one of mp3, pdf, mid, mxl, mscz.

        Returns:
            The response body as ``bytes``.

        Raises:
            InvalidFileExtension: ``format`` is not an accepted extension.
            InvalidScoreID: the response status is not 200; the message is
                the status code.
        """
        self._check_format(format)
        newlink = f"https://musescore.com/score/{id}/download/{format}"
        cookies = {"mu_browser_uni": self.mu_browser_uni,
                   "mu_user_new": self.mu_user}
        # Certificate verification is left at the requests default (enabled),
        # as in the login requests; the session cookies must not be sent over
        # an unverified connection.
        response = requests.get(newlink, cookies=cookies)
        if response.status_code != 200:
            raise InvalidScoreID(str(response.status_code))
        return response.content

    def download(self, id, filename, format="mp3", proxy=None, timeout=10):
        """Download a score to ``filename``.

        Args:
            id: score id used in the download URL.
            filename: path of the file to write.
            format: one of mp3, pdf, mid, mxl, mscz.
            proxy: optional ``host:port`` used as the HTTPS proxy.
            timeout: socket timeout in seconds for the request.

        Raises:
            InvalidFileExtension: ``format`` is not an accepted extension.
            urllib.error.HTTPError: the server answered with an error status.
            urllib.error.URLError / OSError: the connection failed or timed out.
        """
        self._check_format(format)
        newlink = f"https://musescore.com/score/{id}/download/{format}"

        handlers = []
        if proxy:
            handlers.append(urllib.request.ProxyHandler({'https': proxy}))
        # A private opener is used (not install_opener) so the session cookie
        # and proxy do not leak into every other urllib request in the process.
        opener = urllib.request.build_opener(*handlers)
        cookieString = f"mu_browser_uni={self.mu_browser_uni};" \
                       f"mu_user_new={self.mu_user}"
        opener.addheaders = [("cookie", cookieString)]

        # Read the whole body before touching the disk: a failed or truncated
        # transfer then never leaves a partial file behind at ``filename``.
        with opener.open(newlink, timeout=timeout) as response:
            payload = response.read()
        with open(filename, "wb") as out:
            out.write(payload)


### Trying proxy

https://codelike.pro/create-a-crawler-with-rotating-ip-proxy-in-python/

In [17]:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import random

ua = UserAgent() # Source of random User-Agent strings (read via ua.random below)
proxies = [] # Filled below with {'ip': ..., 'port': ...} dicts

In [18]:
# Download the sslproxies.org page, presenting a random browser User-Agent.
proxies_req = Request('https://www.sslproxies.org/',
                      headers={'User-Agent': ua.random})
proxies_doc = urlopen(proxies_req).read().decode('utf8')

# Locate the proxy table in the page.
soup = BeautifulSoup(proxies_doc, 'html.parser')
proxies_table = soup.find(id='proxylisttable')

# Each table row contributes one entry: first cell is the IP, second the port.
for row in proxies_table.tbody.find_all('tr'):
    cells = row.find_all('td')
    proxies.append({'ip': cells[0].string, 'port': cells[1].string})

### More proxies

https://github.com/constverum/ProxyBroker

In [29]:
import asyncio
from proxybroker import Broker

# Proxies reported through show(), stored as {'ip': ..., 'port': ...} dicts.
more_proxies = list()


async def show(proxy_queue):
    """Drain ``proxy_queue`` into ``more_proxies`` until ``None`` arrives.

    Every item taken from the queue is printed and recorded with its
    ``host`` and ``port`` attributes; a ``None`` item ends the coroutine.
    """
    proxy = await proxy_queue.get()
    while proxy is not None:
        print('Found proxy: %s' % proxy)
        more_proxies.append({'ip': proxy.host, 'port': proxy.port})
        proxy = await proxy_queue.get()

# Queue shared by the broker and show(); presumably the broker puts found
# proxies on it and finishes with None -- confirm against the ProxyBroker docs.
proxy_queue = asyncio.Queue()
broker = Broker(proxy_queue)
countries = ['US', 'CA']
# Run the proxy search and the show() consumer concurrently.
# NOTE: a top-level ``await`` only works where the environment supports it
# (e.g. an IPython/Jupyter cell); it is a syntax error in a plain script.
tasks = await asyncio.gather(
    broker.find(types=['HTTPS'], countries=countries, max_resp_time=4, limit=50),
    show(proxy_queue))

Found proxy: <Proxy US 0.24s [HTTPS] 68.183.143.161:8080>
Found proxy: <Proxy US 0.25s [HTTPS] 68.183.103.88:8080>
Found proxy: <Proxy US 0.10s [HTTPS] 159.89.236.26:8080>
Found proxy: <Proxy US 0.10s [HTTPS] 104.236.248.219:3128>
Found proxy: <Proxy US 0.27s [HTTPS] 68.183.121.154:8080>
Found proxy: <Proxy CA 0.13s [HTTPS] 199.201.122.89:3128>
Found proxy: <Proxy CA 0.14s [HTTPS] 144.217.69.149:1080>
Found proxy: <Proxy CA 0.16s [HTTPS] 159.203.14.149:4000>
Found proxy: <Proxy US 0.44s [HTTPS] 173.249.0.209:3128>
Found proxy: <Proxy US 0.39s [HTTPS] 68.183.35.48:8080>
Found proxy: <Proxy US 0.39s [HTTPS] 68.142.183.89:80>
Found proxy: <Proxy US 0.24s [HTTPS] 35.189.90.214:3128>
Found proxy: <Proxy US 0.54s [HTTPS] 68.183.180.184:8080>
Found proxy: <Proxy US 0.56s [HTTPS] 104.248.147.41:8080>
Found proxy: <Proxy US 0.55s [HTTPS] 67.23.64.98:53281>
Found proxy: <Proxy US 0.55s [HTTPS] 64.185.231.98:1080>
Found proxy: <Proxy US 0.55s [HTTPS] 139.180.209.153:80>
Found proxy: <Proxy US 0.8

### Random proxy

In [31]:
# Pool of proxies used by the download loop.
# NOTE: this binds all_proxies to the same list object as more_proxies (no copy),
# so deleting an entry from all_proxies also removes it from more_proxies.
# all_proxies = proxies + more_proxies
all_proxies = more_proxies
def random_proxy():
    """Return a random valid index into ``all_proxies``.

    An index (not the entry itself) is returned so the caller can delete the
    proxy from the list if it turns out not to work. Raises ``ValueError``
    when ``all_proxies`` is empty.
    """
    pool_size = len(all_proxies)
    return random.randrange(pool_size)

# No proxy selected yet; the proxy-based download loop picks one per request.
proxy = None
# Disabled one-off selection, kept for reference:
# # Choose a random proxy
# proxy_index = random_proxy()
# proxy = all_proxies[proxy_index]
# proxy_url = proxy['ip'] + ':' + str(proxy['port']); proxy_url

## Actual scraping

In [21]:
# Score ids that the download loops below skip.
invalid_ids = []

In [22]:
# Proxies that the proxy-based download loop has removed from all_proxies.
deleted_proxies = []

In [23]:
# accounts.json holds a list of (username, password) pairs.
with open('accounts.json') as fp:
    accounts = json.load(fp)

# One logged-in client per account (each constructor performs a login).
instances = [MusicalMusic(user, secret) for user, secret in accounts]
# Client currently in use; chosen by the download loop.
instance = None

In [40]:
# Downloads are written to data/<score_id>.mxl; create the directory up front,
# otherwise every download fails when the output file is opened.
Path("data").mkdir(parents=True, exist_ok=True)

for idx, link in enumerate(tqdm(links)):
    score_id = link['score_id']
    out_file = f"data/{score_id}.mxl"
    # Skip scores already on disk and ids known to be unavailable.
    if Path(out_file).exists() or score_id in invalid_ids:
        continue
    # At every 10th list position, or right after a failure, switch to a
    # randomly chosen logged-in account (this loop does not use proxies).
    if idx % 10 == 0 or instance is None:
        instance = random.choice(instances)
    try:
#         print('Downloading score id:', score_id)
        instance.download(score_id, out_file, format='mxl')
    except Exception as e:
        print('Could not download id:', score_id, e)
        # A 404 means the score no longer exists: remember it so later passes
        # in this session do not request it again. Other errors (e.g. 403)
        # may be specific to the account, so those ids stay eligible.
        if getattr(e, 'code', None) == 404:
            invalid_ids.append(score_id)
        # Force a different account to be picked for the next score.
        instance = None
    








  0%|          | 0/5149 [00:00<?, ?it/s][A[A[A[A[A[A[A






 97%|█████████▋| 5004/5149 [00:00<00:00, 13438.01it/s][A[A[A[A[A[A[A

Could not download id: 65726 HTTP Error 403: Forbidden








 97%|█████████▋| 5004/5149 [00:17<00:00, 12268.10it/s][A[A[A[A[A[A

Could not download id: 63677 HTTP Error 403: Forbidden
Could not download id: 63652 HTTP Error 403: Forbidden









 97%|█████████▋| 5004/5149 [00:11<00:00, 13438.01it/s][A[A[A[A[A[A[A






 98%|█████████▊| 5043/5149 [00:12<00:09, 11.04it/s]   [A[A[A[A[A[A[A






 98%|█████████▊| 5044/5149 [00:12<00:26,  3.99it/s][A[A[A[A[A[A[A






 98%|█████████▊| 5045/5149 [00:13<00:53,  1.95it/s][A[A[A[A[A[A[A






 98%|█████████▊| 5046/5149 [00:14<00:56,  1.83it/s][A[A[A[A[A[A[A






 98%|█████████▊| 5047/5149 [00:14<00:51,  1.99it/s][A[A[A[A[A[A[A

Could not download id: 59325 HTTP Error 404: Not Found









 98%|█████████▊| 5048/5149 [00:15<01:00,  1.68it/s][A[A[A[A[A[A[A






 98%|█████████▊| 5049/5149 [00:17<01:34,  1.05it/s][A[A[A[A[A[A[A






 98%|█████████▊| 5050/5149 [00:18<01:24,  1.17it/s][A[A[A[A[A[A[A






 98%|█████████▊| 5051/5149 [00:19<01:31,  1.07it/s][A[A[A[A[A[A[A






 98%|█████████▊| 5052/5149 [00:19<01:21,  1.18it/s][A[A[A[A[A[A[A






 98%|█████████▊| 5053/5149 [00:22<02:04,  1.29s/it][A[A[A[A[A[A[A






 98%|█████████▊| 5055/5149 [00:23<01:35,  1.02s/it][A[A[A[A[A[A[A






 98%|█████████▊| 5056/5149 [00:24<01:57,  1.26s/it][A[A[A[A[A[A[A






 98%|█████████▊| 5057/5149 [00:25<01:38,  1.07s/it][A[A[A[A[A[A[A






 98%|█████████▊| 5058/5149 [00:26<01:29,  1.02it/s][A[A[A[A[A[A[A






 98%|█████████▊| 5059/5149 [00:27<01:23,  1.08it/s][A[A[A[A[A[A[A






 98%|█████████▊| 5060/5149 [00:28<01:46,  1.20s/it][A[A[A[A[A[A[A






 98%|█████████▊| 5061/5149 [00:30<01:45,  1.1

Could not download id: 49110 HTTP Error 404: Not Found









 99%|█████████▉| 5114/5149 [01:13<00:25,  1.40it/s][A[A[A[A[A[A[A






 99%|█████████▉| 5115/5149 [01:14<00:25,  1.35it/s][A[A[A[A[A[A[A






 99%|█████████▉| 5116/5149 [01:15<00:28,  1.17it/s][A[A[A[A[A[A[A






 99%|█████████▉| 5117/5149 [01:16<00:26,  1.19it/s][A[A[A[A[A[A[A






 99%|█████████▉| 5118/5149 [01:18<00:33,  1.08s/it][A[A[A[A[A[A[A






 99%|█████████▉| 5119/5149 [01:20<00:38,  1.29s/it][A[A[A[A[A[A[A






 99%|█████████▉| 5120/5149 [01:21<00:40,  1.39s/it][A[A[A[A[A[A[A






 99%|█████████▉| 5121/5149 [01:22<00:32,  1.16s/it][A[A[A[A[A[A[A






 99%|█████████▉| 5122/5149 [01:23<00:26,  1.00it/s][A[A[A[A[A[A[A






 99%|█████████▉| 5123/5149 [01:24<00:31,  1.20s/it][A[A[A[A[A[A[A






100%|█████████▉| 5124/5149 [01:26<00:35,  1.43s/it][A[A[A[A[A[A[A






100%|█████████▉| 5125/5149 [01:27<00:28,  1.18s/it][A[A[A[A[A[A[A






100%|█████████▉| 5126/5149 [01:27<00:23,  1.0

In [24]:
len(all_proxies)

50

In [25]:
import time

In [38]:
# Downloads are written to data/<score_id>.mxl; create the directory up front,
# otherwise every download fails when the output file is opened.
Path("data").mkdir(parents=True, exist_ok=True)

for idx, link in enumerate(tqdm(links)):
    score_id = link['score_id']
    out_file = f"data/{score_id}.mxl"
    # Skip scores already on disk and ids already marked as failed.
    if Path(out_file).exists() or score_id in invalid_ids:
        continue
    # Failing proxies are removed below, so the pool can run dry; stop cleanly
    # instead of letting random.randint(0, -1) raise ValueError.
    if not all_proxies:
        print('No proxies left; stopping.')
        break

    # Every request uses a freshly drawn proxy and a freshly drawn account.
    proxy_index = random.randint(0, len(all_proxies) - 1)
    proxy = all_proxies[proxy_index]
    proxy_url = proxy['ip'] + ':' + str(proxy['port'])
    instance = random.choice(instances)

    print('Downloading score id:', score_id)
    error = None
    start = time.time()
    try:
        instance.download(score_id, out_file, format='mxl', proxy=proxy_url)
    except Exception as e:
        error = e
    elapsed = time.time() - start

    if error is None and elapsed <= 10:
        continue  # downloaded quickly enough: keep this proxy in the pool

    if error is not None:
        print('Could not download id:', score_id)
        print('Error:', error)
        invalid_ids.append(score_id)
    else:
        # The file was written, so the score itself is fine; only the proxy
        # was too slow and is dropped below.
        print('Took too long to download id:', score_id)

    # Retire the proxy that failed or was too slow.
    deleted_proxies.append(all_proxies.pop(proxy_index))
    print('Proxy ' + proxy_url + ' deleted.')
    print(instance.username)
    proxy = None
    instance = None
    







  0%|          | 0/5149 [00:00<?, ?it/s][A[A[A[A[A[A

Downloading score id: 79260








 96%|█████████▋| 4960/5149 [00:00<00:00, 8420.46it/s][A[A[A[A[A[A

Could not download id: 79260
Error: HTTP Error 403: Forbidden
Proxy 68.183.121.154:8080 deleted.
cwkeam
Downloading score id: 68966
Could not download id: 68966
Error: HTTP Error 403: Forbidden
Proxy 54.84.154.208:3128 deleted.
tulebo
Downloading score id: 68939
Could not download id: 68939
Error: HTTP Error 404: Not Found
Proxy 167.99.52.107:8888 deleted.
kate.zuo
Downloading score id: 66308
Could not download id: 66308
Error: HTTP Error 404: Not Found
Proxy 35.245.208.185:3128 deleted.
jurassictech
Downloading score id: 65726








[A[A[A[A[A[A

KeyboardInterrupt: 