In [1]:
import requests
from bs4 import BeautifulSoup
import json

In [30]:
from typing import Any, Dict, Tuple

# TODO: improve parsing logic to handle errors
def get_wiki_tabs(url, timeout: float = 10.0) -> Tuple[str, Dict[str, Any]]:
    """Download a tab page and pull the tab text and metadata out of it.

    The page is expected to contain a ``<div class="js-store">`` element whose
    ``data-content`` attribute holds a JSON document. The tab text is read
    from ``store.page.data.tab_view.wiki_tab.content`` and the metadata from
    ``store.page.data.tab`` inside that document.

    Args:
        url: Address of the tab page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        A ``(tab_text, tab_metadata)`` tuple.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: The request failed or timed out.
        ValueError: The page has no ``js-store`` element, the element has no
            ``data-content`` attribute, or the attribute is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        KeyError: The embedded JSON lacks one of the expected keys.
    """
    # A browser-like User-Agent is sent with the request.
    # NOTE(review): presumably the site rejects the default client UA - confirm.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36"
    }

    # The context manager releases the session's pooled connections on exit.
    with requests.Session() as session:
        response = session.get(url, headers=headers, timeout=timeout)

    print(response.status_code)
    # Fail here on an error page rather than further down while parsing it.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    js_store = soup.find("div", {"class": "js-store"})
    if js_store is None:
        raise ValueError(f"No 'js-store' element found in page: {url}")

    json_content = js_store.attrs.get("data-content")
    if not json_content:
        raise ValueError(f"'js-store' element has no 'data-content' in page: {url}")

    page_content = json.loads(json_content)

    page_data = page_content["store"]["page"]["data"]
    tab_meta = page_data["tab"]
    tab_view = page_data["tab_view"]

    return tab_view['wiki_tab']['content'], tab_meta


In [24]:
# Tab page to scrape.
url = "https://tabs.ultimate-guitar.com/tab/the-fray/heartbeat-chords-1158352"

# Fetch the page; `tabs` is the raw tab text and `metadata` the accompanying
# metadata dict, as returned by get_wiki_tabs.
tabs, metadata = get_wiki_tabs(url)

200


In [25]:
# Sanity check: display the song title and artist found in the page metadata.
metadata["song_name"], metadata["artist_name"]

('Heartbeat', 'The Fray')

In [26]:
from dataclasses import dataclass, field
from typing import List

from music import Chord

@dataclass
class TabLine:
    """Chords and lyric text extracted from one ``[tab]...[/tab]`` block."""

    # Chord objects built from the block's [ch]...[/ch] markers, in order of
    # appearance; defaults to a fresh empty list per instance.
    chords: List[Chord] = field(default_factory=list)
    # Text of the block with the chord markers removed and outer whitespace
    # stripped; defaults to the empty string.
    lyrics: str = ""

In [27]:
import re

def parse_tab_text(tab_text: str) -> List[TabLine]:
    """Turn raw tab markup into a list of ``TabLine`` records.

    Every ``[tab]...[/tab]`` block in ``tab_text`` yields one ``TabLine``.
    Its chords come from the ``[ch]...[/ch]`` markers inside the block, and
    its lyrics are whatever text is left once those markers are removed.
    Text outside ``[tab]`` blocks is ignored.

    Args:
        tab_text: Markup containing ``[tab]`` blocks and ``[ch]`` markers.

    Returns:
        One ``TabLine`` per block, in document order; empty if there are no
        blocks.
    """
    # DOTALL lets a single block span several lines.
    block_re = re.compile(r"\[tab\](.*?)\[/tab\]", re.DOTALL)
    chord_re = re.compile(r"\[ch\](.*?)\[/ch\]")

    def _to_tab_line(block: str) -> TabLine:
        # Chord names are the marker contents with padding trimmed.
        names = chord_re.findall(block)
        return TabLine(
            chords=[Chord(name.strip()) for name in names],
            # Dropping the markers leaves only the lyric text.
            lyrics=chord_re.sub("", block).strip(),
        )

    return [_to_tab_line(found.group(1)) for found in block_re.finditer(tab_text)]

In [28]:
# Split the raw tab text fetched earlier into one TabLine per [tab] block.
parsed_tabs = parse_tab_text(tabs)

### Music Theory

In [31]:
from pathlib import Path


def sanitize_song_name(song_name: str) -> str:
    """Convert a song title into a string usable as a file name stem.

    Single and double quotes are removed, surrounding whitespace is trimmed,
    and spaces, hyphens and path separators become underscores.

    Args:
        song_name: Title as it appears in the page metadata.

    Returns:
        The sanitized title, e.g. ``"  Don't Stop-Believin'  "`` becomes
        ``"Dont_Stop_Believin"``.
    """
    # Drop quotes first so whitespace hidden behind them is still trimmed.
    cleaned = song_name.replace("'", "").replace('"', "")

    # Trim BEFORE replacing spaces; otherwise padding has already become
    # underscores and strip() can no longer remove it.
    cleaned = cleaned.strip()

    # Path separators are replaced too, so a title can never point into
    # another directory when used as a file name.
    for separator in (" ", "-", "/", "\\"):
        cleaned = cleaned.replace(separator, "_")

    return cleaned


song_name = sanitize_song_name(metadata["song_name"])

# NOTE(review): absolute, machine-specific location - point this at a
# project-relative directory before sharing the notebook.
OUTPUT_DIR = Path("/Users/ishaan/Workspace/chords/output")

# Append the extension by hand: Path.with_suffix() replaces everything after
# the last dot, which truncates titles that contain one
# (e.g. "Mr._Brightside" would become "Mr.txt").
output_file = OUTPUT_DIR / f"{song_name}.txt"

# Semitone offset passed to Chord.transpose for every chord.
# NOTE(review): presumably 0 leaves chords unchanged - confirm in `music`.
TRANSPOSE_SEMITONES = +0

output_lines = []
for tab in parsed_tabs:
    transposed_chords = [chord.transpose(TRANSPOSE_SEMITONES) for chord in tab.chords]
    transposed_chord_names = [chord.name for chord in transposed_chords]

    # Each tab block becomes two lines: the chord-name list, then its lyrics.
    output_lines.append(str(transposed_chord_names))
    output_lines.append(tab.lyrics)

# Create the target directory if needed so open() does not fail on a first run.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Explicit UTF-8 so non-ASCII lyrics are written the same on every platform.
with open(output_file, "w", encoding="utf-8") as f:
    f.write("\n".join(output_lines))

print(f"Output written to {output_file}")

Output written to /Users/ishaan/Workspace/chords/output/Heartbeat.txt
