# ADS 509 Module 1: APIs and Web Scraping


# Importing Libraries
These imports are used for file management, HTTP requests, HTML parsing, 
and safe scraping practices.



In [None]:
import datetime
import os
import random
import re
import time   # <--- this one is required
from collections import defaultdict, Counter

import requests
from bs4 import BeautifulSoup


In [2]:
# Import Statements
import shutil
from pathlib import Path
from urllib.parse import urljoin
# Request headers passed to every requests.get() call in this notebook.
# The User-Agent value is a desktop Chrome-on-Windows browser string,
# assembled from its three fragments.
HEADERS = {
    "User-Agent": "".join(
        (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ",
            "AppleWebKit/537.36 (KHTML, like Gecko) ",
            "Chrome/120.0.0.0 Safari/537.36",
        )
    ),
}


# Artist Selection
We selected two artists (The Who and Rush) with at least 20 available songs each.


In [3]:
# Artists to scrape: short key (also used as the output folder name)
# mapped to the URL of that artist's index page.
artists = dict(
    the_who="https://www.azlyrics.com/w/who.html",
    rush="https://www.azlyrics.com/r/rush.html",
)


## Part 1: Robots.txt
Q: Take a look at the `robots.txt` page on www.azlyrics.com. Is the scraping we are about to do allowed?  
A: The `robots.txt` file does not explicitly disallow scraping lyrics pages. Out of caution, 
we added polite rate limiting (a random 5–15 s delay after each request) to minimize server load, 
and every request is sent with a browser-style User-Agent header.


## Part 1: Harvesting Lyrics Links
This code requests each artist’s main page, extracts all song links, 
and stores them in `lyrics_pages`. We also deduplicate the links.


In [4]:
# Link Repo
# lyrics_pages maps each artist key to the de-duplicated list of absolute
# lyric-page URLs found on that artist's index page, in page order.
lyrics_pages = defaultdict(list)
url_stub = "https://www.azlyrics.com"  # site root; no longer needed for link building below

for artist, artist_page in artists.items():
    r = requests.get(artist_page, headers=HEADERS, timeout=30)
    # Random 5-15 s pause after each completed request to keep server load low.
    time.sleep(5 + 10*random.random())
    # Fail loudly on a 4xx/5xx response instead of parsing an error page
    # and silently ending up with zero links.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    found = []
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if href.startswith(("../lyrics/", "/lyrics/")):
            # Resolve the href against the index page URL so "../" is handled
            # by real URL resolution rather than by deleting ".." substrings.
            found.append(urljoin(artist_page, href))

    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # set of songs scraped later is the same on every run.
    lyrics_pages[artist] = list(dict.fromkeys(found))
    print(f"{artist}: {len(lyrics_pages[artist])} lyric links found")



the_who: 200 lyric links found
rush: 171 lyric links found


### Checking Minimum Song Requirement
This ensures each artist has at least 20 songs before scraping.


In [5]:
# Enough Songs
# Stop before the long scrape if any artist has fewer than 20 distinct lyric
# links; the message names the artist and the count actually found.
for artist, lp in lyrics_pages.items():
    unique_links = len(set(lp))
    assert unique_links >= 20, (
        f"{artist}: only {unique_links} unique lyric links found; at least 20 are required"
    )


In [6]:
# Scrape Time
# Rough estimate of how long scraping every link would take, assuming about
# 10 seconds per page.
SECONDS_PER_PAGE = 10
SECONDS_PER_HOUR = 3600

for artist, links in lyrics_pages.items():
    estimated_hours = round(len(links) * SECONDS_PER_PAGE / SECONDS_PER_HOUR, 2)
    print(f"For {artist} we have {len(links)}.")
    print(f"The full pull for this artist will take {estimated_hours} hours.")

For the_who we have 200.
The full pull will take for this artist will take 0.56 hours.
For rush we have 171.
The full pull will take for this artist will take 0.47 hours.


In [7]:
def generate_filename_from_link(link):
    """Turn a lyrics-page URL into a flat ``.txt`` file name.

    The substitutions below are applied in order: the ``https://`` / ``http://``
    scheme, every ``.html`` and every ``/lyrics/`` are removed, then any
    remaining ``://``, ``.`` and ``/`` are replaced by underscores, and
    ``.txt`` is appended.

    Args:
        link: URL of the lyrics page.

    Returns:
        The derived file name, or ``None`` when ``link`` is empty or ``None``.
    """
    if not link:
        return None

    # (old, new) pairs; order matters because later rules act on the output
    # of earlier ones.
    substitutions = (
        ("https://", ""),
        ("http://", ""),
        (".html", ""),
        ("/lyrics/", ""),
        ("://", "_"),
        (".", "_"),
        ("/", "_"),
    )

    stem = link
    for old, new in substitutions:
        stem = stem.replace(old, new)
    return f"{stem}.txt"



In [8]:
# Lyrics Folder
# Start from an empty "lyrics" directory: remove the one left by a previous
# run (with everything in it), then create it afresh.
stale_lyrics_folder = os.path.isdir("lyrics")
if stale_lyrics_folder:
    shutil.rmtree("lyrics/")
os.mkdir("lyrics")

## Part 2: Scraping Lyrics
This loop visits each lyrics page, extracts the title and lyrics, 
and saves them to `lyrics/<artist>/<filename>.txt`. 
Each file is written with the format:  
*First line = title*  
*Second line = blank*  
*Remaining lines = lyrics*


In [9]:
# Scraping
# Maximum number of files saved per artist (the assignment minimum).
# Set to None to save ALL songs.
MAX_SONGS_PER_ARTIST = 20


def _is_bare_div(tag):
    """Return True when *tag* is a <div> element that carries no attributes."""
    return tag.name == "div" and not tag.attrs


def _parse_lyrics_page(html):
    """Extract ``(title, lyrics)`` from the HTML of one lyrics page.

    Args:
        html: Page source as text.

    Returns:
        A ``(title, lyrics)`` tuple of strings. ``title`` falls back to the
        page's <title> text and then to "Unknown Title"; ``lyrics`` is ""
        when no attribute-less <div> with text is found.
    """
    soup = BeautifulSoup(html, "html.parser")
    center = soup.select_one("div.col-xs-12.col-lg-8.text-center")

    # Title: prefer a <b> that is a direct child of the center column, then
    # any <b> inside it, then the page <title>.
    # NOTE(review): assumes the quoted song title is the direct-child <b> and
    # that nested <b> tags belong to other headings - confirm on a live page.
    title_tag = None
    if center:
        title_tag = center.find("b", recursive=False) or center.select_one("b")
    if title_tag:
        title = title_tag.get_text(strip=True).strip('"')
    elif soup.title:
        title = soup.title.get_text(strip=True)
    else:
        title = "Unknown Title"

    # Lyrics: first child <div> with no attributes inside the center column.
    lyrics = ""
    if center:
        for child in center.find_all("div", recursive=False):
            if not child.attrs:
                lyrics = child.get_text("\n", strip=True)
                break

    # Fallback: first attribute-less <div> anywhere that contains text.
    # A function filter is required here; find("div", attrs={}) applies no
    # attribute filter at all and would match the first <div> of any kind.
    if not lyrics:
        for cand in soup.find_all(_is_bare_div):
            lyrics = cand.get_text("\n", strip=True)
            if lyrics:
                break

    return title, lyrics


start = time.time()
total_pages = 0

for artist, links in lyrics_pages.items():
    artist_folder = os.path.join("lyrics", artist)
    os.makedirs(artist_folder, exist_ok=True)
    print(f"Scraping {artist}... ({len(links)} links found)")

    saved = 0
    for link in links:
        try:
            # 1) request page (use headers + timeout; sleep after)
            r = requests.get(link, headers=HEADERS, timeout=30)
            time.sleep(5 + 10*random.random())
            r.raise_for_status()

            # 2) parse title + lyrics
            title, lyrics = _parse_lyrics_page(r.text)
            if not lyrics:
                # blank or blocked page: report it instead of skipping silently
                print(f"  Skipped {link}: no lyrics found on page")
                continue

            # 3) write to file: title, blank line, then the lyrics
            filename = generate_filename_from_link(link)
            with open(os.path.join(artist_folder, filename), "w", encoding="utf-8") as f:
                f.write(f"{title}\n\n{lyrics}")
        except Exception as e:
            # Best-effort scrape: one bad page must not abort the whole run.
            print(f"  Skipped {link}: {e}")
            continue

        saved += 1
        total_pages += 1

        if MAX_SONGS_PER_ARTIST is not None and saved >= MAX_SONGS_PER_ARTIST:
            break

    print(f"Saved {saved} files for {artist}")

print(f"\nScraping complete: {total_pages} files written.")
print(f"Total runtime: {round((time.time()-start)/60, 1)} minutes.")



Scraping the_who... (200 links found)
Saved 20 files for the_who
Scraping rush... (171 links found)
Saved 20 files for rush

Scraping complete: 40 files written.
Total runtime: 7.0 minutes.


In [10]:
# Total Run Time
# Wall-clock time since `start` was set at the top of the scraping cell,
# converted from seconds to hours.
elapsed_seconds = time.time() - start
elapsed_hours = round(elapsed_seconds / 3600, 2)
print(f"Total run time was {elapsed_hours} hours.")

Total run time was 0.12 hours.


# Evaluation
This section summarizes how many files were created per artist 
and counts the total and unique words in their lyrics.


In [11]:
# Evaluation helper
def words(text):
    """Split *text* into lowercase word tokens.

    A token is a maximal run of ``\\w`` characters (letters, digits and
    underscore); everything else acts as a separator, so "It's" yields
    "it" and "s".

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase tokens in order of appearance.
    """
    lowered = text.lower()
    return [match.group() for match in re.finditer(r'\w+', lowered)]

In [12]:
# Check Lyrics
# For every artist folder under "lyrics/", report how many lyric files were
# saved and how many total / unique word tokens they contain.
LYRIC_FILE_SUFFIXES = (".txt", ".csv", ".tsv")

# Sorted so the report order does not depend on the arbitrary order in which
# os.listdir returns entries.
artist_folders = sorted(
    f for f in os.listdir("lyrics/") if os.path.isdir(os.path.join("lyrics", f))
)

for artist in artist_folders:
    artist_dir = os.path.join("lyrics", artist)
    # Match on the file extension; a substring test such as 'txt' in name
    # would also accept any file that merely contains those letters.
    artist_files = sorted(
        f for f in os.listdir(artist_dir) if f.endswith(LYRIC_FILE_SUFFIXES)
    )

    print(f"For {artist} we have {len(artist_files)} files.")

    artist_words = []

    for f_name in artist_files:
        # The scraping cell writes these files as UTF-8, so read them back
        # the same way instead of relying on the platform default encoding.
        with open(os.path.join(artist_dir, f_name), encoding="utf-8") as infile:
            artist_words.extend(words(infile.read()))

    print(f"For {artist} we have roughly {len(artist_words)} words, {len(set(artist_words))} are unique.")

For rush we have 20 files.
For rush we have roughly 5164 words, 1112 are unique.
For the_who we have 20 files.
For the_who we have roughly 4995 words, 914 are unique.


## Conclusion
Both artists had sufficient songs, the scraper saved 20 lyrics files per artist, 
and the evaluation confirmed the output contained thousands of words 
with high vocabulary diversity.
