# ADS 509 Module 1: APIs and Web Scraping

This notebook has two parts. In the first part, you will scrape lyrics from AZLyrics.com. In the second part, you'll run code that verifies the completeness of your data pull. 

For this assignment you have chosen two musical artists who have at least 20 songs with lyrics on AZLyrics.com. We start with pulling some information and analyzing them.


In [11]:
# Import necessary libraries
import os
import re
import time
import random
import shutil
import requests
from bs4 import BeautifulSoup
from collections import defaultdict, Counter


In [15]:
# Import necessary libraries
import os
import re
import time
import random
import shutil
import requests
from bs4 import BeautifulSoup
from collections import defaultdict, Counter

# Define the artists to scrape
artists = {
    'cory_asbury': "https://www.azlyrics.com/c/coryasbury.html",
    'hillsong_united': "https://www.azlyrics.com/h/hillsongunited.html"
}

# Set up a dictionary to hold our links
lyrics_pages = defaultdict(list)

# Scrape the artist pages to get links to individual song lyrics
print("Collecting song links for each artist...")
for artist, artist_page in artists.items():
    print(f"Processing {artist}...")
    
    # Request the page and sleep
    r = requests.get(artist_page)
    time.sleep(5 + 5*random.random())  # Be nice to the server
    
    # Extract the links to lyrics pages
    soup = BeautifulSoup(r.text, 'html.parser')
    album_list = soup.find('div', {'class': 'album-list'})
    
    if album_list:
        # Get all links to song lyrics
        links = album_list.find_all('a', href=True)
        for link in links:
            if '/lyrics/' in link['href']:
                # Add the full URL to our dictionary
                lyrics_pages[artist].append('https://www.azlyrics.com' + link['href'])
    
    print(f"Found {len(lyrics_pages[artist])} songs for {artist}")

# Verify we have enough songs for each artist
for artist, lp in lyrics_pages.items():
    assert(len(set(lp)) > 20)
    print(f"For {artist} we have {len(lp)} songs.")
    print(f"Estimated time: {round(len(lp)*10/60, 2)} minutes")
    
# Function to generate filenames from links
def generate_filename_from_link(link):
    if not link:
        return None
    
    # Drop the http or https and the html
    name = link.replace("https://", "").replace("http://", "")
    name = name.replace(".html", "")
    name = name.replace("www.azlyrics.com/lyrics/", "")
    
    # Replace special characters with underscore
    name = name.replace(".", "_").replace("/", "_")
    
    # Add .txt extension
    name = name + ".txt"
    
    return name

# Create the lyrics folder
print("\nCreating lyrics folder...")
if os.path.isdir("lyrics"):
    shutil.rmtree("lyrics/")
os.mkdir("lyrics")

# Scrape the lyrics
print("\nScraping lyrics...")
start = time.time()
total_pages = 0

for artist in lyrics_pages:
    # Create a subfolder for the artist
    artist_dir = os.path.join("lyrics", artist)
    os.mkdir(artist_dir)
    
    print(f"\nStarting to scrape lyrics for {artist}...")
    
    # Limit to 25 songs per artist to save time
    for page_url in lyrics_pages[artist][:25]:
        try:
            # Request the lyrics page
            response = requests.get(page_url)
            time.sleep(5 + 5*random.random())  # Be nice to the server
            
            # Extract the title and lyrics
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Find the title
            title_div = soup.find('div', {'class': 'ringtone'})
            title = "Unknown Title"
            if title_div and title_div.find_previous('b'):
                title = title_div.find_previous('b').text.strip()
            
            # Find the lyrics
            lyrics_div = soup.find('div', {'class': None, 'id': None}, 
                                  attrs={'style': None})
            if lyrics_div:
                lyrics = lyrics_div.text.strip()
                
                # Write out the title and lyrics
                filename = generate_filename_from_link(page_url)
                with open(os.path.join(artist_dir, filename), 'w', encoding='utf-8') as f:
                    f.write(title + '\n\n' + lyrics)
                
                total_pages += 1
                print(f"Saved lyrics for {artist} - {title}")
        except Exception as e:
            print(f"Error processing {page_url}: {e}")

print(f"\nTotal run time: {round((time.time() - start)/60, 2)} minutes")
print(f"Total pages scraped: {total_pages}")

# Function to extract words from text
def words(text): 
    return re.findall(r'\w+', text.lower())

# Analyze the lyrics
print("\nAnalyzing lyrics...")
artist_folders = os.listdir("lyrics/")
artist_folders = [f for f in artist_folders if os.path.isdir("lyrics/" + f)]

# Create a dictionary to store word counts for each artist
artist_word_counts = {}

for artist in artist_folders:
    artist_files = os.listdir("lyrics/" + artist)
    artist_files = [f for f in artist_files if 'txt' in f or 'csv' in f or 'tsv' in f]

    print(f"\nFor {artist} we have {len(artist_files)} files.")

    artist_words = []
    for f_name in artist_files:
        with open("lyrics/" + artist + "/" + f_name, encoding='utf-8') as infile:
            artist_words.extend(words(infile.read()))
            
    print(f"For {artist} we have roughly {len(artist_words)} words, {len(set(artist_words))} are unique.")
    
    # Store word counts for this artist
    artist_word_counts[artist] = Counter(artist_words)
    
    # Print the most common words
    print(f"Most common words for {artist}:")
    print("-" * 30)
    print(f"{'Word':<15} | {'Count':<10}")
    print("-" * 30)
    
    for word, count in [(w, c) for w, c in artist_word_counts[artist].most_common(50) if len(w) > 2][:10]:
        print(f"{word:<15} | {count:<10}")

# Try to create visualizations if matplotlib is available
try:
    import matplotlib.pyplot as plt
    
    print("\nCreating visualizations...")
    
    # Plot the most common words for each artist
    plt.figure(figsize=(15, 10))
    
    for i, (artist, word_counts) in enumerate(artist_word_counts.items()):
        # Get the 10 most common words with length > 2
        common_words = [(word, count) for word, count in word_counts.most_common(50) if len(word) > 2][:10]
        words_list, counts = zip(*common_words)
        
        plt.subplot(2, 2, i+1)
        plt.barh(words_list, counts)
        plt.title(f"Most Common Words for {artist}")
        plt.xlabel("Count")
        plt.tight_layout()
    
    plt.savefig("artist_word_comparison.png")
    plt.show()
    print("Visualization saved as 'artist_word_comparison.png'")
    
except Exception as e:
    print(f"\nCould not create visualizations: {e}")
    print("Continuing with text-based analysis only.")

# Compare word usage between artists
print("\nComparing word usage between artists:")
print("=" * 50)

# Find common words across all artists
all_words = set()
for word_counts in artist_word_counts.values():
    all_words.update(word for word, _ in word_counts.most_common(100) if len(word) > 2)

# Select some interesting words to compare
comparison_words = list(all_words)[:20]  # Just use the first 20 words

# Print a comparison table
print(f"{'Word':<15}", end="")
for artist in artist_folders:
    print(f"| {artist:<15}", end="")
print()
print("-" * (15 + 17 * len(artist_folders)))

for word in comparison_words:
    print(f"{word:<15}", end="")
    for artist in artist_folders:
        count = artist_word_counts[artist].get(word, 0)
        print(f"| {count:<15}", end="")
    print()

Collecting song links for each artist...
Processing cory_asbury...
Found 0 songs for cory_asbury
Processing hillsong_united...
Found 0 songs for hillsong_united


AssertionError: 