### Libraries:

In [1]:
import os
import re
import requests
from bs4 import BeautifulSoup
import time
import random

### Setting up the directory:

In [2]:
# Ensure this directory exists or create it before running the script
project_directory = "/Users/aryasmc/My_USD/Year_2024/Summer_2024/Applied Text Mining (ADS-509-01)"
os.chdir(project_directory)

# Create a lyrics directory if it doesn't exist
lyrics_directory = os.path.join(project_directory, 'lyrics')
if not os.path.exists(lyrics_directory):
    os.makedirs(lyrics_directory)

### Lyrics Scraping:

In [3]:
def scrape_artists_lyrics():
    base_url = "https://www.azlyrics.com"
    artists = {
        'Robyn': 'r/robyn.html',
        'Cher': 'c/cher.html'
    }

    def get_song_links(artist_url):
        time.sleep(5 + 10 * random.random())
        response = requests.get(artist_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        return [base_url + link.get('href') for link in soup.find_all('a', href=True)
                if '/lyrics/' in link.get('href') and link.get('href').startswith('/lyrics/')][:25]

    def scrape_lyrics(song_url):
        time.sleep(5 + 10 * random.random())
        response = requests.get(song_url)
        soup = BeautifulSoup(response.content, 'html.parser')
        lyrics_div = soup.find('div', class_=False, id=False)
        return lyrics_div.get_text(strip=True) if lyrics_div else ""

    for artist, extension in artists.items():
        artist_url = f"{base_url}/{extension}"
        song_links = get_song_links(artist_url)
        artist_dir = os.path.join(lyrics_directory, artist)
        os.makedirs(artist_dir, exist_ok=True)
        for link in song_links:
            lyrics = scrape_lyrics(link)
            song_name = link.split('/')[-1].replace('.html', '.txt')
            file_path = os.path.join(artist_dir, song_name)
            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(lyrics)

# Call the scraping function
scrape_artists_lyrics()


### Evaluation Code:

In [6]:
def evaluate_lyrics():
    artist_folders = [f for f in os.listdir(lyrics_directory) if os.path.isdir(os.path.join(lyrics_directory, f))]

    def words(text):
        return re.findall(r'\w+', text.lower())

    for artist in artist_folders:
        artist_path = os.path.join(lyrics_directory, artist)
        files = [f for f in os.listdir(artist_path) if f.endswith('.txt')]
        print(f"For {artist} we have {len(files)} files.")
        artist_words = []
        for file in files:
            with open(os.path.join(artist_path, file), 'r', encoding='utf-8') as f:
                artist_words.extend(words(f.read()))
        unique_words = len(set(artist_words))
        print(f"For {artist} we have roughly {len(artist_words)} words, {unique_words} are unique.")

# Call the evaluation function
evaluate_lyrics()


For Robyn we have 25 files.
For Robyn we have roughly 5381 words, 1134 are unique.
For Cher we have 25 files.
For Cher we have roughly 4071 words, 1165 are unique.
