### Web Scraping 1: Get the top20 songs list of Coldplay on Spotify from a website called KWORB
Kworb is a music data analytics website that aggregates and showcases Spotify streaming charts and artist rankings. The platform offers insights into song popularity and trends across various regions globally.

In [115]:
import pandas as pd
import requests

# URL of kworb
url = "https://kworb.net/spotify/artist/4gzpq5DPGxSnKTe4SA8HAU_songs.html"
# request the content
r = requests.get(url)
# make sure the request is success
if r.status_code == 200:
    # analyze the tables on the website
    df_list = pd.read_html(r.text)
    #get the second table on the website
    df = df_list[1]
    #remove the repeated row...
    df = df.drop(13, axis=0)
    # save the table as CSV format
    df.to_csv("spotify_songs.csv", index=False)
    print(df.head(20))

                       Song Title     Streams      Daily
0      * Something Just Like This  2360429097  1024226.0
1                          Yellow  1994170156  1619221.0
2                    Viva La Vida  1817301965  1397710.0
3                   The Scientist  1730247039   809670.0
4                         Fix You  1331066416   726609.0
5             A Sky Full of Stars  1306202500   927520.0
6            Hymn for the Weekend  1269746647   634085.0
7                     My Universe  1165484041   599792.0
8                        Paradise  1162178312   653060.0
9         Adventure of a Lifetime   923446921   532144.0
10                         Clocks   793528827   576293.0
11                         Sparks   771267093  1062281.0
12                          Magic   669899679   157968.0
14                        Trouble   329969893   160220.0
15                   Higher Power   320956467   153495.0
16                    In My Place   300647715   193981.0
17               Christmas Ligh

### Web Scraping 2: Get the lyrics of top20 songs from the website Lyricsfreak

In [69]:
import requests
from bs4 import BeautifulSoup

# Create the name list of top20 songs
songs = df['Song Title'].head(20).tolist()

# The website we are gong to visit
base_url = 'https://www.lyricsfreak.com/search.php?q='

# Create request
with requests.Session() as s:
    for song in songs:
        # Construct a URL that conforms to the website's search format
        search_url = base_url + "+".join(song.split())

        # Scrape the URL of the lyrics page from the search results
        search_response = s.get(search_url)
        if search_response.ok:
            search_soup = BeautifulSoup(search_response.text, "html.parser")

            # Find the lyrics link
            search_result = search_soup.find('a', attrs={'class': 'song'})
            if search_result:
                lyrics_url = search_result.get('href')

                # request for the lyrics page
                lyrics_full_url = 'https://www.lyricsfreak.com' + lyrics_url
                lyrics_response = s.get(lyrics_full_url)
                if lyrics_response.ok:

                    # analyze lyrics pages
                    lyrics_soup = BeautifulSoup(lyrics_response.text, "html.parser")              

                    # identify the container that holds the lyrics based on the webpage structure
                    lyrics_div = lyrics_soup.find('div', id="content")
                    if lyrics_div:
                        lyrics = lyrics_div.get_text(strip=True)

                        # clean the title of song
                        safe_song_title = "".join([c for c in song if c.isalpha() or c.isdigit() or c==' ']).rstrip()

                        # save the lyrics as TXT format
                        with open(f"{safe_song_title}.txt", "w", encoding="utf-8") as file:
                            file.write(lyrics)
                    else:
                        print(f"Could not find the lyrics container for {song}.")
                else:
                    print(f"Failed to retrieve lyrics page for {song}")
            else:
                print(f"Could not find a lyrics link in search results for {song}.")
        else:
            print(f"Failed to retrieve search page for {song}")

###  Cleaning & Annotating 

#### 1. Install and initiate everything we are going to use

In [116]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [117]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [118]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [119]:
# initiate everything
import os
import re
import string
import pandas as pd
import nltk
from nltk.corpus import stopwords
import spacy

nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
spacy_nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/irontree/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/irontree/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### 2. Define the function for cleaning the texts

In [122]:
def clean_lyrics(text):
    # remove the content in []
    text = re.sub(r'\\\[.\*?\\]', '', text)
    # lowercasing
    text = text.lower()
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenization and remove stop words
    tokens = nltk.word_tokenize(text)
    # remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

#### 3. Define the function for getting POS and Lemmas

In [123]:
def get_pos_and_lemma(tokens):
    spacy_doc = spacy_nlp(' '.join(tokens))
    # get pos
    pos = [token.pos_ for token in spacy_doc]
    # get lemmas 
    lemmas = [token.lemma_ for token in spacy_doc]
    return pos, lemmas

#### 4. Read the text files and make them into structured dataset

In [125]:
file_data = []
# read the text
for filename in os.listdir('Lyrics Dataset of top20 Songs'):
    if filename.endswith('.txt'):
        with open(f'Lyrics Dataset of top20 Songs/{filename}', 'r', encoding='utf-8') as file:
            text = file.read()
            #clean the text
            cleaned_tokens = clean_lyrics(text)
            #get pos and lemmas of the text
            pos_tags, lemmas = get_pos_and_lemma(cleaned_tokens)
            #remove the extension name of file 
            file_data.append({
                'filename': filename.replace('.txt', ''),
                'original_text': text,
                'tokens': cleaned_tokens,
                'POS': pos_tags,
                'lemmas': lemmas
            })
file_data 

[{'filename': 'My Universe',
  'original_text': "I was lost in an endless spaceI couldn't find my wayI looked up and I saw your faceAnd heaven came crashing downYou are my universeIt only revolves around youYou're my Venus and MarsThe moon and the starsIn the velvet skyYou are my universeMy light and my darkness tooFor better or worse you are my curseMy universeCounting the stars in the dessert skySparkle's in your eyesMagic moments in the nightShooting comets all aroundYou are my universeIt only revolves around youYou're my Venus and MarsThe moon and the starsIn the velvet skyYou are my universeMy light and my darkness tooFor better or worseYou are my curseMy universeDance a little closer on the milky wayWrapped around eachother we drift awayWe are falling we are flyingDance a little closer on the milky wayWrapped around eachother we drift awayWe are falling we are flyingForeverYou are my universeMy light and my darkness tooFor better or worseYou are my curseMy universe",
  'tokens': 

#### 5. Make the file_data into datafram and export CSV

In [127]:
df = pd.DataFrame(file_data)

df.to_csv('clean_pos_lemmas_lyrics.csv', index=False)