In [1]:
# Library/module imports
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from time import sleep
import pandas as pd

In [2]:
# Request headers that make the scraper look like a browser issuing an XHR
# call. Background (an HTTP 403 question):
# https://stackoverflow.com/questions/43590153/http-error-403-forbidden-when-reading-html/43590290#43590290
HEADER = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/50.0.2661.75 Safari/537.36"
    ),
    "X-Requested-With": "XMLHttpRequest",
}

def acharts(year, week, url_base, timeout=30):
    """Download one weekly chart page and return its scraped rows.

    Parameters
    ----------
    year, week : str or int
        Chart year and week number; they become the last two path segments
        of the request URL (``<url_base><year>/<week>``).
    url_base : str
        Chart URL prefix, including the trailing slash.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` may block indefinitely.

    Returns
    -------
    list of tuple
        Whatever ``song_scrape(year, week, soup)`` returns for the page.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    # The f-string formats year/week itself, so ints work as well as strings.
    url = f"{url_base}{year}/{week}"
    read_pg = requests.get(url, headers=HEADER, timeout=timeout)
    # Surface HTTP failures here instead of as an opaque parsing error when
    # song_scrape looks for a table that an error page does not contain.
    read_pg.raise_for_status()

    soup = BeautifulSoup(read_pg.text, "html.parser")

    return song_scrape(year, week, soup)

In [3]:
import unidecode

# Keep only the alphabetic characters of a string.
def removeSpecialCharacter(s):
    """Return ``s`` with every character for which ``isalpha()`` is false removed."""
    return "".join(ch for ch in s if ch.isalpha())

def strip_string(s):
    """Normalise ``s`` for use in a song index.

    The text is lower-cased, passed through ``unidecode.unidecode`` to remove
    accents, and finally reduced to its alphabetic characters by
    ``removeSpecialCharacter``.
    """
    return removeSpecialCharacter(unidecode.unidecode(s.lower()))

def generate_songindex(artists, song):
    """Build the ``<artist>/<song>`` key used to look a song up.

    Parameters
    ----------
    artists : str
        One artist name, or several names joined by ``" and "`` (the format
        produced by ``song_scrape``). Only the first name is used.
    song : str
        Song title.

    Returns
    -------
    str
        ``strip_string(first artist) + "/" + strip_string(song)``, e.g.
        ``"Tones and I", "Dance Monkey"`` -> ``"tones/dancemonkey"``.
    """
    # Split on the " and " separator (with its spaces), not on the bare
    # substring 'and': the latter also matches inside names, turning
    # "Ariana Grande" into "Ariana Gr".
    # Limitation: a single act whose own name contains " and " is still cut
    # at that point, as it was before.
    first_artist = artists.split(' and ', 1)[0]

    return strip_string(first_artist) + "/" + strip_string(song)

In [4]:
def song_scrape(year, week, soup):
    """Turn a parsed chart page into a list of per-song tuples.

    Reads every ``<tr>`` in the ``<tbody>`` of the first ``<table>`` in
    ``soup`` and returns one tuple per row:
    ``(year, week, rank, previous rank, title, artist, songindex)``.
    ``year`` and ``week`` are passed through unchanged. Multiple artists are
    joined with ``" and "``, and the songindex comes from
    ``generate_songindex(artist, title)``.
    """
    chart_table = soup.findAll("table")[0]
    rows = []

    for row in chart_table.find("tbody").findAll("tr"):
        position = row.find('span', {"itemprop": "position"}).text

        # The previous-position cell looks like "\n\n      (new)\n      ":
        # drop newlines and spaces, then the surrounding parentheses.
        previous = row.find('span', {'class':'Sub subStatsPrev'}).text
        for filler in ("\n", " "):
            previous = previous.replace(filler, "")
        previous = previous.strip('(').strip(')')

        song_title = row.find('span', {'itemprop':'name'}).text

        # One name per credited artist, in page order.
        credited = [
            credit.find('span', {'itemprop':'name'}).text
            for credit in row.findAll('span', {'itemprop':'byArtist'})
        ]
        artist = " and ".join(credited)

        index_key = generate_songindex(artist, song_title)
        rows.append((year, week, position, previous, song_title, artist, index_key))
    return rows

In [5]:
## Chart to fetch: the year and week become path segments of the chart URL.
url_base = "https://acharts.co/canada_singles_top_100/"
year, week = "2020", "30"
top100 = acharts(year, week, url_base)

## part 2: lyrics!
Using: https://www.azlyrics.com/

Format the song index as `primaryartist/title`, with spaces, brackets and accents removed.

NOTE, an alternative could be https://github.com/jasonqng/genius-lyrics-search

In [9]:
def azlyrics(songindex, timeout=30):
    """Fetch and parse the azlyrics page for one song.

    Parameters
    ----------
    songindex : str
        ``<artist>/<song>`` key; it is inserted into
        ``https://www.azlyrics.com/lyrics/<songindex>.html``.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` may block indefinitely, which would stall a loop that
        calls this for many songs.

    Returns
    -------
    tuple
        ``(lyric_url, soup)``: the URL requested and the response body
        parsed with ``html.parser``.
    """
    lyric_url = f"https://www.azlyrics.com/lyrics/{songindex}.html"

    # The HTTP status is deliberately not checked: the body is parsed
    # whatever the status, and the caller decides what a page without the
    # expected markup means.
    read_pg = requests.get(lyric_url, headers=HEADER, timeout=timeout)
    soup = BeautifulSoup(read_pg.text, "html.parser")

    return lyric_url, soup

def lyric_scrape(soup):
    """Extract the lyrics text from a parsed lyrics page.

    Parameters
    ----------
    soup
        Parsed page; must support ``find(name, attrs)``.

    Returns
    -------
    str
        The text of the 6th ``<div>`` inside the
        ``col-xs-12 col-lg-8 text-center`` container, or the sentinel
        ``'!1'`` when that container or that div is missing.
    """
    container = soup.find('div', {'class': 'col-xs-12 col-lg-8 text-center'})
    if container is None:
        return '!1'

    inner_divs = container.findAll('div')
    # The lyrics are expected in the 6th nested div. If the page has fewer,
    # report "not found" with the same sentinel rather than raising
    # IndexError and aborting a whole scraping loop.
    if len(inner_divs) < 6:
        return '!1'

    return inner_divs[5].text

In [8]:
# top100[0][-1] is the songindex of the first scraped chart row (the last
# field of each tuple built by song_scrape). Fetch that song's lyrics page
# and print whatever lyric_scrape extracts from it.
url, soup = azlyrics(top100[0][-1])
lyrics = lyric_scrape(soup)
print(lyrics)



I don't want a lot for Christmas
There is just one thing I need
I don't care about the presents
Underneath the Christmas tree

I just want you for my own
More than you could ever know
Make my wish come true
All I want for Christmas is you, yeah

I don't want a lot for Christmas
There is just one thing I need
And I don't care about the presents
Underneath the Christmas tree

I don't need to hang my stocking
There upon the fireplace
Santa Claus won't make me happy
With a toy on Christmas Day

I just want you for my own
More than you could ever know
Make my wish come true
All I want for Christmas is you
You, baby

Oh, I won't ask for much this Christmas
I won't even wish for snow
And I'm just gonna keep on waiting
Underneath the mistletoe

I won't make a list and send it
To the North Pole for Saint Nick
I won't even stay awake to
Hear those magic reindeer click

'Cause I just want you here tonight
Holding on to me so tight
What more can I do?
Baby, all I want for Christmas is you
You, b

In [38]:
## Aside, save 100 songs for sentiment analysis comparison:
import time
from random import random

# Column names for the (songindex, lyrics) pairs collected below.
columns = ['song_index', 'lyrics']

# NOTE: this rebinds `lyrics` (a single song's text in the earlier cell) to a
# list of (songindex, lyrics) tuples.
lyrics = []

for song in top100:
    # Wait a random 5-10 seconds before each request -- presumably to avoid
    # hammering the lyrics site; this makes the cell slow to run.
    time.sleep(5 * (1 + random()))
    # song[-1] is the songindex, the last field of each chart tuple.
    _, soup = azlyrics(song[-1])
    lyrics.append((song[-1], lyric_scrape(soup)))

In [39]:
lyrics_df = pd.DataFrame(lyrics, columns=columns)

In [40]:
import sqlite3

# Open the comparison database and make sure the target table exists.
con = sqlite3.connect("test_data/sentiment_compare.db")
cur = con.cursor()

cur.execute("""CREATE TABLE IF NOT EXISTS sentiment_compare (
            song_index text,
            lyrics text
);""")
con.commit()

# Write the scraped lyrics with if_exists="replace"; the value displayed
# under the cell is to_sql's return value.
lyrics_df.to_sql("sentiment_compare", con, if_exists="replace", index=False)

100

In [41]:
len(list(cur.execute(f"select * from sentiment_compare")))

100

### Part 3:
Sentiment analysis

### Part 4:
Loop through all songs for lyrics.

In [None]:
sentiment

## Part 4:
Create a datastore for the information via SQLite

In [9]:
columns = ['year', 'week', 'position', 'prev_position', 'song', 'artist', 'song_index']
chart_df = pd.DataFrame(top100, columns=columns)

In [20]:
chart_df.head(2)

Unnamed: 0,year,week,position,prev_position,song,artist,song_index
0,2020,1,1,2,All I Want For Christmas Is You,Mariah Carey,mariahcarey/alliwantforchristmasisyou
1,2020,1,2,1,Dance Monkey,Tones and I,tones/dancemonkey


In [18]:
## Clean up data (get types correct, and prev_position "new" etc.)
# Rows whose previous position is the text "new" or "re-entry" have no
# numeric value; store 0 for them so the column can be cast to an integer.
no_previous = chart_df['prev_position'].isin(['new', 're-entry'])
chart_df.loc[no_previous, 'prev_position'] = 0
chart_df = chart_df.astype({'position': 'int32', 'prev_position': 'int32'})

In [41]:
# Name region:
region = 'canada'

In [40]:
## Next up, need to put that in SQLite db
import sqlite3

con = sqlite3.connect("test.db")
cur = con.cursor()

# The permanent chart table and its temp_ staging twin have the same column
# list, so both are created (if missing) from one statement; each CREATE is
# committed straight away.
for table_name in (f"{region}_chart", f"temp_{region}_chart"):
    cur.execute(f"""CREATE TABLE IF NOT EXISTS {table_name} (
            year text,
            week text,
            position integer,
            prev_position integer,
            song text,
            artist text,
            song_index text
);""")
    con.commit()

In [44]:
# Stage the scraped chart in a temporary table, then merge it into the
# permanent {region}_chart table.
chart_df.to_sql(f"temp_{region}_chart", con, if_exists="replace", index=False)

# Re-running this cell used to append the same chart a second time, leaving
# duplicate rows. Remove any rows already stored for the staged
# (year, week) first, so the merge can be repeated safely.
cur.execute(f"""
    DELETE FROM {region}_chart
    WHERE EXISTS (
        SELECT 1 FROM temp_{region}_chart AS staged
        WHERE staged.year = {region}_chart.year
          AND staged.week = {region}_chart.week
    )
""")

# Name the columns explicitly so the copy does not depend on both tables
# happening to share the same column order.
cur.execute(f"""
    INSERT INTO {region}_chart
        (year, week, position, prev_position, song, artist, song_index)
    SELECT year, week, position, prev_position, song, artist, song_index
    FROM temp_{region}_chart
""")
cur.execute(f"DROP TABLE temp_{region}_chart")
con.commit()

In [47]:
len(list(cur.execute(f"select * from {region}_chart order by position desc")))

200

In [None]:
## Pull Lyrics and create new db for them


## Part 4:
Loop through content!

### Appendix 3

In [134]:
## following along here: https://docs.python.org/3/library/sqlite3.html
import sqlite3
con = sqlite3.connect("tutorial.db")

names = ['John', 'Mike', 'Jane', 'Bella']
grades = [90, 95, 92, 98]

In [135]:
# Create a cursor
# https://towardsdatascience.com/from-sqlite-to-pandas-7-essential-operations-you-need-to-know-c7a5dd71f232
cur = con.cursor()

# Start from a clean slate: the database lives in a file, so a plain
# CREATE TABLE fails with "table transcript already exists" the second time
# this cell runs.
cur.execute("DROP TABLE IF EXISTS transcript;")

# Create the table named transcript
cur.execute("CREATE TABLE transcript (name text, grade integer);")

# Insert the records (one row per name/grade pair)
cur.executemany("INSERT into transcript values (?, ?)", zip(names, grades))

# Commit all the transactions
con.commit()

In [136]:
cur.execute("select * from transcript order by grade desc")
list(cur)

[('Bella', 98), ('Mike', 95), ('Jane', 92), ('John', 90)]

In [137]:
_ = cur.execute("select * from transcript order by grade desc")
cur.fetchall()

[('Bella', 98), ('Mike', 95), ('Jane', 92), ('John', 90)]

In [138]:
# update:
cur.execute("update transcript set grade = 100 where name = 'John'")
# delete
#_ = cur.execute("delete from transcript where name='John'")

<sqlite3.Cursor at 0x7fd5143ee3c0>

In [139]:
# read to pd
import pandas as pd
df = pd.read_sql("select * from transcript", con)
df

Unnamed: 0,name,grade
0,John,100
1,Mike,95
2,Jane,92
3,Bella,98


In [140]:
## write from pd
df['gpa'] = [4.0, 3.8, 3.9, 3.3]
df.to_sql("transcript", con, if_exists="replace", index=False)
list(cur.execute("select * from transcript order by grade desc"))

[('John', 100, 4.0), ('Bella', 98, 3.3), ('Mike', 95, 3.8), ('Jane', 92, 3.9)]

In [149]:
# See the column names and types:
res = cur.execute("SELECT * FROM sqlite_master")
res.fetchall()

[('table',
  'transcript',
  'transcript',
  2,
  'CREATE TABLE "transcript" (\n"name" TEXT,\n  "grade" INTEGER,\n  "gpa" REAL\n)')]

## Appendix 2

In [12]:
first_artist = foo[2][1].split('and')[0]
first_artist = strip_string(first_artist)
song = foo[2][0]
song = strip_string(song)

lyric_url = "https://www.azlyrics.com/lyrics/" + first_artist + "/" + song + ".html"
lyric_url

'https://www.azlyrics.com/lyrics/eltonjohn/holdmecloser.html'

In [55]:
# read page:
# The request headers are defined as HEADER (upper case); the lower-case
# name `header` does not exist on a fresh kernel and raised NameError.
read_pg = requests.get(lyric_url, headers=HEADER)

In [56]:
soup = BeautifulSoup(read_pg.text, "html.parser")

In [15]:
## Dump in file for fun:
html = soup.prettify("utf-8")
with open("soupDump2.html", "wb") as file:
    file.write(html)

In [57]:
## extracting: <div class="col-xs-12 col-lg-8 text-center">
#s = soup.findAll("table")[0]
#rank = tr.find('span', {"itemprop": "position"}).text
s = soup.find('div', {'class': 'col-xs-12 col-lg-8 text-center'})


In [58]:
lyr = s.findAll('div')

In [77]:
for div in lyr[4:7]:
    print(div.text)






Hold me closer
Hold me closer
Hold me closer
Hold me closer

Saw you dancing out the ocean
Running fast along the sand
A spirit born of earth and water
Fire flying from your hands

Oh

Hold me closer, tiny dancer
Count the headlights on the highway
Lay me down in sheets of linen
You had a busy day today

Oh

Hold me closer
Hold me closer (Hold me closer)
Hold me closer
Hold me closer

There are caravans we follow
Drunken nights in dark hotels (Baby)
When chances breathe between the silence
When sex and love no longer gel

Oh

Hold me closer, tiny dancer
Count the headlights on the highway (Baby)
Lay me down in sheets of linen
You had a busy day today

Hold me closer
Hold me closer (Baby, baby)
Hold me closer
Hold me closer (Oh yeah)

Hold me closer
Hold me closer
(Hold me closer, me closer) Hold me closer
(Me closer, mm, yeah) Hold me closer







## Appendix

In [None]:
# Learning soup
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/
soup = BeautifulSoup(read_pg.text, "html.parser")

In [None]:
## Dump in file for fun:
html = soup.prettify("utf-8")
with open("soupDump.html", "wb") as file:
    file.write(html)

In [24]:
tr = s.find("tbody").findAll("tr")[2]

In [72]:
#Song position!
tr.find('span', {"itemprop": "position"}).text

'3'

In [73]:
# Previous
tr.find('span', {'class':'Sub subStatsPrev'}).text

'\n\n                        (new)\n                    '

In [74]:
# Title
tr.find('span', {'itemprop':'name'}).text

'Hold Me Closer'

In [75]:
# artist(s)
artist = ""
first = True
for artists in tr.findAll('span', {'itemprop':'byArtist'}):
    if first:
        artist += artists.find('span', {'itemprop':'name'}).text
        first = False
    else:
        artist += " and " + artists.find('span', {'itemprop':'name'}).text
artist

'Elton John and Britney Spears'