# Project 4: Web scraping and text classification

The notebook explains how to create a ML classification model to predict the author of the lyrics scraped from the website https://www.lyrics.com. The notebook was developed as as study project for the Spiced Academy Data Science Bootcamp.

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import os
import csv
import pandas as pd
import glob
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

## Web scraping author 1 (Bob Dylan)

In [17]:
#scraping Bob Dylan's songs
url_dylan = 'https://www.lyrics.com/artist/Bob-Dylan/4147'

In [18]:
response_dylan = requests.get(url_dylan)

In [19]:
#checking the status of the request
response_dylan.status_code

200

In [20]:
html_dylan = response_dylan.text

In [21]:
parsed_html_dylan = BeautifulSoup(html_dylan)

In [22]:
links_dylan = parsed_html_dylan.find_all('a')


In [23]:
# function to format the artist's name as included in the lyrics' links
# the name is inputed as a string and the white spaces are used to split the name

def format_artist_name (name):
    name_parts = name.split()
    name_length = len(name_parts)
    author_name = []
    for x in range (0, name_length):
        author_name.append(name_parts[x])
    author_str =""
    result= "+".join(author_name, )
    return result

In [24]:
dylan = format_artist_name("Bob Dylan")

In [25]:
# function to extract the urls for the lyric webpages

def get_lyric_url(links, artist):
    artist_name = format_artist_name(artist)
    
    lyric_url_list = []
    for link in links:
        try:
            if artist_name in link.get('href'):
                part_url = link.get('href')
                lyric_url_list.append("https://www.lyrics.com/track/" + part_url[7:])
        except:
            continue
    
    return lyric_url_list
    
       

lyric_urls_dylan = get_lyric_url(links_dylan, "Bob Dylan")


In [27]:
def get_lyrics(lyric_urls, artist):
    artist_name = format_artist_name(artist)
    current_wd = os.getcwd()
    new_d_name = artist_name.lower().replace("+", "_")
    artist_d = os.mkdir(current_wd+'/'+new_d_name)
    
    metadata = [["song_num", "song_title"]]
    for url in lyric_urls:
        r = requests.get(url)
        html_songpage = r.text
        parsed_html_songpage = BeautifulSoup(html_songpage)
        try:
            song_title = parsed_html_songpage.find_all("h1", attrs={'class': 'lyric-title'})[0].text
            song_text = parsed_html_songpage.find_all("pre")[0].text #, attrs={'class': 'lyric-body wselect-cnt'})
            filename = re.search("\d+",url).group()
            metadata_song = [filename,song_title]
            metadata.append(metadata_song)
            path = current_wd + '/'+ new_d_name + '/' + filename+".txt"
        
            with open(path, 'w') as file:
                file.write(song_text)
        except:
            continue
    
    path_metadata = current_wd + '/'+ new_d_name + '/' + "metadata" + ".csv"
    with open(path_metadata, 'w') as csv_file:
        csv_file_writer = csv.writer(csv_file)
        csv_file_writer.writerows(metadata)
        
        
        
        
          


In [None]:
#collecting Dylan's song lyrics
dylan_lyrics = get_lyrics(lyric_urls_dylan, "Bob Dylan")

## Web scraping author 2 (Bruce Springsteen)

In [319]:
#scraping Bruce Springsteen's songs
url_sprin = 'https://www.lyrics.com/artist/Bruce-Springsteen/5505'
response_sprin = requests.get(url_sprin)
html_sprin = response_sprin.text
parsed_html_sprin = BeautifulSoup(html_sprin)
links_sprin = parsed_html_sprin.find_all('a')
holiday = format_artist_name("Bruce Springsteen")
lyric_urls_sprin = get_lyric_url(links_sprin, "Bruce Springsteen")
sprin_lyrics = get_lyrics(lyric_urls_sprin, "Bruce Springsteen")

NameError: name 'format_artist_name' is not defined

## Reading the scraped data

In [3]:
# Reading the Bob Dylan's data in a df
metadata_path_dylan = os.getcwd() + '/'+ "bob_dylan" + '/' + "metadata.csv"

#Generating a dataframe for the songs metadata
df_metadata_dylan = pd.read_csv(metadata_path_dylan)
df_metadata_dylan['author']= 'Bob Dylan'

#def read_metadata_author(author):
    

In [4]:
print(df_metadata_dylan.dtypes)

song_num       int64
song_title    object
author        object
dtype: object


In [5]:
df_metadata_dylan.head(3)

Unnamed: 0,song_num,song_title,author
0,36145301,Buckets of Rain,Bob Dylan
1,36145307,Idiot Wind,Bob Dylan
2,36145303,"If You See Her, Say Hello",Bob Dylan


In [6]:
txt_files = glob.glob("bob_dylan/*.txt")
current_wd = os.getcwd()

lyrics_dict_dylan ={ }
for file in txt_files:
    filename = re.search(r"\d+", file).group()
    txt_filename = filename + ".txt"
    with open (os.path.join(current_wd,"bob_dylan", txt_filename), 'r+') as f:
        lyrics = f.read()
        lyrics_dict_dylan.update({filename:lyrics})
        
df_lyrics_dylan = pd.DataFrame.from_dict(lyrics_dict_dylan, orient='index').reset_index()
df_lyrics_dylan.columns =["song_num", "song_lyrics"]
df_lyrics_dylan['song_num']= df_lyrics_dylan['song_num'].astype(np.int64)
df_lyrics_dylan.head(3)
print(df_lyrics_dylan.dtypes)



#def read_file_content(txt_folder):
    
    

song_num        int64
song_lyrics    object
dtype: object


In [7]:
df_dylan = df_metadata_dylan.merge(df_lyrics_dylan)

df_dylan.head(3)
print(df_dylan.shape)

(6546, 4)


In [8]:
# Reading the Bruce Springsteen's data in a df
metadata_path_sprin = os.getcwd() + '/'+ "bruce_springsteen" + '/' + "metadata.csv"
df_metadata_sprin = pd.read_csv(metadata_path_sprin)
df_metadata_sprin['author']= 'Bruce Springsteen'
txt_files = glob.glob("bruce_springsteen/*.txt")
current_wd = os.getcwd()

lyrics_dict_sprin ={ }
for file in txt_files:
    filename = re.search(r"\d+", file).group()
    txt_filename = filename + ".txt"
    with open (os.path.join(current_wd,"bruce_springsteen", txt_filename), 'r+') as f:
        lyrics = f.read()
        lyrics_dict_sprin.update({filename:lyrics})
        
df_lyrics_sprin = pd.DataFrame.from_dict(lyrics_dict_sprin, orient='index').reset_index()
df_lyrics_sprin.columns =["song_num", "song_lyrics"]
df_lyrics_sprin['song_num']= df_lyrics_sprin['song_num'].astype(np.int64)
df_lyrics_sprin.head(3)

print(df_lyrics_sprin.dtypes)
df_sprin = df_metadata_sprin.merge(df_lyrics_sprin)
print(df_sprin.shape)
df_sprin.head(3
              )


song_num        int64
song_lyrics    object
dtype: object
(2706, 4)


Unnamed: 0,song_num,song_title,author,song_lyrics
0,36456915,Dancing in the Dark,Bruce Springsteen,I get up in the evenin'\nAnd I ain't got nothi...
1,36456910,Badlands,Bruce Springsteen,Lights out tonight\nTrouble in the heartland\n...
2,36456909,Cover Me,Bruce Springsteen,"The times are tough now, just getting tougher\..."


In [9]:
# Joining the two authors dataframes
df = pd.concat([df_dylan, df_sprin])

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(["author"], axis=1), df["author"],
                                                    test_size=0.2, random_state=42)

In [11]:
X_train.head(3)

Unnamed: 0,song_num,song_title,song_lyrics
4756,22184408,Jokerman,Standing on the waters casting your bread\nWhi...
2302,21226112,The Times They Are a-Changin',Come gather 'round people\nWherever you roam\n...
5278,772620,Emotionally Yours,"Come baby, find me, come baby, remind me of wh..."


In [12]:
X_train['cleaned_lyric'] = X_train['song_lyrics'].str.replace("\d+|\n|[.,:;!?]|\s", " ")

In [13]:
text_list = X_train['cleaned_lyric'].tolist()

In [14]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/giudittaparolini/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/giudittaparolini/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
nltk_stopwords = stopwords.words("english")

In [17]:
my_stopwords = nltk_stopwords+['aa']

In [None]:
tokens_list = [word_tokenize(text) for text in text_list]

In [None]:
cleaned_tokens = []
for text in tokens_list:
    cleaned_tokens_subl = []
    for token in text:
        if (token not in my_stopwords) and (len(token)>3):
            cleaned_tokens_subl.append(token)
    cleaned_tokens.append(cleaned_tokens_subl)
        

In [None]:
cleaned_tokens_str_format = []
for element in cleaned_tokens:
    my_str = " ".join(map(str, element))
    cleaned_tokens_str_format.append(my_str)



In [None]:
X_train['cleaned_lyric'] = X_train['cleaned_lyric'].astype("string")

In [None]:
X_train.dtypes

In [242]:
y_train.head(3)

4756    Bob Dylan
2302    Bob Dylan
5278    Bob Dylan
Name: author, dtype: object

In [None]:
list_cleaned_lyrics = X_train['cleaned_lyric'].tolist()

In [None]:
labels = y_train.tolist()

In [None]:
vect = TfidfVectorizer()
X =  vect.fit_transform(list_cleaned_lyrics)

In [None]:
X.shape

In [275]:
pd.DataFrame(X.todense(), index=labels)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10719,10720,10721,10722,10723,10724,10725,10726,10727,10728
Bob Dylan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bob Dylan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bob Dylan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bob Dylan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bob Dylan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Bob Dylan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bob Dylan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bob Dylan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bob Dylan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Fitting a logistic regression model

In [324]:
model = LogisticRegression(class_weight='balance').fit(X, y_train)

In [325]:
model.score(X, y_train)

0.9858127280097284

In [326]:
X_test['cleaned_lyric'] = X_test['song_lyrics'].str.replace("\d+|\n|[.,:;!?]|\s", " ")

In [327]:
text_list_test = X_test['cleaned_lyric'].tolist()

In [328]:
tokens_list_test = [word_tokenize(text) for text in text_list_test]

In [329]:
cleaned_tokens_test = []
for text in tokens_list_test:
    cleaned_tokens_subl = []
    for token in text:
        if (token not in my_stopwords) and (len(token)>3):
            cleaned_tokens_subl.append(token)
    cleaned_tokens_test.append(cleaned_tokens_subl)

In [330]:
cleaned_tokens_str_format_test = []
for element in cleaned_tokens_test:
    my_str = " ".join(map(str, element))
    cleaned_tokens_str_format_test.append(my_str)

In [331]:
X_test['cleaned_lyric'] = pd.Series(data=cleaned_tokens_str_format_test, index = X_test.index)

In [332]:
list_cleaned_lyrics_test = X_test['cleaned_lyric'].tolist()

In [333]:
X_test_transformed = vect.transform(list_cleaned_lyrics_test)

In [334]:
X_test_transformed.shape

(1851, 10729)

In [335]:
ypred = model.predict(X_test_transformed)    

In [336]:
ypred.shape

(1851,)

In [337]:
model.score(X_test_transformed,y_test)

0.968665586169638

In [338]:
probs = model.predict_proba(X_test_transformed)  



In [339]:
df_probs = pd.DataFrame(probs)

In [340]:
type(ypred)

numpy.ndarray

In [341]:
ser_ypred = pd.Series(ypred) 

In [342]:
pred_df = df_probs.merge(ser_ypred.rename('ypred'), left_index=True, right_index=True)

In [343]:
pred_df

Unnamed: 0,0,1,ypred
0,0.981977,0.018023,Bob Dylan
1,0.926587,0.073413,Bob Dylan
2,0.244434,0.755566,Bruce Springsteen
3,0.052964,0.947036,Bruce Springsteen
4,0.943990,0.056010,Bob Dylan
...,...,...,...
1846,0.960416,0.039584,Bob Dylan
1847,0.952168,0.047832,Bob Dylan
1848,0.976054,0.023946,Bob Dylan
1849,0.327900,0.672100,Bruce Springsteen


In [355]:
y_test.index = pred_df.index

In [356]:
pred_df['y_true'] = y_test

In [357]:
pred_df

Unnamed: 0,0,1,ypred,y_true
0,0.981977,0.018023,Bob Dylan,Bob Dylan
1,0.926587,0.073413,Bob Dylan,Bob Dylan
2,0.244434,0.755566,Bruce Springsteen,Bruce Springsteen
3,0.052964,0.947036,Bruce Springsteen,Bruce Springsteen
4,0.943990,0.056010,Bob Dylan,Bob Dylan
...,...,...,...,...
1846,0.960416,0.039584,Bob Dylan,Bob Dylan
1847,0.952168,0.047832,Bob Dylan,Bob Dylan
1848,0.976054,0.023946,Bob Dylan,Bob Dylan
1849,0.327900,0.672100,Bruce Springsteen,Bruce Springsteen


In [358]:
pred_df.index

RangeIndex(start=0, stop=1851, step=1)

In [359]:
X_test.index

Int64Index([ 586, 4890, 2559, 1630, 3064, 1145, 6145,  289,  620, 6066,
            ...
             297, 1578, 5356,  445, 2377, 3487, 6156, 5255, 2217, 2650],
           dtype='int64', length=1851)

In [360]:
pred_check = pd.concat([X_test.reset_index(drop=True),pred_df.reset_index(drop=True)], axis=1)


In [361]:
pred_check

Unnamed: 0,song_num,song_title,song_lyrics,cleaned_lyric,0,1,ypred,y_true
0,32346969,Like a Rolling Stone [Take 5] [remake] [Rehear...,Once upon a time you dressed so fine\nThrew th...,Once upon time dressed fine Threw bums dime pr...,0.981977,0.018023,Bob Dylan,Bob Dylan
1,1906865,You Ain't Going Nowhere,Clouds so swift\nRain won’t lift\nGate won’t c...,Clouds swift Rain lift Gate close Railings fro...,0.926587,0.073413,Bob Dylan,Bob Dylan
2,14251760,Adam Raised a Cain,In the summer that I was baptized\nMy father h...,summer baptized father held side water said cr...,0.244434,0.755566,Bruce Springsteen,Bruce Springsteen
3,8308531,Darkness on the Edge of Town,Well they're still racing out at the trestles\...,Well still racing trestles blood never burned ...,0.052964,0.947036,Bruce Springsteen,Bruce Springsteen
4,5440027,I Shall Be Released,They say every man must need protection\nThey ...,They every must need protection They every mus...,0.943990,0.056010,Bob Dylan,Bob Dylan
...,...,...,...,...,...,...,...,...
1846,8304717,Can't Wait,I can't wait\nWait for you to change your mind...,wait Wait change mind late tryin walk line Wel...,0.960416,0.039584,Bob Dylan,Bob Dylan
1847,5059002,Just Like Tom Thumb's Blues,When you're lost in the rain in Juarez when it...,When lost rain Juarez Easter time gravity fail...,0.952168,0.047832,Bob Dylan,Bob Dylan
1848,2598857,Up to Me [#],Everything went from bad to worse\nMoney never...,Everything went worse Money never changed thin...,0.976054,0.023946,Bob Dylan,Bob Dylan
1849,217862,Stand on It,Well Jimmy Lee was hookin' 'round the far turn...,Well Jimmy hookin 'round turn funky southern F...,0.327900,0.672100,Bruce Springsteen,Bruce Springsteen


In [362]:
compare = pred_check[pred_check['ypred'] != pred_check['y_true'] ]

In [363]:
compare

Unnamed: 0,song_num,song_title,song_lyrics,cleaned_lyric,0,1,ypred,y_true
213,33804694,War,Oh no-there's got to be a better way \nSay it ...,no-there better There better way-yeah What goo...,0.715371,0.284629,Bob Dylan,Bruce Springsteen
247,17551298,When Will I Be Loved,I've been cheated\nBeen mistreated\nWhen will ...,cheated Been mistreated When loved pushed 'rou...,0.908514,0.091486,Bob Dylan,Bruce Springsteen
248,37553324,Rainmaker,Parched crops dying 'neath a dead sun\nWe've b...,Parched crops dying 'neath dead praying good c...,0.515949,0.484051,Bob Dylan,Bruce Springsteen
250,31902932,Empty Sky,I woke up this morning\nI could barely breathe...,woke morning could barely breathe Just empty i...,0.533369,0.466631,Bob Dylan,Bruce Springsteen
269,32811297,War,Oh no-there's got to be a better way \nSay it ...,no-there better There better way-yeah What goo...,0.715371,0.284629,Bob Dylan,Bruce Springsteen
271,17776806,Angel Eyes,Hey drink up all you people\nOrder anything yo...,drink people Order anything happy people laugh...,0.736302,0.263698,Bob Dylan,Bruce Springsteen
282,25412961,Easy Money,"You put on your coat, I’ll put on my hat\nYou ...",coat dress tonight honey going town looking ea...,0.505175,0.494825,Bob Dylan,Bruce Springsteen
308,18757743,The Ghost of Tom Joad,Men walking 'long the railroad tracks\nGoing s...,walking 'long railroad tracks Going someplace ...,0.060859,0.939141,Bruce Springsteen,Bob Dylan
310,32587718,Chicken Shack Boogie,"Hello everybody this cat is back,\nLooking for...",Hello everybody back Looking place called Chic...,0.565816,0.434184,Bob Dylan,Bruce Springsteen
321,32779930,Held Up Without a Gun,My life flows on in endless song\nAbove earth'...,life flows endless song Above earth lamentatio...,0.818176,0.181824,Bob Dylan,Bruce Springsteen


In [364]:
compare.shape

(58, 8)

In [365]:
pred_check.columns

Index([     'song_num',    'song_title',   'song_lyrics', 'cleaned_lyric',
                     0,               1,         'ypred',        'y_true'],
      dtype='object')

In [366]:
pred_check

Unnamed: 0,song_num,song_title,song_lyrics,cleaned_lyric,0,1,ypred,y_true
0,32346969,Like a Rolling Stone [Take 5] [remake] [Rehear...,Once upon a time you dressed so fine\nThrew th...,Once upon time dressed fine Threw bums dime pr...,0.981977,0.018023,Bob Dylan,Bob Dylan
1,1906865,You Ain't Going Nowhere,Clouds so swift\nRain won’t lift\nGate won’t c...,Clouds swift Rain lift Gate close Railings fro...,0.926587,0.073413,Bob Dylan,Bob Dylan
2,14251760,Adam Raised a Cain,In the summer that I was baptized\nMy father h...,summer baptized father held side water said cr...,0.244434,0.755566,Bruce Springsteen,Bruce Springsteen
3,8308531,Darkness on the Edge of Town,Well they're still racing out at the trestles\...,Well still racing trestles blood never burned ...,0.052964,0.947036,Bruce Springsteen,Bruce Springsteen
4,5440027,I Shall Be Released,They say every man must need protection\nThey ...,They every must need protection They every mus...,0.943990,0.056010,Bob Dylan,Bob Dylan
...,...,...,...,...,...,...,...,...
1846,8304717,Can't Wait,I can't wait\nWait for you to change your mind...,wait Wait change mind late tryin walk line Wel...,0.960416,0.039584,Bob Dylan,Bob Dylan
1847,5059002,Just Like Tom Thumb's Blues,When you're lost in the rain in Juarez when it...,When lost rain Juarez Easter time gravity fail...,0.952168,0.047832,Bob Dylan,Bob Dylan
1848,2598857,Up to Me [#],Everything went from bad to worse\nMoney never...,Everything went worse Money never changed thin...,0.976054,0.023946,Bob Dylan,Bob Dylan
1849,217862,Stand on It,Well Jimmy Lee was hookin' 'round the far turn...,Well Jimmy hookin 'round turn funky southern F...,0.327900,0.672100,Bruce Springsteen,Bruce Springsteen


In [None]:
user = input()

In [None]:
user_t = vect.transform( [user] ) 

In [None]:
model.predict(user_t)

## Naive Bayes Model

In [2]:
mnb = MultinomialNB().fit(X, y_train)

NameError: name 'X' is not defined