# Project 4: Web scraping and text classification

The notebook explains how to create a ML classification model to predict the author of the lyrics scraped from the website https://www.lyrics.com. The notebook was developed as as study project for the Spiced Academy Data Science Bootcamp.

In [195]:
import requests
from bs4 import BeautifulSoup
import re
import os
import csv
import pandas as pd
import glob
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

## Web scraping author 1 (Bob Dylan)

In [17]:
#scraping Bob Dylan's songs
url_dylan = 'https://www.lyrics.com/artist/Bob-Dylan/4147'

In [18]:
response_dylan = requests.get(url_dylan)

In [19]:
#checking the status of the request
response_dylan.status_code

200

In [20]:
html_dylan = response_dylan.text

In [21]:
parsed_html_dylan = BeautifulSoup(html_dylan)

In [22]:
links_dylan = parsed_html_dylan.find_all('a')


In [23]:
# function to format the artist's name as included in the lyrics' links
# the name is inputed as a string and the white spaces are used to split the name

def format_artist_name (name):
    name_parts = name.split()
    name_length = len(name_parts)
    author_name = []
    for x in range (0, name_length):
        author_name.append(name_parts[x])
    author_str =""
    result= "+".join(author_name, )
    return result

In [24]:
dylan = format_artist_name("Bob Dylan")

In [25]:
# function to extract the urls for the lyric webpages

def get_lyric_url(links, artist):
    artist_name = format_artist_name(artist)
    
    lyric_url_list = []
    for link in links:
        try:
            if artist_name in link.get('href'):
                part_url = link.get('href')
                lyric_url_list.append("https://www.lyrics.com/track/" + part_url[7:])
        except:
            continue
    
    return lyric_url_list
    
       

lyric_urls_dylan = get_lyric_url(links_dylan, "Bob Dylan")


In [27]:
def get_lyrics(lyric_urls, artist):
    artist_name = format_artist_name(artist)
    current_wd = os.getcwd()
    new_d_name = artist_name.lower().replace("+", "_")
    artist_d = os.mkdir(current_wd+'/'+new_d_name)
    
    metadata = [["song_num", "song_title"]]
    for url in lyric_urls:
        r = requests.get(url)
        html_songpage = r.text
        parsed_html_songpage = BeautifulSoup(html_songpage)
        try:
            song_title = parsed_html_songpage.find_all("h1", attrs={'class': 'lyric-title'})[0].text
            song_text = parsed_html_songpage.find_all("pre")[0].text #, attrs={'class': 'lyric-body wselect-cnt'})
            filename = re.search("\d+",url).group()
            metadata_song = [filename,song_title]
            metadata.append(metadata_song)
            path = current_wd + '/'+ new_d_name + '/' + filename+".txt"
        
            with open(path, 'w') as file:
                file.write(song_text)
        except:
            continue
    
    path_metadata = current_wd + '/'+ new_d_name + '/' + "metadata" + ".csv"
    with open(path_metadata, 'w') as csv_file:
        csv_file_writer = csv.writer(csv_file)
        csv_file_writer.writerows(metadata)
        
        
        
        
          


In [None]:
#collecting Dylan's song lyrics
dylan_lyrics = get_lyrics(lyric_urls_dylan, "Bob Dylan")

In [34]:
## Web scraping author 2 (Bruce Springsteen)

In [33]:
#scraping Bruce Springsteen's songs
url_spreen = 'https://www.lyrics.com/artist/Bruce-Springsteen/5505'
response_spreen = requests.get(url_spreen)
html_spreen = response_spreen.text
parsed_html_spreen = BeautifulSoup(html_spreen)
links_spreen = parsed_html_spreen.find_all('a')
holiday = format_artist_name("Bruce Springsteen")
lyric_urls_spreen = get_lyric_url(links_spreen, "Bruce Springsteen")
spreen_lyrics = get_lyrics(lyric_urls_spreen, "Bruce Springsteen")

In [63]:
# Reading the Bob Dylan's data in a df
metadata_path_dylan = os.getcwd() + '/'+ "bob_dylan" + '/' + "metadata.csv"

#Generating a dataframe for the songs metadata
df_metadata_dylan = pd.read_csv(metadata_path_dylan)
df_metadata_dylan['author']= 'Bob Dylan'

#def read_metadata_author(author):
    

In [64]:
print(df_metadata_dylan.dtypes)

song_num       int64
song_title    object
author        object
dtype: object


In [65]:
df_metadata_dylan.head(3)

Unnamed: 0,song_num,song_title,author
0,36145301,Buckets of Rain,Bob Dylan
1,36145307,Idiot Wind,Bob Dylan
2,36145303,"If You See Her, Say Hello",Bob Dylan


In [66]:
txt_files = glob.glob("bob_dylan/*.txt")
current_wd = os.getcwd()

lyrics_dict_dylan ={ }
for file in txt_files:
    filename = re.search(r"\d+", file).group()
    txt_filename = filename + ".txt"
    with open (os.path.join(current_wd,"bob_dylan", txt_filename), 'r+') as f:
        lyrics = f.read()
        lyrics_dict_dylan.update({filename:lyrics})
        
df_lyrics_dylan = pd.DataFrame.from_dict(lyrics_dict_dylan, orient='index').reset_index()
df_lyrics_dylan.columns =["song_num", "song_lyrics"]
df_lyrics_dylan['song_num']= df_lyrics_dylan['song_num'].astype(np.int64)
df_lyrics_dylan.head(3)
print(df_lyrics_dylan.dtypes)



#def read_file_content(txt_folder):
    
    

song_num        int64
song_lyrics    object
dtype: object


In [67]:
df_dylan = df_metadata_dylan.merge(df_lyrics_dylan)

df_dylan.head(3)
print(df_dylan.shape)

(6546, 4)


In [68]:
# Reading the Bruce Springsteen's data in a df
metadata_path_spreen = os.getcwd() + '/'+ "bruce_springsteen" + '/' + "metadata.csv"
df_metadata_spreen = pd.read_csv(metadata_path_spreen)
df_metadata_spreen['author']= 'Bruce Springsteen'
txt_files = glob.glob("bruce_springsteen/*.txt")
current_wd = os.getcwd()

lyrics_dict_spreen ={ }
for file in txt_files:
    filename = re.search(r"\d+", file).group()
    txt_filename = filename + ".txt"
    with open (os.path.join(current_wd,"bruce_springsteen", txt_filename), 'r+') as f:
        lyrics = f.read()
        lyrics_dict_spreen.update({filename:lyrics})
        
df_lyrics_spreen = pd.DataFrame.from_dict(lyrics_dict_spreen, orient='index').reset_index()
df_lyrics_spreen.columns =["song_num", "song_lyrics"]
df_lyrics_spreen['song_num']= df_lyrics_spreen['song_num'].astype(np.int64)
df_lyrics_spreen.head(3)

print(df_lyrics_spreen.dtypes)
df_spreen = df_metadata_spreen.merge(df_lyrics_spreen)
print(df_spreen.shape)
df_spreen.head(3)


song_num        int64
song_lyrics    object
dtype: object
(2706, 4)


Unnamed: 0,song_num,song_title,author,song_lyrics
0,36456915,Dancing in the Dark,Bruce Springsteen,I get up in the evenin'\nAnd I ain't got nothi...
1,36456910,Badlands,Bruce Springsteen,Lights out tonight\nTrouble in the heartland\n...
2,36456909,Cover Me,Bruce Springsteen,"The times are tough now, just getting tougher\..."


In [69]:
# Joining the two authors dataframes
df = pd.concat([df_dylan, df_spreen])

In [70]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(["author"], axis=1), df["author"],
                                                    test_size=0.2, random_state=42)

In [71]:
X_train.head(3)

Unnamed: 0,song_num,song_title,song_lyrics
4756,22184408,Jokerman,Standing on the waters casting your bread\nWhi...
2302,21226112,The Times They Are a-Changin',Come gather 'round people\nWherever you roam\n...
5278,772620,Emotionally Yours,"Come baby, find me, come baby, remind me of wh..."


In [72]:
X_train['cleaned_lyric'] = X_train['song_lyrics'].str.replace("\d+|\n|[.,:;!?]|\s", " ")

In [73]:
text_list = X_train['cleaned_lyric'].tolist()

In [74]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/giudittaparolini/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [98]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/giudittaparolini/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [99]:
nltk_stopwords = stopwords.words("english")

In [100]:
my_stopwords = nltk_stopwords+['aa']

In [120]:
tokens_list = [word_tokenize(text) for text in text_list]

In [125]:
cleaned_tokens = []
for text in tokens_list:
    cleaned_tokens_subl = []
    for token in text:
        if (token not in my_stopwords) and (len(token)>3):
            cleaned_tokens_subl.append(token)
    cleaned_tokens.append(cleaned_tokens_subl)
        

In [126]:
cleaned_tokens_str_format = []
for element in cleaned_tokens:
    my_str = " ".join(map(str, element))
    cleaned_tokens_str_format.append(my_str)



In [161]:
X_train['cleaned_lyric'] = pd.Series(data=cleaned_tokens_str_format, index = X_train.index)

In [168]:
X_train['cleaned_lyric'] = X_train['cleaned_lyric'].astype("string")

In [174]:
X_train.dtypes

song_num          int64
song_title       object
song_lyrics      object
cleaned_lyric    string
dtype: object

In [175]:
y_train.head(3)

4756    Bob Dylan
2302    Bob Dylan
5278    Bob Dylan
Name: author, dtype: object

In [176]:
list_cleaned_lyrics = X_train['cleaned_lyric'].tolist()

In [177]:
list_cleaned_lyrics[0]

"Standing waters casting bread While eyes idol iron head glowing Distant ships sailing mist born snake fists hurricane blowing Freedom around corner truth good Jokerman dance nightingale tune Bird high light moon Jokerman swiftly sets rise goodbye Fools rush angels fear tread Both futures full dread show Shedding layer skin Keeping step ahead persecutor within Jokerman dance nightingale tune Bird high light moon Jokerman mountains walk clouds Manipulator crowds dream twister going Sodom Gomorrah care nobody would want marry sister Friend martyr friend woman shame look fiery furnace rich without name Jokerman dance nightingale tune Bird high light moon Jokerman Well Book Leviticus Deuteronomy jungle teachers smoke twilight milk-white steed Michelangelo indeed could carved features Resting fields turbulent space Half asleep near stars small licking face Jokerman dance nightingale tune Bird high light moon Jokerman Well rifleman stalking sick lame Preacherman seeks first uncertain Nightst

In [178]:
labels = y_train.tolist()

In [275]:
pipeline = Pipeline([('vect', CountVectorizer(lowercase=False)),
                     ('tfidf', TfidfVectorizer(lowercase=False))
                    ])

In [276]:
pipeline

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=False, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=No...
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=False, max_df=1.0, max_features=None,
   

In [277]:
X = pipeline.fit_transform(list_cleaned_lyrics)

TypeError: expected string or bytes-like object

In [208]:
X.shape

(7401, 12631)

In [202]:
pd.DataFrame(X.todense(), index=labels)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12621,12622,12623,12624,12625,12626,12627,12628,12629,12630
Bob Dylan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bob Dylan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bob Dylan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bob Dylan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bob Dylan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Bob Dylan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bob Dylan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bob Dylan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bob Dylan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [247]:
model = LogisticRegression().fit(X, y_train)


In [260]:
model.score(X, y_train)

0.9867585461424132

In [261]:
X_test['cleaned_lyric'] = X_test['song_lyrics'].str.replace("\d+|\n|[.,:;!?]|\s", " ")

In [262]:
text_list_test = X_test['cleaned_lyric'].tolist()

In [263]:
tokens_list_test = [word_tokenize(text) for text in text_list_test]

In [264]:
cleaned_tokens_test = []
for text in tokens_list_test:
    cleaned_tokens_subl = []
    for token in text:
        if (token not in my_stopwords) and (len(token)>3):
            cleaned_tokens_subl.append(token)
    cleaned_tokens_test.append(cleaned_tokens_subl)

In [265]:
cleaned_tokens_str_format_test = []
for element in cleaned_tokens_test:
    my_str = " ".join(map(str, element))
    cleaned_tokens_str_format_test.append(my_str)

In [266]:
X_test['cleaned_lyric'] = pd.Series(data=cleaned_tokens_str_format_test, index = X_test.index)

In [267]:
list_cleaned_lyrics_test = X_test['cleaned_lyric'].tolist()

In [268]:
X_test_transformed = pipeline.transform(list_cleaned_lyrics_test)

In [269]:
X_test_transformed.shape

(1851, 12631)

In [273]:
ypred = model.predict(X_test_transformed)    

In [272]:
model.score(X_test_transformed, y_test)

0.9681253376553215

In [274]:
    

probs = model.predict_proba(X_test_transformed)  
pd.DataFrame(probs)

Unnamed: 0,0,1
0,0.978954,0.021046
1,0.935657,0.064343
2,0.234201,0.765799
3,0.053787,0.946213
4,0.942964,0.057036
...,...,...
1846,0.961473,0.038527
1847,0.953887,0.046113
1848,0.970938,0.029062
1849,0.299031,0.700969
