<a href="https://colab.research.google.com/github/BStricks/music_information_retrieval/blob/master/music_webscrape.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Web scraping the album review corpus

The primary purpose of this script is to crawl the pitchfork.com website for all album reviews and download the review text into a dataframe along with the artist and album name attributes. This methodology can be extended to include other sources, e.g. Amazon reviews, Rolling Stone, etc.

The secondary purpose was to trial a document matching algorithm on the newly created corpus. Using a range of matching techniques, the aim is to match a user's natural language query with the most appropriate album.



# Section 1: web scraping

In [0]:
###mount drive
from google.colab import drive
drive.mount('/content/gdrive')

###change directory
%cd gdrive/My Drive/Colab Notebooks/album_reviews

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive
/content/gdrive/My Drive/Colab Notebooks/album_reviews


In [0]:
###libraries
import pickle
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Download the GoogleNews-vectors-negative300 archive into the working directory.
# `-c` continues a partial download, so re-running the cell after the file is
# complete transfers nothing.
# NOTE(review): nothing in the scraping cells uses this file; presumably it is
# for the document-matching part of the notebook. Confirm.
!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

--2019-09-27 06:48:26--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.86.69
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.86.69|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.



## Pitchfork scrape

In [0]:
###webpages to scrape
pagelist = ['https://pitchfork.com/reviews/albums/?page=' + str(page_number)
            for page_number in range(1, 1000)]

###scrape hyperlinks and extract artist/album tags
# Rows are gathered in a plain list and converted to a DataFrame once at the
# end. Calling DataFrame.append for every review copied the whole table each
# time (quadratic cost), and the method was removed in pandas 2.0.
pitchfork_rows = []
for page_url in pagelist:

  page = requests.get(page_url)
  review_tiles = BeautifulSoup(page.text, 'html.parser').find_all('div', attrs={"class":"review"})

  for div in review_tiles:
    link = div.find('a', attrs={"class":"review__link"})['href']
    pitchfork_rows.append({
        # urljoin yields a single '/' between host and path whether or not the
        # scraped link starts with '/', and leaves absolute links untouched.
        'href': urljoin('https://pitchfork.com/', link),
        'artist': div.find('li').text,
        'album': div.find('h2').text,
    })

###table of hyperlinks: one row per review, columns in a fixed order
master_table_pitchfork = pd.DataFrame(pitchfork_rows, columns=['href', 'artist', 'album'])

In [0]:
print(len(master_table_pitchfork))
# Checkpoint the link table to disk. The `with` block closes the file even if
# pickling raises part-way through.
with open('master_table_pitchfork','wb') as outfile:
  pickle.dump(master_table_pitchfork,outfile)

11988


In [0]:
# Reload the checkpointed table; `with` guarantees the handle is closed.
# NOTE: pickle.load can execute arbitrary code, so only load files written by
# this notebook.
with open("master_table_pitchfork","rb") as pickle_in:
  master_table_pitchfork = pickle.load(pickle_in)
print(len(master_table_pitchfork))

11988


In [0]:
###scrape webpage for album review text
# Exactly one entry is appended per table row, so review_text can be attached
# to master_table_pitchfork as a column without any length mismatch or shift.
review_text = []

# Iterate over the href column itself rather than a hard-coded row count, so
# the cell keeps working when the number of scraped reviews changes.
for href in master_table_pitchfork['href']:

  page = requests.get(href)

  if not page:
    # A requests Response is falsy for 4xx/5xx status codes.
    review_text.append("NULL")

  else:
    soup = BeautifulSoup(page.text, 'html.parser').find_all('div', attrs={"class":"contents"})

    # A page may contain zero or several matching divs. Merge their text into
    # one string, or fall back to the same "NULL" sentinel used for failed
    # requests when there is nothing to keep.
    texts = [div.text for div in soup if div.text]
    review_text.append(' '.join(texts) if texts else "NULL")

In [0]:
# Attach the scraped texts as a `review_text` column; assign returns a new frame.
master_table_pitchfork = master_table_pitchfork.assign(**{'review_text': review_text})

In [0]:
# Overwrite the checkpoint with the table that now carries review_text. The
# `with` block closes the file even if pickling raises.
with open('master_table_pitchfork','wb') as outfile:
  pickle.dump(master_table_pitchfork,outfile)

In [0]:
# Reload the checkpointed table; `with` guarantees the handle is closed.
# NOTE: pickle.load can execute arbitrary code, so only load files written by
# this notebook.
with open("master_table_pitchfork","rb") as pickle_in:
  master_table_pitchfork = pickle.load(pickle_in)
print(len(master_table_pitchfork))

## NME scrape

In [0]:
###webpages to scrape
pagelist = ['https://www.nme.com/reviews/album/page/' + str(page_number)
            for page_number in range(0, 150)]


def _nme_artist(header):
  """Return the part of a listing header that precedes the artist/album dash.

  ' –' (en dash) is tried first, then ' -' (hyphen). The whole header is
  returned when neither separator is present.
  """
  for separator in (' –', ' -'):
    if separator in header:
      return header.split(separator)[0]
  return header


def _nme_album(header):
  """Return the quoted part of a listing header.

  Straight quotes are tried first: the text between the first and second "'".
  Otherwise the text after the first '‘' up to the next '’' is used. The whole
  header is returned when it contains no quote at all.
  NOTE(review): an apostrophe inside the artist or title is treated as a quote
  delimiter, so such headers give a truncated album name.
  """
  if '\'' in header:
    return header.split('\'')[1]
  if '‘' in header:
    return header.split('‘')[1].split('’')[0]
  return header


href = []
artist = []
album = []
###scrape hyperlinks and extract artist/album tags
# Links and headers are collected independently per listing item. The DataFrame
# constructor below raises if the three lists end up with different lengths.
for page_url in pagelist:

  page = requests.get(page_url)
  soup = BeautifulSoup(page.text, 'html.parser').find_all('li', attrs={"class":"listing-item"})

  for s in soup:

    for a in s.find_all('a'):
      href.append(a['href'])

    for header in s.find_all("h3"):
      header_1 = header.text.strip()
      artist.append(_nme_artist(header_1))
      album.append(_nme_album(header_1))

artist = [a.replace('Album Review: ', '') for a in artist]

###table of hyperlinks
# Built in a single step; DataFrame.append was removed in pandas 2.0.
master_table_nme = pd.DataFrame({'href': href,'artist': artist,'album': album},
                                columns=['href', 'artist', 'album'])

In [0]:
###scrape webpage for album review text
# Exactly one entry is appended per table row, so review_text can be attached
# to master_table_nme as a column.
review_text = []

# Paragraphs starting with any of these prefixes are left out of the review.
skip_prefixes = ("window", "Release", "Record")

# Iterate over the href column itself rather than a hard-coded row count.
for href in master_table_nme['href']:

  page = requests.get(href)

  if not page:
    # A requests Response is falsy for 4xx/5xx status codes.
    review_text.append("NULL")

  else:
    soup = BeautifulSoup(page.text, 'html.parser').find_all('p')
    sentences = []
    for p in soup:
      if p.text:
        para = str(p.text.strip())
        if not para.startswith(skip_prefixes):
          sentences.append(para)

    # This append belongs to the success branch only. Outside the `else`, a
    # failed page would add a second entry after "NULL", built from the
    # previous page's paragraphs.
    review_text.append(' '.join(sentences))

In [0]:
# Attach the scraped texts as a `review_text` column; assign returns a new frame.
master_table_nme = master_table_nme.assign(**{'review_text': review_text})

In [0]:
#combine
# Stack the Pitchfork and NME tables into one corpus. master_table has to be
# defined here, because both the length check and the pickle step use it.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
master_table = pd.concat([master_table_pitchfork, master_table_nme])
print(len(master_table))


16638


In [0]:
#pickle
outfile = open('album_corpus','wb')
pickle.dump(master_table,outfile)
outfile.close()