# 0. Libraries import

In [None]:
import time
import requests
from bs4 import BeautifulSoup
import sqlite3
from selenium import webdriver

# 1. Accessing the data

The Top 100 page lists the 100 favorite albums of SensCritique users. Some data about each album is available on this page, but other relevant information is only displayed on each album's individual page.
We therefore need to gather the URLs of all these individual pages.

## Connection to the main page

As the page we want to scrape only displays the first half of the albums when first loaded, we use Selenium, a browser automation library, to automatically scroll to the bottom of the page.
Then, we wait a few seconds using the *time* library to make sure the second half of the page is displayed before we start scraping the data.

In [None]:
# Open the Top 100 page in a Chrome session and scroll to the bottom so the
# second half of the album list gets loaded.
TOP_100_URL = 'https://www.senscritique.com/musique/tops/top100-des-top10'
SCROLL_TO_BOTTOM_JS = "window.scrollTo(0, document.body.scrollHeight);"
LOAD_WAIT_SECONDS = 10

driver = webdriver.Chrome()
driver.get(TOP_100_URL)
driver.execute_script(SCROLL_TO_BOTTOM_JS)
# Fixed pause: leave the page time to display the remaining albums.
time.sleep(LOAD_WAIT_SECONDS)

## Parsing the main page

We use BeautifulSoup to parse the HTML page source held by the *driver* variable.

In [None]:
# Grab the HTML as currently rendered by the browser and parse it with the
# standard-library 'html.parser' backend.
rendered_html = driver.page_source
soup = BeautifulSoup(rendered_html, features='html.parser')

# The browser session is no longer needed once the markup has been parsed.
driver.quit()

## Gathering all the individual album page urls

As not all the information we are seeking is displayed on the Top 100 page, we need to scrape the rest from each album's individual page. To do so, we collect the URL of the clickable link corresponding to each album.

All the pages share the same URL structure, so we concatenate the SensCritique main site URL with the relative path we collected for each album. We finally obtain a list with the URLs of all the Top 100 albums.

In [None]:
# The selector targets <a> tags whose class attribute is this exact string of
# generated CSS class names; href=True keeps only anchors that carry a link.
ALBUM_LINK_CLASS = (
    'Text__SCText-sc-kgt5u3-0 Link__SecondaryLink-sc-1vfcbn2-1 gwWwBt eDKWEX '
    'ProductListItem__StyledProductTitle-sc-ico7lo-3 ivaIVy'
)
album_titles = soup.find_all('a', {'class': ALBUM_LINK_CLASS}, href=True)

# Prefix every collected href with the site root to get a full album URL.
href = [
    "https://www.senscritique.com" + anchor['href']
    for anchor in album_titles
]

# 2. Individual page scraping

## List initialization

On each album page, we use BeautifulSoup to collect the following data :
- The title
- The publication year
- The artist name
- The number of ratings
- The number of times the album was added to users' registered albums
- The global rating
- The number of times the album was added to users' favorite albums

We first initialize the lists that will be containing all the scraped data.

In [None]:
# One accumulator per scraped field. The generator yields a fresh list for
# every name, so the seven lists are independent objects.
(
    title_list,
    publication_year_list,
    artist_list,
    number_of_rating_list,
    global_rating_list,
    registered_number_list,
    favorite_number_list,
) = ([] for _ in range(7))

## Scraping each album data

In [None]:
# Fetch every album page and append one value per field to the accumulator
# lists, so that position i in each list describes the album at href[i].
# All values are stored as the raw text found in the page.

# Without an explicit timeout, requests.get can block indefinitely on a
# stalled connection.
REQUEST_TIMEOUT_SECONDS = 10

for url in href:
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Stop on 4xx/5xx responses instead of parsing an error page and failing
    # later with an unrelated IndexError on a missing element.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    # Title: the "title" attribute of the first <h1> of the page.
    album_title = soup.find_all('h1')
    title_list.append(album_title[0]['title'])

    # Publication year: text of the first <p> with this exact class string.
    publication_year = soup.find_all('p', {
        'class': 'Text__SCTitle-sc-kgt5u3-1 CoverProductInfos__StyledText-sc-cbcfd0-10 hDmvGP fnFfaR'})
    publication_year_list.append(publication_year[0].text)

    # Artist: text of the first <span> that carries an href attribute.
    artist = soup.find_all('span', href=True)
    artist_list.append(artist[0].text)

    # The same class string is shared by two statistics: the first match is
    # used as the number of ratings, the second as the registered count.
    number_of_rating_and_registered = soup.find_all('p', {
        'class': 'Text__SCText-sc-kgt5u3-0 Stats__Text-sc-l0a962-2 hrLruZ IwdGM'})
    number_of_rating_list.append(number_of_rating_and_registered[0].text)
    registered_number_list.append(number_of_rating_and_registered[1].text)

    # Global rating: text of the first matching <div>.
    global_rating = soup.find_all('div', {'class': 'Rating__GlobalRating-sc-1rkvzid-5 eCIKNi'})
    global_rating_list.append(global_rating[0].text)

    # Favorite count: text of the first <p> with this exact class string.
    favorite_number = soup.find_all('p', {'class': 'Text__SCText-sc-kgt5u3-0 Stats__Text-sc-l0a962-2 hrLruZ FlIXF'})
    favorite_number_list.append(favorite_number[0].text)

Once scraped, each value is stored in its corresponding list.

# 3. Storing data in a database

## Creating the database

We first create a database called *album_top_100.db* and then create a table with all the required columns.

In [None]:
# Open the SQLite database file (created on first use) that will hold the
# scraped ranking.
conn = sqlite3.connect('album_top_100.db')

# IF NOT EXISTS makes this step safe to run again: a plain CREATE TABLE
# raises sqlite3.OperationalError once the table is already in the file.
CREATE_TABLE_SQL = '''
    CREATE TABLE IF NOT EXISTS album_top_100
    (ranking INTEGER PRIMARY KEY,
    title VARCHAR(100),
    publication_year INTEGER,
    artist VARCHAR(100),
    global_rating FLOAT,
    rating_number INTEGER,
    registered_number INTEGER,
    favorite_number INTEGER)
'''

conn.execute(CREATE_TABLE_SQL)

## Inserting data in the database

After creating the table, we add all the data we gathered to the database for each album.

In [None]:
# Insert one row per album; the ranking is the album's 1-based position in
# href, and the other values come from the same position in each list.
INSERT_ALBUM_SQL = (
    'INSERT INTO album_top_100 '
    '(ranking, title, publication_year, artist, global_rating, rating_number, '
    'registered_number, favorite_number) '
    'VALUES (?, ?, ?, ?, ?, ?, ?, ?)'
)

# Order matters: it must follow the column list of the INSERT statement.
field_lists = (
    title_list,
    publication_year_list,
    artist_list,
    global_rating_list,
    number_of_rating_list,
    registered_number_list,
    favorite_number_list,
)

for index in range(len(href)):
    album_row = (index + 1, *(values[index] for values in field_lists))
    conn.execute(INSERT_ALBUM_SQL, album_row)

## Database exiting

To save our changes to the database, we commit them and then close the connection.

In [None]:
# Commit the pending changes made through conn so they are written to the
# database.
conn.commit()

# Release the database connection.
conn.close()