**Data Science - Exam 1 (13/03/2023)**  
Student: Clémentine BLEUZE  
Number: 319 244 04

## Ex 1: Data extraction

In [20]:
import requests
from bs4 import BeautifulSoup

# Start URL of the bookshop's literature section; a genre slug from GENRES is appended to it
BOOK_URL = "https://www.decitre.fr/livres/litterature/"
# Genres we will explore (used both as URL slugs and as the "genre" value stored with each book)
GENRES = ["romans", "polars", "fantasy-sf", "pleiades", "livres-audio"]
# Base URL of French Wikipedia (not referenced by any of the cells visible in this notebook)
WIKI_URL = "https://fr.wikipedia.org/"
# HTTP headers sent with every page download
# NOTE(review): presumably a browser-like User-agent is needed for the shop to answer - confirm
UA = {"User-agent": "Mozilla/5.0"}

In [21]:
def _first_text(node, selector):
    """Return the whitespace-stripped text of the first element matching `selector` under `node`, or None if nothing matches."""
    matches = node.select(selector)
    return matches[0].text.strip() if matches else None


def _review_score(card):
    """Return the review score displayed on a product card, or None when the card has no star container."""
    containers = card.select(".star-container")
    if not containers:
        return None
    # Each filled star contributes its "data-rating-part" value divided by 100
    # (presumably the percentage of the star that is filled - TODO confirm)
    filled_stars = containers[0].select(".dct-icon-star.filled")
    return sum(int(star["data-rating-part"]) / 100 for star in filled_stars)


def _fetch_description(book_url, timeout):
    """Download a book's own page and return the text of its first ".content" element, or None if unavailable."""
    try:
        book_page = requests.get(book_url, headers = UA, timeout = timeout)
    except requests.RequestException:
        # One unreachable detail page should not discard the books already collected
        return None
    page_soup = BeautifulSoup(book_page.content, "html.parser")
    return _first_text(page_soup, ".content")


def extract_n_books(url:str, genre:str, n:int, timeout:float = 30) -> list:
    """Scrape the first `n` books listed on a single bookshop page.

    For every book the following information is collected:
    - "name": the book's title
    - "price": the displayed price, kept as text (e.g. "8,70 €")
    - "genre": the `genre` argument, stored as given
    - "author": the book's author
    - "review-score": the star rating, or None when the card shows no rating
    - "description": the description found on the book's own page

    Any field that cannot be found on the page is set to None instead of raising.

    Parameters:
        url: address of the listing page to scrape (only this page is read).
        genre: label stored in the "genre" field of every scraped book.
        n: number of records to return.
        timeout: seconds to wait for each HTTP response before giving up.

    Returns:
        A list of exactly `n` dictionaries (empty if `n` <= 0). When the page lists
        fewer than `n` books, the list is padded with records whose values are all None.

    Raises:
        requests.RequestException: if the listing page itself cannot be downloaded.
    """
    # Local import: only needed here, and keeps this cell runnable on its own
    from urllib.parse import urljoin

    # Get the page and parse it
    page = requests.get(url, headers = UA, timeout = timeout)
    soup = BeautifulSoup(page.content, "html.parser")
    # Get all results from the page
    results = soup.select(".dct-product.clearfix.fiche-produit.product-availability-01")

    books = []
    for card in results[:max(n, 0)]:
        titles = card.select(".product-title")
        href = titles[0].get("href") if titles else None

        books.append({
            "name": _first_text(card, ".product-title"),
            "price": _first_text(card, ".final-price"),
            "genre": genre,
            "author": _first_text(card, ".trackme"),
            "review-score": _review_score(card),
            # urljoin leaves absolute links untouched and resolves relative ones against the listing URL
            "description": _fetch_description(urljoin(url, href), timeout) if href else None,
        })

    # Pad with placeholder records so that the caller always receives n entries
    placeholder = {"name": None, "price": None, "genre": None, "author": None, "review-score": None, "description": None}
    books.extend(dict(placeholder) for _ in range(n - len(books)))

    return books

NB: The previous function cannot parse multiple pages (for example, when one wants to select 100 books per genre and needs to parse 2 pages). A draft of such an extension is in the following cell, but it has not been finished.

In [22]:
def extract_n_books_on_multiple_pages(url:str , genre:str,  n: int, books:list, page_nb:int, timeout:float = 30):
    """Append up to `n` books to `books`, reading as many consecutive listing pages as needed.

    Pages are requested as `url + "?page=" + <page number>`, starting at `page_nb`.
    Scraping stops as soon as `n` books were appended or a page lists no book at all,
    so a listing that is shorter than expected cannot cause an endless chain of requests.

    Each appended dictionary has the keys "name", "price", "genre", "author",
    "review-score" and "description"; a field missing from the page is stored as None.

    Parameters:
        url: address of the listing, without the "?page=" suffix.
        genre: label stored in the "genre" field of every scraped book.
        n: maximum number of books to append.
        books: list that receives the scraped books (modified in place).
        page_nb: number of the first page to read.
        timeout: seconds to wait for each HTTP response before giving up.

    Returns:
        None. The result is delivered through `books`.

    Raises:
        requests.RequestException: if the first listing page cannot be downloaded.
            A failure on a later page only emits a warning and keeps what was collected.
    """
    # Local imports: only needed here, and keep this cell runnable on its own
    import warnings
    from urllib.parse import urljoin

    def first_text(node, selector):
        """Stripped text of the first element matching `selector`, or None."""
        matches = node.select(selector)
        return matches[0].text.strip() if matches else None

    remaining = n
    current_page = page_nb

    while remaining > 0:
        page_url = url + "?page=" + str(current_page)
        try:
            page = requests.get(page_url, headers = UA, timeout = timeout)
        except requests.RequestException as error:
            if current_page == page_nb:
                raise
            warnings.warn(f"Stopped at page {current_page} of {url}: {error}")
            break

        soup = BeautifulSoup(page.content, "html.parser")
        results = soup.select(".dct-product.clearfix.fiche-produit.product-availability-01")
        if not results:
            # No book on this page: there is nothing further to read
            break

        for card in results[:remaining]:
            # get name, price, genre, author of book
            book = {
                "name": first_text(card, ".product-title"),
                "price": first_text(card, ".final-price"),
                "genre": genre,
                "author": first_text(card, ".trackme"),
            }

            # get review score: sum of the "data-rating-part" of every filled star, divided by 100
            star_container = card.select(".star-container")
            if star_container:
                stars = star_container[0].select(".dct-icon-star.filled")
                book["review-score"] = sum(int(st["data-rating-part"]) / 100 for st in stars)
            else:
                book["review-score"] = None

            # get description (from the book's page)
            titles = card.select(".product-title")
            href = titles[0].get("href") if titles else None
            book["description"] = None
            if href:
                try:
                    # urljoin leaves absolute links untouched and resolves relative ones
                    book_page = requests.get(urljoin(page_url, href), headers = UA, timeout = timeout)
                except requests.RequestException:
                    book_page = None
                if book_page is not None:
                    page_soup = BeautifulSoup(book_page.content, "html.parser")
                    book["description"] = first_text(page_soup, ".content")

            books.append(book)

        remaining -= min(len(results), remaining)
        current_page += 1
    

In [23]:
#n = 10
#books = []
#for genre in GENRES:
    #extract_n_books_on_multiple_pages(BOOK_URL, genre, n, books, 1)

So we can now make a selection of books mixing various genres:

In [24]:
n = 10
books = []
for genre in GENRES:

    # Pick the first page to scrape for this genre: audio books have no
    # best-seller page, so their plain genre page is used instead
    if genre == "livres-audio":
        url = BOOK_URL + genre + ".html"
    else:
        url = BOOK_URL + genre + "/meilleures-ventes.html"

    # Collect the n first books of the genre into the common selection
    books += extract_n_books(url, genre, n)

In [25]:
# Visualizing data using pandas Dataframes
import pandas as pd
# One row per scraped book; the columns are the keys of the book dictionaries
books_df = pd.DataFrame(books)
# Last expression of the cell: displays the table
books_df

Unnamed: 0,name,price,genre,author,review-score,description
0,Les femmes du bout du monde,"21,90 €",romans,Mélissa Da Costa,,"Si tu te demandes ce que nous faisons ainsi, l..."
1,Les douleurs fantômes,"9,70 €",romans,Mélissa Da Costa,4.5,"Rosalie, Gabriel, Tim, Anton et Ambre formaien..."
2,The Devil's Sons Tome 1,"19,90 €",romans,Chloé Wallerand,,Entrer dans un gang ? Jamais Avalone n'aurait ...
3,Tant que le café est encore chaud,"7,40 €",romans,Toshikazu Kawaguchi,3.5,Dans une petite ruelle de Tokyo se trouve Funi...
4,La décision,"8,70 €",romans,Karine Tuil,5.0,Mai 2016. Dans une aile ultrasécurisée du Pala...
5,Kilomètre zéro. Le chemin du bonheur,"7,90 €",romans,Maud Ankaoua,4.0,"Maëlle, directrice financière d'une start-up e..."
6,Les années glorieuses Le Grand Monde,"10,40 €",romans,Pierre Lemaitre,4.5,La famille Pelletier : trois histoires d'amour...
7,Plus jamais sans moi,"18,90 €",romans,Maud Ankaoua,,Et s'il était temps pour vous de rencontrer en...
8,Jamais plus,"7,60 €",romans,Colleen Hoover,4.5,Lily Blossom Bloom n'a pas eu une enfance très...
9,Tout le bleu du ciel,"10,90 €",romans,Mélissa Da Costa,4.5,"""Petitesannonces. fr : Jeune homme de 26 ans, ..."


Now trying to add more information using Wikipedia:

In [26]:
import wikipedia

# Summaries already looked up, keyed by author name, so that an author who wrote
# several of the selected books is requested from Wikipedia only once
bio_by_author = {}

for book in books:
    author = book["author"]

    # Placeholder records have no author: nothing to look up
    if not author:
        book["author-bio"] = None
        continue

    if author not in bio_by_author:
        try:
            # auto_suggest=False: look up the page whose title is exactly the author's name
            bio_by_author[author] = wikipedia.page(title=author, auto_suggest=False).summary
        except (wikipedia.DisambiguationError, wikipedia.PageError):
            # Ambiguous name or no page with that title: leave the biography empty
            bio_by_author[author] = None

    book["author-bio"] = bio_by_author[author]



  lis = BeautifulSoup(html).find_all('li')


In [27]:
# Example of a book with author biography
# (in the run whose output is shown below, index 4 is "La décision" by Karine Tuil)
books[4]

{'name': 'La décision',
 'price': '8,70 €',
 'genre': 'romans',
 'author': 'Karine Tuil',
 'review-score': 5.0,
 'description': "Mai 2016. Dans une aile ultrasécurisée du Palais de justice, la juge Alma Revel doit se prononcer sur le sort d'un jeune homme suspecté d'avoir rejoint l'Etat islamique en Syrie. A ce dilemme professionnel s'en ajoute un autre, plus intime : mariée depuis plus de vingt ans à un écrivain à succès sur le déclin, Alma entretient une liaison avec l'avocat qui représente le mis en examen. Entre raison et déraison, ses choix risquent de bouleverser sa vie et celle du pays... Avec ce nouveau roman, Karine Tuil nous entraîne dans le quotidien de juges d'instruction antiterroristes, au coeur de l'âme humaine, dont les replis les plus sombres n'empêchent ni l'espoir ni la beauté.",
 'author-bio': 'Karine Tuil (May 3, 1972, Paris -), is a French novelist who has written several award-winning novels in French and English.  Her works have themes ranging from marriage and 

## Ex 2: Data Formatting

In [28]:
# Rebuild the frame from `books` first: the "author-bio" values were added to the
# dictionaries after the DataFrame was created, so without this step they would be
# missing from everything exported below. Then give every book a sequential id.
books_df = pd.DataFrame(books)
books_df["id"] = range(len(books))

In [29]:
# Then in json format: to_json returns the data as a JSON *string*; with orient="index"
# every record is stored under its row label ("0", "1", ...)
books_json = books_df.to_json(orient="index")

In [30]:
# Storing data into a json file
import json
# NOTE(review): books_json is already a JSON string, so json.dump encodes it a second time
# and the file contains a single JSON string literal. Whoever reads the file has to decode
# twice (json.load, then json.loads). Writing books_json directly would avoid this, but the
# cells that load the file would have to be changed at the same time.
with open("books.json", "w") as file:
    json.dump(books_json, file)

## Ex 3: Data Storage

We will now store this data into a MongoDB database, using pymongo. 

In [31]:
import pymongo
from pymongo import MongoClient

# Loading data from storage
# json.load returns a str here (the file stores a JSON-encoded string). This also rebinds
# `books`, which until now held the list of scraped book dictionaries.
with open("books.json", "r") as f:
    books = json.load(f)

In [32]:
# Second decoding step: turn the JSON string into a dict that maps row labels
# ("0", "1", ...) to book records
d = json.loads(books)
# Last expression of the cell: displays the decoded dict
d

{'0': {'name': 'Les femmes du bout du monde',
  'price': '21,90 €',
  'genre': 'romans',
  'author': 'Mélissa Da Costa',
  'review-score': None,
  'description': "Si tu te demandes ce que nous faisons ainsi, loin des hommes, je vais te dire : nous veillons sur notre petit univers, nous veillons les unes sur les autres. C'est ce que font les femmes du bout du monde. A la pointe sud de la Nouvelle-Zélande, dans la région isolée des Catlins, au coeur d'une nature sauvage, vivent Autumn et sa fille Milly. Sur ce dernier bastion de terre avant l'océan Austral et le pôle Sud, elles gèrent le camping Mutunga o te ao, le bout du monde en maori. Autumn et Milly forment un duo inséparable, jusqu'au jour où débarque Flore, une jeune parisienne en quête de rédemption... Hantées par le passé mais bercées par les vents et les légendes maories, ces trois femmes apprendront à se connaître, se pardonner et s'aimer. Mélissa Da Costa nous offre un voyage inoubliable à travers des paysages d'une stupéfian

In [33]:
# Converting back to a list, ordered by record number. Every key of `d` is used instead
# of a hard-coded count of 50, so this stays correct whatever number of books was scraped.
books_list = [d[key] for key in sorted(d, key=int)]

For the following cell: I encountered a localhost connection error that I couldn't solve (see the error message below), maybe due to a recent change of computer (I use a computer borrowed from the University, so sometimes I can't get the authorizations needed to install software or choose certain settings). I still hope the following code can be taken into account.

In [34]:
# Store every book record in the "books" collection of the "testdb" database
# served by the local MongoDB instance; the connection is closed when the block ends
with MongoClient("localhost", 27017) as client:
    db = client["testdb"]
    db.books.insert_many(books_list)

ServerSelectionTimeoutError: localhost:27017: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée, Timeout: 30s, Topology Description: <TopologyDescription id: 640f494ada38bc370c652e72, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée')>]>

## Ex 4: Querying the database

In [None]:
# How many books
# An empty filter matches every document of the collection
with MongoClient("localhost", 27017) as client:
    db = client["testdb"]
    nb = db["books"].count_documents({})
    print(f"There are {nb} books")

In [None]:
# How many books whose score is greater than 3
# The comparison is done by the database: only documents whose "review-score" is above 3 are counted
with MongoClient("localhost", 27017) as client:
    db = client["testdb"]
    nb = db["books"].count_documents({'review-score': {'$gt': 3}})
    print(f"There are {nb} books")

In [None]:
# How many books whose description is longer than 50 words long
client = MongoClient("localhost", 27017)
with client:
    db = client.testdb

    nb = 0
    for book in db.books.find():
        # Placeholder records have no description (None): treat it as an empty text
        description = book.get("description") or ""
        # split() without argument ignores repeated spaces and line breaks, so every
        # whitespace-separated token counts as exactly one word
        if len(description.split()) > 50:
            nb += 1

    print(f"There are {nb} books whose description is longer than 50 words.")

In [None]:
#How many books whose price is less than 10 EUR?
def _price_in_euros(price):
    """Convert a stored price such as "8,70 €" to a float; return None when it cannot be parsed."""
    if isinstance(price, (int, float)):
        return float(price)
    if not isinstance(price, str):
        return None
    # Drop every whitespace character and the currency sign, then use a decimal point
    digits = "".join(price.split()).replace("€", "").replace(",", ".")
    try:
        return float(digits)
    except ValueError:
        return None


client = MongoClient("localhost", 27017)
with client:
    db = client.testdb
    # Prices are stored as text ("8,70 €"), so a numeric {'$lt': 10} filter would match
    # nothing: the comparison is done here, on the parsed values
    prices = [_price_in_euros(book.get("price")) for book in db.books.find()]
    nb = sum(1 for price in prices if price is not None and price < 10)
    print(f"There are {nb} books")

In [None]:
# Which book is the more expensive?
def _numeric_price(price):
    """Convert a stored price such as "21,90 €" to a float; return None when it cannot be parsed."""
    if isinstance(price, (int, float)):
        return float(price)
    if not isinstance(price, str):
        return None
    # Drop every whitespace character and the currency sign, then use a decimal point
    digits = "".join(price.split()).replace("€", "").replace(",", ".")
    try:
        return float(digits)
    except ValueError:
        return None


client = MongoClient("localhost", 27017)
with client:
    db = client.testdb
    # Prices are stored as text, and sorting text in the database would rank "9,70 €"
    # above "21,90 €": compare the parsed values instead
    priced_books = []
    for book in db.books.find():
        value = _numeric_price(book.get("price"))
        if value is not None:
            priced_books.append((value, book))

    if priced_books:
        most_expensive = max(priced_books, key=lambda pair: pair[0])[1]
        # Single quotes inside the braces keep the f-string valid on every Python 3 version
        print(f"Book '{most_expensive['name']}' is the most expensive.")
    else:
        print("No book with a usable price was found.")

In [None]:
# Which books have a score bigger than 3 and a description longer than 50 words?
client = MongoClient("localhost", 27017)
with client:
    db = client.testdb

    # The score condition is evaluated by the database; the word count is done here.
    # A missing description counts as empty, and split() without argument ignores
    # repeated spaces and line breaks.
    matching_names = [
        book.get("name")
        for book in db.books.find({"review-score": {"$gt":3}})
        if len((book.get("description") or "").split()) > 50
    ]

    nb = len(matching_names)
    print(f"There are {nb} books whose description is longer than 50 words and whose score is bigger than 3.")
    # The question asks which books match, so list them as well
    for name in matching_names:
        print(f"- {name}")

## Ex 5: Adding RDF triples