# <center>Webscraping</center>

### première partie du projet bloc 1

- Webscraping du site [www.allocine.fr](https://www.allocine.fr/films/)

![filtres](images/filtresSMALL.png)

## Sources :
**Beautiful Soup** :<br>
[beautiful-soup-4](https://beautiful-soup-4.readthedocs.io/en/latest/)<br>
[beautiful-soup-4.readthedocs.io](https://beautiful-soup-4.readthedocs.io/en/latest/#searching-the-tree)<br>

**Selenium** :<br>
[selenium-python.readthedocs.io](https://selenium-python.readthedocs.io/locating-elements.html)<br>
[selenium.dev/documentation](https://www.selenium.dev/documentation/webdriver/elements/information/)<br>
[selenium.dev/documentation/finders/](https://www.selenium.dev/documentation/webdriver/elements/finders/)<br>
[geeksforgeeks.org/get_property-selenium/](https://www.geeksforgeeks.org/get_property-element-method-selenium-python/)<br>

Les liens sont sûrement générés dynamiquement (voire aléatoirement) ; on peut utiliser XPath avec Selenium,<br>
ou bien avec lxml ?<br>

In [16]:
%reset

In [17]:
import os
import re
import httpx
import requests
import pandas as pd

from bs4 import BeautifulSoup
from IPython.display import display
from tqdm import tqdm

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

# Chrome command-line switches shared by every browser session of this notebook.
options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    options.add_argument(chrome_flag)

# Main browser session, reused by the Selenium-based cells below.
driver = webdriver.Chrome(options=options)

%config IPCompleter.greedy = True

# Site root, and the film-listing section that the scraping cells start from.
url_site = 'https://www.allocine.fr/'
url_films = f'{url_site}films/'

### On scrape tous les genres de film

In [3]:
# Scrape all film genres listed in the first filter section of the films page.
# NOTE(review): the basic-auth credentials look like placeholders — confirm
# whether the site needs them at all.
r = requests.get(url_films, auth=('user', 'pass'))
if r.status_code != 200:
    print("url_site error")

soup = BeautifulSoup(r.content, 'html.parser')
print(type(soup))

# The first 'filter-entity-section' block is read as the genre filter:
# one <li> per genre, the genre name being the text of its link.
elt_categories = soup.find('div', class_='filter-entity-section')
categories = [elt.a.text for elt in elt_categories.find_all('li')]

print("Nb categories :", len(categories))
df_categories = pd.Series(categories)

# Two-way lookup between a genre's position in the list and its name.
# The reverse map was previously built with swapped loop variables, which
# produced a second index -> name dict instead of name -> index.
dict_n_cat = dict(enumerate(categories))
dict_cat_n = {name: index for index, name in dict_n_cat.items()}

<class 'bs4.BeautifulSoup'>
Nb categories : 37


### On scrape les films par période
[1980 - 1989] puis [1990 - 1999] ...

In [4]:
# List the decade filter entries and flag the decades selected for scraping.
# This static-HTML attempt does not yield the decade URLs: the original author
# notes that reading an href from the <span> entries does not work because
# the URLs are 'decorated'.
decades_to_scrap = [str(decade) for decade in range(1980, 1989, 10)]
print(decades_to_scrap)

# The decade filter is the element that follows the genre filter.
elt_years = elt_categories.find_next_sibling()
lstUrl = []

first_item = elt_years.find('li')
decade_items = [first_item, *first_item.find_next_siblings()] if first_item else []
for elt_cur in decade_items:
    text = elt_cur.span.text
    # Selected decades are printed with a leading "**" marker.
    marker = ("**",) if text[:4] in decades_to_scrap else ()
    print(*marker, text)

['1980']
2030 - 2039
2020 - 2029
2010 - 2019
2000 - 2009
1990 - 1999
** 1980 - 1989
1970 - 1979
1960 - 1969
1950 - 1959
1940 - 1949
1930 - 1939
1920 - 1929
1910 - 1919
1900 - 1909
1890 - 1899


### Autre méthode utilisant Selenium
Puisque nous n'arrivons pas à récupérer les URL, nous allons utiliser Selenium, qui permet entre autres :
- d'utiliser les XPath (contrairement à Beautiful Soup)
- de récupérer certains éléments « décorés », par exemple des URL

In [18]:
def number_pages_per_year(soup_year):
    """Return the number of result pages listed for one year.

    Args:
        soup_year: Parsed year-listing page (any object exposing
            BeautifulSoup's ``find`` API).

    Returns:
        The last numeric label found among the ``<span>`` elements of the
        pagination holder, or 1 when the page has no pagination holder or
        no numeric label in it.
    """
    pagination = soup_year.find('div', class_='pagination-item-holder')
    if pagination is None:
        # No pagination bar at all: treat the listing as a single page.
        return 1

    labels = (span.text.strip() for span in pagination.find_all('span'))
    numeric_labels = [label for label in labels if label.isdigit()]
    # The page count is read from the last span; non-numeric spans
    # (separators, arrows, ...) are skipped instead of crashing int().
    return int(numeric_labels[-1]) if numeric_labels else 1

def get_title(soup_movie):
    """Return the text of the movie page's extra-large title bar."""
    title_bar = soup_movie.find('div', class_="titlebar-title titlebar-title-xl")
    return title_bar.text

def get_date_duration_categories(soup_movie):
    """Split the movie's meta line into release date, duration and genres.

    The meta line must contain exactly three ``|``-separated fields; any
    other count raises ``ValueError`` from the tuple unpacking.

    Returns:
        A ``(date, duration, categories)`` tuple of stripped strings.
    """
    meta_block = soup_movie.find('div', class_="meta-body-item meta-body-info")
    raw_date, raw_duration, raw_categories = meta_block.get_text(strip=True).split('|')
    # The last 8 characters of the date field are dropped
    # (presumably the "en salle" suffix — confirm against the live page).
    return raw_date[:-8].strip(), raw_duration.strip(), raw_categories.strip()

def get_directors(soup_casting):
    """Return the directors' names listed on a casting page.

    Args:
        soup_casting: Parsed casting page (BeautifulSoup-like object).

    Returns:
        The link text of every person card of the director section, in page
        order. An empty list is returned when the section or its card
        container is missing, the same convention ``get_actors`` uses for a
        missing section.
    """
    elt_director_section = soup_casting.find('section', class_='section casting-director')
    if elt_director_section is None:
        return []

    # The cards sit in the sibling that follows the first element found
    # after the section tag (presumably the section heading — confirm).
    elt_first = elt_director_section.find_next()
    elt_cards = elt_first.find_next_sibling() if elt_first is not None else None
    if elt_cards is None:
        return []

    elts_directors = elt_cards.find_all('div', class_='card person-card person-card-col')
    links = (elt_director.find('a') for elt_director in elts_directors)
    # Cards without a link are skipped rather than raising AttributeError.
    return [link.text for link in links if link is not None]

def get_actors(soup_casting):
    """Return the actors shown as person cards on a casting page.

    Only the card block of the actor section is read; the section's
    ``md-table-row`` entries are not used to complete the list.

    Args:
        soup_casting: Parsed casting page (BeautifulSoup-like object).

    Returns:
        The ``title`` attribute of the ``<span>`` inside each card's
        ``<figure>``, in page order. An empty list is returned when the actor
        section or its card container is missing.
    """
    elt_actor_section = soup_casting.find('section', class_='section casting-actor')
    if elt_actor_section is None:
        return []

    # The cards sit in the sibling that follows the first element found
    # after the section tag (presumably the section heading — confirm).
    elt_first = elt_actor_section.find_next()
    elt_cards = elt_first.find_next_sibling() if elt_first is not None else None
    if elt_cards is None:
        return []

    lst_actors = []
    for elt_actor in elt_cards.find_all('div', class_='card person-card person-card-col'):
        elt_figure = elt_actor.find('figure')
        elt_span = elt_figure.find('span') if elt_figure is not None else None
        # Incomplete cards (no figure, span or title) are skipped.
        if elt_span is not None and elt_span.get('title'):
            lst_actors.append(elt_span['title'])
    return lst_actors

def get_summary(soup_movie):
    """Return the synopsis paragraph of a movie page, without surrounding whitespace."""
    synopsis_section = soup_movie.find('section', class_="section ovw ovw-synopsis")
    synopsis_paragraph = synopsis_section.find('p', class_='bo-p')
    return synopsis_paragraph.text.strip()

def get_thumbnail(soup_movie):
    """Return the ``src`` of the image nested in the page's first ``thumbnail`` figure."""
    figure = soup_movie.find('figure', class_='thumbnail')
    image = figure.span.img
    return image['src']

def _directors_from_movie_page(soup_movie):
    """Read director names from the one-line credit blocks of the movie page itself."""
    elts = soup_movie.find_all('div', class_="meta-body-item meta-body-direction meta-body-oneline")
    if not elts:
        return []

    # First credit line: its first two characters are dropped
    # (presumably the "De" label — confirm against the live page).
    lst_directors = [elts[0].text.strip()[2:].strip()]

    if len(elts) > 1:
        for elt in elts[1].find_all('span'):
            # Spans carrying the 'light' class are skipped; a span without
            # any class attribute is treated as a regular name.
            if 'light' in elt.get('class', []):
                continue
            name = elt.get_text(strip=True)
            if name not in lst_directors:
                lst_directors.append(name)
    return lst_directors


def _save_thumbnail(url_thumbnail, title):
    """Download *url_thumbnail* into ``<cwd>/thumbnails/thumbnail-<title>.jpg``."""
    folder_name = os.path.join(os.getcwd(), 'thumbnails')
    # Joining with '' appends the path separator, so the folder is still
    # printed with its trailing separator.
    print(os.path.join(folder_name, ''))

    # Path separators and characters refused in Windows file names are
    # replaced, otherwise titles such as "Who? / What: ..." cannot be saved.
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', title).strip()
    image_path = os.path.join(folder_name, f"thumbnail-{safe_title}.jpg")

    try:
        # Download first, so that a failed request leaves no empty file behind.
        image = httpx.get(url_thumbnail)
        image.raise_for_status()
        os.makedirs(folder_name, exist_ok=True)
        with open(image_path, "wb") as file:
            file.write(image.content)
    except (OSError, httpx.HTTPError) as err:
        print("Cannot save the thumbnail:", err)


def scrap_movie(elt_movie):
    """Scrape one movie from a listing entry, print its details and save its poster.

    Printed fields: title, date, duration, categories, directors, actors,
    summary and thumbnail URL. Directors and actors come from the casting
    page when the movie page links to one; otherwise directors are read from
    the movie page itself and the actor list stays empty.

    Args:
        elt_movie: Listing entry whose ``h2 > a`` link points to the movie page.

    Returns:
        None. Nothing is collected; the function only prints and writes the
        thumbnail file.
    """
    from urllib.parse import urljoin  # resolves hrefs without doubling the '/'

    # NOTE(review): the basic-auth credentials look like placeholders — confirm
    # whether the site needs them at all.
    url_movie = urljoin(url_site, elt_movie.h2.a.get('href'))
    r = requests.get(url_movie, auth=('user', 'pass'))
    if r.status_code != 200:
        print("url_site error")
        return
    soup_movie = BeautifulSoup(r.content, 'html.parser')

    title = get_title(soup_movie)
    print("Title:", title)

    date, duration, categories = get_date_duration_categories(soup_movie)
    print("Date:", date)
    print("Duration:", duration)
    print("Categories:", categories)

    lst_directors, lst_actors = [], []
    elt_link_casting = soup_movie.find('a', class_='end-section-link')

    if elt_link_casting is not None and 'Casting' in elt_link_casting.get('title', ''):
        r = requests.get(urljoin(url_site, elt_link_casting['href']), auth=('user', 'pass'))
        soup_casting = BeautifulSoup(r.content, 'html.parser')
        lst_directors = get_directors(soup_casting)
        lst_actors = get_actors(soup_casting)
    else:
        # No casting page linked (the original author saw this on animation
        # movies): fall back to the credits shown on the movie page.
        lst_directors = _directors_from_movie_page(soup_movie)

    print('Directors:', lst_directors)
    print('Actors:', lst_actors)

    print("Summary:", get_summary(soup_movie))

    url_thumbnail = get_thumbnail(soup_movie)
    print("Thumbnail:", url_thumbnail)
    _save_thumbnail(url_thumbnail, title)


# Crawl: decade filter -> years of each selected decade -> result pages -> movies.
# Debug limits kept from the original: only the first movie of the first
# result page of the first year processed is scraped (see the `[:1]` slice
# and the two `break` statements).
driver.get(url_films)
elts_decades = driver.find_elements(By.XPATH, '/html/body/div[2]/main/section[3]/div[1]/div/div[3]/div[2]/ul/li')

for elt_decade in elts_decades:
    elt_a = elt_decade.find_element(By.TAG_NAME, 'a')

    # get_attribute returns None when the link has no title attribute.
    if (elt_a.get_attribute('title') or '')[:4] not in decades_to_scrap:
        continue

    url_decades = elt_a.get_attribute('href')
    print("decade:", url_decades)

    # A second browser loads the decade page (presumably so that `driver`
    # stays on the films page and `elts_decades` remains usable — confirm).
    # It is always shut down, so no Chrome process is leaked per decade.
    driver2 = webdriver.Chrome(options = options)
    try:
        driver2.get(url_decades)
        elts_years = driver2.find_elements(By.XPATH, '/html/body/div[2]/main/section[3]/div[1]/div/div[3]/div[3]/ul/li')
        # Copy the URLs out while the browser is alive; years are visited
        # in reverse listing order.
        urls_years = [elt_year.find_element(By.TAG_NAME, 'a').get_attribute('href')
                      for elt_year in reversed(elts_years)]
    finally:
        driver2.quit()

    for url_year in urls_years:
        print("year:", url_year)

        r = requests.get(url_year, auth=('user', 'pass'))
        if r.status_code != 200:
            print("url_site error")
            continue  # do not parse an error page

        # We get the number of pages for this year
        soup_year = BeautifulSoup(r.content, 'html.parser')
        nb_pages = number_pages_per_year(soup_year)

        for i in range(nb_pages): # Need to reduce as some movies are totally unknown, with very little information
            url_year_page = url_year + f'?page={i+1}'
            r = requests.get(url_year_page, auth=('user', 'pass'))
            if r.status_code != 200:
                print("url_site error")
                continue  # do not parse an error page
            soup_movies = BeautifulSoup(r.content, 'html.parser')
            elt_movies = soup_movies.find_all('li', class_='mdl')
            for elt_movie in elt_movies[:1]:
                scrap_movie(elt_movie)
            break
        break

decade: https://www.allocine.fr/films/decennie-1980/
year: https://www.allocine.fr/films/decennie-1980/annee-1980/
Title: La Boum
Date: 17 décembre 1980
Duration: 1h 49min
Categories: Comédie,Drame,Romance
Directors: ['Claude Pinoteau']
Actors: ['Sophie Marceau', 'Brigitte Fossey', 'Claude Brasseur', 'Denise Grey', "Sheila O'Connor", 'Bernard Giraudeau', 'Dominique Lavanant', 'Jean-Pierre Castaldi']
Summary: Vic vit tranquillement entre le lycée, ses parents et Poupette, son arrière-grand-mère. Lorsque sa mère apprend l'existence d'une ancienne maîtresse de son mari, elle décide de "faire un break" mais du haut de ses 13 ans Vic ne pense qu'à sa première boum...
Thumbnail: https://fr.web.img3.acsta.net/c_310_420/medias/nmedia/18/62/90/68/18658418.jpg
c:\Users\Utilisateur\Documents\Block1\thumbnails\


### Ou bien nous pouvons entrer les url manuellement

In [4]:
# Hand-written listing URLs: the 1980s decade and, inside it, the year 1980.
url_decades = f'{url_films}decennie-1980/'
url_year = f'{url_decades}annee-1980/'

def getNumberOfPages(elt):
    """Return the last all-digit label found from *elt* through its following siblings.

    Args:
        elt: First pagination element; the walk follows ``find_next_sibling()``
            until it yields a falsy value.

    Returns:
        The last digit-only ``text`` as an int, or 0 when there is none.
    """
    def _walk(node):
        # Yield the start element, then each following sibling in turn.
        while node:
            yield node
            node = node.find_next_sibling()

    digit_labels = [node.text for node in _walk(elt) if node.text.isdigit()]
    return int(digit_labels[-1]) if digit_labels else 0

# Fetch the hand-picked year listing and read its page count two ways
# (last pagination span vs. getNumberOfPages) to check that both agree.
r = requests.get(url_year, auth=('user', 'pass'))
if r.status_code != 200:
    print("url_site error")

soup = BeautifulSoup(r.content, 'html.parser')
pagination = soup.find('div', class_='pagination-item-holder')
page_spans = pagination.find_all('span')
nb_pages = int(page_spans[-1].text)
assert getNumberOfPages(pagination.find('span')) == nb_pages
print(nb_pages)

47


### On scrape chaque page de films pour une année donnée

In [1]:
def scrapMoviePage(url_movie):
    """Print the details of one movie page, then the link of each of its rating blocks.

    The static fields (title, date, duration, categories, authors, actors,
    summary) are parsed with BeautifulSoup; the rating links are read with
    the shared Selenium ``driver``, which is navigated to *url_movie*.

    Args:
        url_movie: Absolute URL of the movie page.

    Returns:
        None. The function only prints.
    """
    # NOTE(review): the basic-auth credentials look like placeholders — confirm
    # whether the site needs them at all.
    r = requests.get(url_movie, auth=('user', 'pass'))
    soupMovie = BeautifulSoup(r.content, 'html.parser')
    print("Title:" , soupMovie.find('div', class_ = "titlebar-title titlebar-title-xl").text)

    elt = soupMovie.find('div', class_="meta-body-item meta-body-info")
    text = elt.get_text(strip=True)

    # Exactly three '|'-separated fields are expected: date, duration, genres.
    s1, s2, s3 = text.split('|')
    # The last 8 characters of the date field are dropped
    # (presumably the "en salle" suffix — confirm against the live page).
    date = s1[:-8].strip()
    print("Date:", date)
    duration = s2.strip()
    print("Duration:", duration)
    categories = s3.strip()
    print("Categories:", categories)

    elts = soupMovie.find_all('div', class_ = "meta-body-item meta-body-direction meta-body-oneline")
    # First credit line minus its two leading characters (presumably the
    # "De" label — confirm); empty when the page has no credit line.
    authors = [elts[0].text.strip()[2:].strip()] if elts else []

    if len(elts) > 1:
        for elt in elts[1].find_all('span'):
            # Spans carrying the 'light' class are skipped.
            if 'light' in elt.get('class', []):
                continue
            name = elt.get_text(strip=True)
            if name not in authors:
                authors.append(name)
    authors = ', '.join(authors)
    print("Authors:", authors)

    # Some pages have no actor line; print an empty value instead of crashing.
    elt = soupMovie.find('div', class_ = "meta-body-item meta-body-actor")
    actors = elt.get_text(strip=True)[4:] if elt is not None else ''
    print("Actors:", actors)

    elt = soupMovie.find('section', class_ = "section ovw ovw-synopsis")
    summary = elt.find('p', class_ = 'bo-p').text.strip()
    print("Summary:", summary)

    # Ratings: iterate the Selenium rating elements. The previous loop walked
    # `elts`, i.e. the BeautifulSoup credit blocks, which have no find_element.
    driver.get(url_movie)
    elts_rating = driver.find_elements(By.CLASS_NAME, 'rating-item')
    print(len(elts_rating))
    for elt_rating in elts_rating:
        # find_elements returns an empty list (no exception) when a rating
        # block holds no link.
        for elt_a in elt_rating.find_elements(By.TAG_NAME, 'a')[:1]:
            print(elt_a.get_attribute('href'))

movies = []

# Scrape every movie listed on result pages 1 and 2 of the selected year.
for page_number in (1, 2):
    url_year_page = f'{url_year}?page={page_number}'
    r = requests.get(url_year_page, auth=('user', 'pass'))
    if r.status_code != 200:
        print("url_site error")

    soup_page = BeautifulSoup(r.content, 'html.parser')
    for movie_entry in soup_page.find_all('li', class_='mdl'):
        # Each entry links to its movie page through its <h2><a href=...>.
        scrapMoviePage(url_site + movie_entry.h2.a.get('href'))

NameError: name 'url_year' is not defined

### On scrape les pays

In [None]:
# The element following the decade filter is taken as the country filter
# (TODO confirm on the live page). `elt_years` is the name the decade cell
# defines; the former `eltYears` is not defined anywhere in this notebook.
eltCountries = elt_years.find_next_sibling()
print(eltCountries)

In [25]:

# Probe a single movie page with Selenium: count its rating blocks, then show
# the text and the link target of the first one.
driver.get("https://www.allocine.fr/film/fichefilm_gen_cfilm=4403.html")

elts = driver.find_elements(By.CLASS_NAME, 'rating-item')
print(len(elts))
if elts:
    first_rating = elts[0]
    print(first_rating.text)
    first_link = first_rating.find_element(By.TAG_NAME, 'a')
    print(first_link.get_attribute('href'))

#elts_rating = elt.find_elements(By.XPATH, '/html/body/div[2]/main/section/div/div[3]/div[2]/div')
# for elt in elts_rating:
#     elt_temp = elt.find_element(By.CLASS_NAME, 'rating-item-content')
#     elt_a = driver.find_element(By.TAG_NAME, 'a')
    # elt_a = elt_temp.find_element(By.CLASS_NAME, 'xXx')
    # elt_a = elt_temp.find_element(By.CLASS_NAME, 'rating-title')
    # elt_a = elt.find_element(By.CLASS_NAME, 'xXx rating-title')
    # print(elt_a.get_attribute('href'))

2
Spectateurs
3,0
18413 notes, 225 critiques
https://www.allocine.fr/film/fichefilm-4403/critiques/spectateurs/


In [None]:
# quit() ends the whole WebDriver session (every window and the driver
# process); close() would only close the current window.
driver.quit()