In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
# Download the sample page. The timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status() fails loudly on an HTTP error
# instead of letting later cells parse an error page.
page = requests.get(
    "http://dataquestio.github.io/web-scraping-pages/simple.html",
    timeout=10,
)
page.raise_for_status()

In [3]:
# Parse the downloaded bytes with Python's built-in HTML parser.
soup = BeautifulSoup(markup=page.content, features="html.parser")

In [4]:
# Show the parsed document as indented HTML.
pretty_html = soup.prettify()
print(pretty_html)

<!DOCTYPE html>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <p>
   Here is some simple content for this page.
  </p>
 </body>
</html>



In [5]:
# find() returns a single matching <p> tag.
soup.find(name="p")

<p>Here is some simple content for this page.</p>

In [6]:
# find_all() returns the matching <p> tags as a list, even when there is only one.
soup.find_all(name="p")

[<p>Here is some simple content for this page.</p>]

In [7]:
# Text content of the <p> tag returned by find().
paragraph = soup.find("p")
paragraph.get_text()

'Here is some simple content for this page.'

In [8]:
# Same text, with leading and trailing whitespace removed.
paragraph_text = soup.find("p").get_text()
paragraph_text.strip()

'Here is some simple content for this page.'

In [9]:
# Same text again, reached by indexing into the find_all() result.
paragraphs = soup.find_all("p")
paragraphs[0].get_text()

'Here is some simple content for this page.'

In [10]:
# Fetch a second sample page whose tags carry id and class attributes.
# NOTE: this rebinds `page` and `soup`, so the cells below query this page.
page = requests.get(
    "http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html",
    timeout=10,  # do not hang forever on a stalled connection
)
page.raise_for_status()  # stop here on an HTTP error rather than parse an error page

# Name the parser explicitly (as the first page does) so the parse result does
# not depend on which optional parsers happen to be installed.
soup = BeautifulSoup(page.content, "html.parser")
soup.find_all("p")

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>,
 <p class="inner-text">
                 Second paragraph.
             </p>,
 <p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>,
 <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [11]:
# Show the second page's parsed document as indented HTML.
pretty_html = soup.prettify()
print(pretty_html)

<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <div>
   <p class="inner-text first-item" id="first">
    First paragraph.
   </p>
   <p class="inner-text">
    Second paragraph.
   </p>
  </div>
  <p class="outer-text first-item" id="second">
   <b>
    First outer paragraph.
   </b>
  </p>
  <p class="outer-text">
   <b>
    Second outer paragraph.
   </b>
  </p>
 </body>
</html>



In [12]:
# Stripped text of the first <p> matched by the "outer-text" class filter.
outer_paragraphs = soup.find_all("p", class_="outer-text")
outer_paragraphs[0].get_text().strip()

'First outer paragraph.'

In [13]:
# Combine two attribute filters: class "outer-text" and id "second".
matching_paragraphs = soup.find_all("p", class_="outer-text", id="second")
matching_paragraphs[0].get_text().strip()

'First outer paragraph.'

In [14]:
# Look a tag up by its id attribute alone, whatever its tag name.
soup.find(attrs={"id": "first"})

<p class="inner-text first-item" id="first">
                First paragraph.
            </p>

In [15]:
# Fetch the French Wikipedia article on data science and parse it.
# NOTE(review): Wikimedia asks clients to send a descriptive User-Agent. If the
# request is rejected, raise_for_status() below will surface it; confirm and
# pass a `headers=` argument if needed.
wikipedia_DS_url = "https://fr.wikipedia.org/wiki/Science_des_donn%C3%A9es"
wiki_raw = requests.get(wikipedia_DS_url, timeout=10)  # never hang indefinitely
wiki_raw.raise_for_status()  # fail here rather than parse an error page

# Name the parser explicitly so the result does not depend on which optional
# parsers are installed. This rebinds `soup` to the Wikipedia document.
soup = BeautifulSoup(wiki_raw.content, "html.parser")

# Preview only the first 1000 characters; the full page is very large.
str(soup)[:1000]

'<!DOCTYPE html>\n<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-0 vector-feature-client-preferences-disabled vector-feature-client-prefs-pinned-disabled vector-feature-night-mode-disabled skin-night-mode-clientpref-0 vector-toc-available" dir="ltr" lang="fr">\n<head>\n<meta charset="utf-8"/>\n<title>Science des données — Wikipédia</title>\n<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-dis

In [16]:
# find() is enough here (no need for find_all) because the page has only one <main>.
main_soup = soup.find(name="main")

# Preview the first 1000 characters of the <main> element's markup.
main_html = str(main_soup)
main_html[:1000]

'<main class="mw-body" id="content" role="main">\n<header class="mw-body-header vector-page-titlebar">\n<nav aria-label="Sommaire" class="vector-toc-landmark" role="navigation">\n<div class="vector-dropdown vector-page-titlebar-toc vector-button-flush-left" id="vector-page-titlebar-toc">\n<input aria-haspopup="true" aria-label="Basculer la table des matières" class="vector-dropdown-checkbox" data-event-name="ui.dropdown-vector-page-titlebar-toc" id="vector-page-titlebar-toc-checkbox" role="button" type="checkbox"/>\n<label aria-hidden="true" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only" for="vector-page-titlebar-toc-checkbox" id="vector-page-titlebar-toc-label"><span class="vector-icon mw-ui-icon-listBullet mw-ui-icon-wikimedia-listBullet"></span>\n<span class="vector-dropdown-label-text">Basculer la table des matières</span>\n</label>\n<div class="vector-dropdown-content">\n<div class="v

In [18]:
# Collect every <a> tag found inside the <main> element.
links = main_soup.find_all(name="a")

In [19]:
# Keep only the anchors that carry both an href and a title attribute.
links = main_soup.find_all("a", attrs={"href": True, "title": True})
links[:10]

[<a class="interlanguage-link-target" href="https://ar.wikipedia.org/wiki/%D8%B9%D9%84%D9%85_%D8%A7%D9%84%D8%A8%D9%8A%D8%A7%D9%86%D8%A7%D8%AA" hreflang="ar" lang="ar" title="علم البيانات – arabe"><span>العربية</span></a>,
 <a class="interlanguage-link-target" href="https://az.wikipedia.org/wiki/Veril%C9%99nl%C9%99r_elmi" hreflang="az" lang="az" title="Verilənlər elmi – azerbaïdjanais"><span>Azərbaycanca</span></a>,
 <a class="interlanguage-link-target" href="https://bg.wikipedia.org/wiki/%D0%9D%D0%B0%D1%83%D0%BA%D0%B0_%D0%B7%D0%B0_%D0%B4%D0%B0%D0%BD%D0%BD%D0%B8%D1%82%D0%B5" hreflang="bg" lang="bg" title="Наука за данните – bulgare"><span>Български</span></a>,
 <a class="interlanguage-link-target" href="https://bn.wikipedia.org/wiki/%E0%A6%89%E0%A6%AA%E0%A6%BE%E0%A6%A4%E0%A7%8D%E0%A6%A4_%E0%A6%AC%E0%A6%BF%E0%A6%9C%E0%A7%8D%E0%A6%9E%E0%A6%BE%E0%A6%A8" hreflang="bn" lang="bn" title="উপাত্ত বিজ্ঞান – bengali"><span>বাংলা</span></a>,
 <a class="interlanguage-link-target" href="https://ca.wi

In [20]:
# str.startswith() reports whether a string begins with the given prefix.
"Bonjour".startswith("a")

False

In [21]:
def _is_wiki_path(href):
    """Return a truthy value when href is non-empty and starts with "/wiki/"."""
    # The `href and ...` guard short-circuits on a missing or empty value.
    return href and href.startswith("/wiki/")


# Keep anchors that have a title and whose href passes the predicate above.
links = main_soup.find_all("a", href=_is_wiki_path, title=True)

In [22]:
# Print the first five filtered anchors.
first_links = links[:5]
print(first_links)

[<a accesskey="c" href="/wiki/Science_des_donn%C3%A9es" title="Voir le contenu de la page [c]"><span>Article</span></a>, <a accesskey="t" href="/wiki/Discussion:Science_des_donn%C3%A9es" rel="discussion" title="Discussion au sujet de cette page de contenu [t]"><span>Discussion</span></a>, <a accesskey="j" href="/wiki/Sp%C3%A9cial:Pages_li%C3%A9es/Science_des_donn%C3%A9es" title="Liste des pages liées qui pointent sur celle-ci [j]"><span>Pages liées</span></a>, <a accesskey="k" href="/wiki/Sp%C3%A9cial:Suivi_des_liens/Science_des_donn%C3%A9es" rel="nofollow" title="Liste des modifications récentes des pages appelées par celle-ci [k]"><span>Suivi des pages liées</span></a>, <a accesskey="u" href="/wiki/Aide:Importer_un_fichier" title="Téléverser des fichiers [u]"><span>Téléverser un fichier</span></a>]


In [24]:
# Extract the href attribute of every filtered anchor.
# A comprehension replaces the manual append loop and avoids the ambiguous
# single-letter loop variable `l`.
liens = [link["href"] for link in links]

In [25]:
# Preview the first five collected hrefs.
liens[0:5]

['/wiki/Science_des_donn%C3%A9es',
 '/wiki/Discussion:Science_des_donn%C3%A9es',
 '/wiki/Sp%C3%A9cial:Pages_li%C3%A9es/Science_des_donn%C3%A9es',
 '/wiki/Sp%C3%A9cial:Suivi_des_liens/Science_des_donn%C3%A9es',
 '/wiki/Aide:Importer_un_fichier']

In [33]:
from urllib.parse import unquote

# Turn percent-encoded sequences (e.g. %C3%A9) back into readable characters.
decoded_urls = [unquote(url) for url in liens]

# Print the first five decoded paths. Slicing replaces the manual
# enumerate-counter-and-break pattern.
for url in decoded_urls[:5]:
    print(url)

/wiki/Science_des_données
/wiki/Discussion:Science_des_données
/wiki/Spécial:Pages_liées/Science_des_données
/wiki/Spécial:Suivi_des_liens/Science_des_données
/wiki/Aide:Importer_un_fichier
