## 1. Get webpage using *requests*

In [112]:
import requests

req = requests.get('https://en.wikipedia.org/wiki/Supervised_learning')

In [113]:
req

<Response [200]>

In [114]:
webpage = req.text

In [115]:
print(webpage)

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-language-alert-in-sidebar-enabled vector-feature-sticky-header-disabled vector-feature-page-tools-disabled vector-feature-page-tools-pinned-disabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>Supervised learning - Wikipedia</title>
<script>document.documentElement.className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-language-alert-in-sidebar-enabled vector-feature-sticky-header-disabled vector-feature-page-tools-disabled vector-feature-page-tools-pinned-disabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled";(function(){var cookie=document.cookie.m

## 2. Get specific contents using BeatifulSoup

In [116]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(webpage, 'html.parser')

### 2.1 Prettify the webpage

In [117]:
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-language-alert-in-sidebar-enabled vector-feature-sticky-header-disabled vector-feature-page-tools-disabled vector-feature-page-tools-pinned-disabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Supervised learning - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-language-alert-in-sidebar-enabled vector-feature-sticky-header-disabled vector-feature-page-tools-disabled vector-feature-page-tools-pinned-disabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled";(function(){var cookie

### 2.2 Get the first paragraph 

You can try to remove "attrs" to see how it works.

In [118]:
paragraphs = soup.find_all('p', attrs={"class":False})

In [119]:
paragraph = paragraphs[1]

In [120]:
paragraph

<p><b>Supervised learning (SL)</b> is a <a href="/wiki/Machine_learning" title="Machine learning">machine learning</a> paradigm for problems where the available data consists of labelled examples, meaning that each data point contains features (covariates) and an associated label. The goal of supervised learning algorithms is learning a function that <a href="/wiki/Map_(mathematics)" title="Map (mathematics)">maps</a> feature vectors (inputs) to labels (output), based on example input-output pairs.<sup class="reference" id="cite_ref-1"><a href="#cite_note-1">[1]</a></sup> It infers a function from <i><style data-mw-deduplicate="TemplateStyles:r1023754711">.mw-parser-output .vanchor>:target~.vanchor-text{background-color:#b1d2ff}</style><span class="vanchor"><span id="labeled_[[training_set|training_data]]"></span><span id="LABELLED_DATA"></span><span class="vanchor-text">labeled <a class="mw-redirect" href="/wiki/Training_set" title="Training set">training data</a></span></span></i> co

### 2.3 Get all the links in this paragraph which point to other webpages

In [121]:
data = {"title":[], "href":[]}
for link in paragraph.find_all('a', attrs={"title":True}):
    data["title"].append(link["title"])
    data["href"].append(link["href"])

In [122]:
import pandas as pd
df = pd.DataFrame(data)

In [123]:
df

Unnamed: 0,title,href
0,Machine learning,/wiki/Machine_learning
1,Map (mathematics),/wiki/Map_(mathematics)
2,Training set,/wiki/Training_set
3,Inductive bias,/wiki/Inductive_bias
4,Generalization error,/wiki/Generalization_error


## 3. Get the contents from all the webpages

In [124]:
webpages = []
head = "https://en.wikipedia.org"
for href in data["href"]:
    link = head + href
    req = requests.get(link)
    webpage = req.text
    webpages.append(webpage)

In [125]:
len(webpages)

5

## 4. Repeat the processes to get more data

In [126]:
for webpage in webpages:
    soup = BeautifulSoup(webpage, 'html.parser')
    paragraphs = soup.find_all('p', attrs={"class":False})
    paragraph = paragraphs[1]
    for link in paragraph.find_all('a', attrs={"title":True}):
        data["title"].append(link["title"])
        data["href"].append(link["href"])

In [127]:
webpages = []
head = "https://en.wikipedia.org"
for href in data["href"]:
    link = head + href
    req = requests.get(link)
    webpage = req.text
    webpages.append(webpage)

In [128]:
for webpage in webpages:
    soup = BeautifulSoup(webpage, 'html.parser')
    paragraphs = soup.find_all('p', attrs={"class":False})
    paragraph = paragraphs[1]
    for link in paragraph.find_all('a', attrs={"title":True}):
        data["title"].append(link["title"])
        data["href"].append(link["href"])

In [129]:
df = pd.DataFrame(data)

In [130]:
df

Unnamed: 0,title,href
0,Machine learning,/wiki/Machine_learning
1,Map (mathematics),/wiki/Map_(mathematics)
2,Training set,/wiki/Training_set
3,Inductive bias,/wiki/Inductive_bias
4,Generalization error,/wiki/Generalization_error
...,...,...
156,Automated reasoning,/wiki/Automated_reasoning
157,Automation,/wiki/Automation
158,Alan Turing,/wiki/Alan_Turing
159,Skill,/wiki/Skill


## 5. Save df to csv

In [18]:
df.to_csv('Sample_data.csv', index=False)