# Data Scraping using Beautiful soup

- Import Beautiful Soup
- Make a GET request to fetch Page Data
- Parse HTML
- Filter Relvant parts

#### Installation

In [3]:
# !pip install bs4

In [4]:
from urllib.request import urlopen

In [5]:
android_url = "https://en.wikipedia.org/wiki/Android_version_history"

In [6]:
android_data = urlopen(android_url)
print(type(android_data))

<class 'http.client.HTTPResponse'>


In [None]:
android_html = android_data.read()
print(android_html)

In [8]:
android_data.close()

### 2. Parsing Data

In [9]:
from bs4 import BeautifulSoup as soup

In [None]:
android_soup = soup(android_html, 'html.parser')
print(android_soup)

In [11]:
print(type(android_soup))

<class 'bs4.BeautifulSoup'>


In [12]:
# findAll(parameter1, parameter2)
# parameter1 -> select the HTML tag
# parameter2 -> select the class and the id for that HTML tag
android_soup.findAll('h1', {})

[<h1 class="firstHeading mw-first-heading" id="firstHeading">Android version history</h1>]

In [13]:
tables = android_soup.findAll('table', {'class' : 'wikitable'})
print(len(tables))

33


In [None]:
android_table = tables[0]
print(android_table)

### 3. Extracting Useful Information
- Remove undesired tags
- Extract table header & data

In [15]:
headers = android_table.findAll('th')
print(len(headers))

7


In [23]:
column_titles = [ct.text[:-1] for ct in headers]
print(column_titles)

['Name', 'Internal codename[9]', 'Version number(s)', 'APIlevel', 'Initial stablerelease date', 'Latest security patch date[14]', 'Latest Google Play Services version[15](release date)']


In [30]:
row_data = android_table.findAll('tr')[1:]
# print(len(row_data))
first_row = row_data[0].findAll('td', {})
for d in first_row:
    print(d.text[:-1])

34
Android 1.0
—
Old version, no longer maintained: 1.0
1
September 23, 2008
—
—


In [34]:
table_rows = []
for row in row_data:
    current_row = []
    row_data = row.findAll('td', {})
    for idx, data in enumerate(row_data):
        if idx != 0 and idx != 3:
            current_row.append(data.text[:-1])
        else:
            current_row.append(data.text)
    table_rows.append(current_row)

In [35]:
print(table_rows)

[]
