## Data Scraping using beautiful soup
- import beautiful soup
- Make a GET request to fetch page data
- Parse HTML
- Filter relevant paths

In [22]:
# !pip install bs4

In [23]:
import requests
from urllib.request import urlopen

requests.packages.urllib3.disable_warnings()
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # Legacy Python that doesn't verify HTTPS certificates by default
    pass
else:
    # Handle target environment that doesn't support HTTPS verification
    ssl._create_default_https_context = _create_unverified_https_context

In [24]:
android_url = "https://en.wikipedia.org/wiki/Android_version_history"

In [25]:
android_data = urlopen(android_url)
print(type(android_data))

<class 'http.client.HTTPResponse'>


In [26]:
android_html = android_data.read()
android_data.close() # Close the response after receiving the data

In [27]:
# android_html # Entire html of wiki

## 2. parsing data



In [28]:
from bs4 import BeautifulSoup as soup

In [29]:
android_soup = soup(android_html,'html.parser')

In [30]:
# print(android_soup)

In [31]:
print(type(android_soup))

<class 'bs4.BeautifulSoup'>


In [32]:
android_soup.h1

<h1 class="firstHeading" id="firstHeading" lang="en">Android version history</h1>

In [33]:
android_soup.find_all('h1',{})

[<h1 class="firstHeading" id="firstHeading" lang="en">Android version history</h1>]

In [34]:
tables = android_soup.find_all('table',{'class' :'wikitable'})

In [35]:
len(tables)

31

In [36]:
android_table = tables[0]

In [37]:
android_table

<table class="wikitable">
<tbody><tr>
<th>Name
</th>
<th>Version number(s)
</th>
<th>Initial stable<br/>release date
</th>
<th>Supported (security fixes)
</th>
<th>API level
</th>
<th>References
</th></tr>
<tr>
<td rowspan="2">No official codename
</td>
<td>1.0
</td>
<td>September 23, 2008
</td>
<td class="table-no" style="background:#F99;vertical-align:middle;text-align:center;">No
</td>
<td>1
</td>
<td><sup class="reference" id="cite_ref-unofficial_and_official_codenames_9-1"><a href="#cite_note-unofficial_and_official_codenames-9">[9]</a></sup>
</td></tr>
<tr>
<td>1.1
</td>
<td>February 9, 2009
</td>
<td class="table-no" style="background:#F99;vertical-align:middle;text-align:center;">No
</td>
<td>2
</td>
<td><sup class="reference" id="cite_ref-unofficial_and_official_codenames_9-2"><a href="#cite_note-unofficial_and_official_codenames-9">[9]</a></sup><sup class="reference" id="cite_ref-14"><a href="#cite_note-14">[14]</a></sup>
</td></tr>
<tr>
<td><a href="/wiki/Android_Cupcake" ti

## 3. Extracting Useful information
- Remove undesired tags
- Extract table header and data

In [38]:
headers = android_table.find_all('th')

In [39]:
print(len(headers))

6


In [40]:
print(headers[0].text)

Name



In [41]:
column_titles = [ct.text[:-1] for ct in headers]

In [42]:
column_titles

['Name',
 'Version number(s)',
 'Initial stablerelease date',
 'Supported (security fixes)',
 'API level',
 'References']

In [43]:
rows_data = android_table.find_all('tr',{})[1:]

In [44]:
print(len(rows_data))

18


In [52]:
# first_row = rows_data[0].find_all('td',{})
# for d in first_row:
#     print(d.text)

<tr>
<td>1.1
</td>
<td>February 9, 2009
</td>
<td class="table-no" style="background:#F99;vertical-align:middle;text-align:center;">No
</td>
<td>2
</td>
<td><sup class="reference" id="cite_ref-unofficial_and_official_codenames_9-2"><a href="#cite_note-unofficial_and_official_codenames-9">[9]</a></sup><sup class="reference" id="cite_ref-14"><a href="#cite_note-14">[14]</a></sup>
</td></tr>

In [46]:
table_row = []
for row in rows_data:
    current_row = []
    row_data = row.find_all('td',{})
    for idx, data in enumerate(row_data):
        if idx == 1:
            current_row.append(data.text[:-1].split(': ')[-1])
        elif idx != 0 and idx != 3:
            current_row.append(data.text[:-1])
        else:
            current_row.append(data.text[:-1])

    table_row.append(current_row)

In [47]:
table_row

[['No official codename', '1.0', 'September 23, 2008', 'No', '1', '[9]'],
 ['1.1', 'February 9, 2009', 'No', '2', '[9][14]'],
 ['Cupcake', '1.5', 'April 27, 2009', 'No', '3', '[15]'],
 ['Donut', '1.6', 'September 15, 2009', 'No', '4', '[16]'],
 ['Eclair', '2.0 – 2.1', 'October 26, 2009', 'No', '5 – 7', '[17]'],
 ['Froyo', '2.2 – 2.2.3', 'May 20, 2010', 'No', '8', '[18]'],
 ['Gingerbread', '2.3 – 2.3.7', 'December 6, 2010', 'No', '9 – 10', '[19]'],
 ['Honeycomb', '3.0 – 3.2.6', 'February 22, 2011', 'No', '11 – 13', '[20]'],
 ['Ice Cream Sandwich',
  '4.0 – 4.0.4',
  'October 18, 2011',
  'No',
  '14 – 15',
  '[21]'],
 ['Jelly Bean', '4.1 – 4.3.1', 'July 9, 2012', 'No', '16 – 18', '[22]'],
 ['KitKat', '4.4 – 4.4.4', 'October 31, 2013', 'No', '19 – 20', '[23]'],
 ['Lollipop', '5.0 – 5.1.1', 'November 12, 2014', 'No', '21 – 22', '[24]'],
 ['Marshmallow', '6.0 – 6.0.1', 'October 5, 2015', 'No', '23', '[25]'],
 ['Nougat',
  '7.0 – 7.1.2',
  'August 22, 2016',
  'No',
  '24 – 25',
  '[26][27]

## Writing and reading csv
- CSV stands for comma separated file

In [48]:
filename = 'android_version_history1.csv'
with open(filename,'w') as f:
    # Write the header
    header_string = ','.join(column_titles)
    header_string +='\n'
    f.write(header_string)

    for row in table_row[:-1]:
        row_string = ','.join(row)
        row_string += '\n'
        f.write(row_string)

## Cleaning_data
- Remove unwanted commas and symbols
- Undesired information

In [49]:
import pandas as pd

In [50]:
df = pd.read_csv('android_version_history1.csv')

In [51]:
df

Unnamed: 0,Name,Version number(s),Initial stablerelease date,Supported (security fixes),API level,References
No official codename,1.0,September 23,2008,No,1,[9]
1.1,February 9,2009,No,2,[9][14],
Cupcake,1.5,April 27,2009,No,3,[15]
Donut,1.6,September 15,2009,No,4,[16]
Eclair,2.0 – 2.1,October 26,2009,No,5 – 7,[17]
Froyo,2.2 – 2.2.3,May 20,2010,No,8,[18]
Gingerbread,2.3 – 2.3.7,December 6,2010,No,9 – 10,[19]
Honeycomb,3.0 – 3.2.6,February 22,2011,No,11 – 13,[20]
Ice Cream Sandwich,4.0 – 4.0.4,October 18,2011,No,14 – 15,[21]
Jelly Bean,4.1 – 4.3.1,July 9,2012,No,16 – 18,[22]


## Loading Local files

In [53]:
with open('android.html',encoding='utf-8') as f:
    page_soup = soup(f,'html.parser')

FileNotFoundError: [Errno 2] No such file or directory: 'android.html'

In [54]:
page_soup.find_all('h1')

NameError: name 'page_soup' is not defined

In [55]:
page_soup.find_all('table')

NameError: name 'page_soup' is not defined