In [3]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

### In case there are many tables in the page

In [4]:
url = "https://pt.wikipedia.org/wiki/Lista_de_bairros_de_Manaus"
content = requests.get(url).text  #download the web page contents
soup = BeautifulSoup(content, 'html.parser')
tables = soup.find_all('table')

In [5]:
[obj.get("class") for obj in tables] #show all table classes

[['box-Desatualizado', 'plainlinks', 'metadata', 'ambox', 'ambox-content'],
 ['wikitable', 'sortable'],
 ['nowraplinks', 'collapsible', 'collapsed', 'navbox-inner']]

In [11]:
table = soup.find('table', class_='wikitable sortable')  # Create object of our table accoring to its HTML_class
df = pd.DataFrame(columns=['Neighborhood', 'Zone', 'Area', 'Population', 'Density', 'Homes_count'])

<img src="../json_api_requests_bs4/table.JPG">

In [12]:
for row in table.tbody.find_all('tr'):    
    columns = row.find_all('td')
    
    if(columns != []):
        neighborhood = columns[0].text.strip()
        zone = columns[1].text.strip()
        area = columns[2].span.contents[0].strip('&0.')
        population = columns[3].span.contents[0].strip('&0.')
        density = columns[4].span.contents[0].strip('&0.')
        homes_count = columns[5].span.contents[0].strip('&0.')

        df = df.append({'Neighborhood': neighborhood,  'Zone': zone, 
                        'Area': area, 'Population': population, 'Density': density, 'Homes_count': homes_count}, ignore_index=True)
df.head()

Unnamed: 0,Neighborhood,Zone,Area,Population,Density,Homes_count
0,Adrianópolis,Centro-Sul,248.45,10459,3560.88,3224
1,Aleixo,Centro-Sul,618.34,24417,3340.4,6101
2,Alvorada,Centro-Oeste,553.18,76392,11681.73,18193
3,Armando Mendes,Leste,307.65,33441,9194.86,7402
4,Betânia,Sul,52.51,1294,20845.55,3119


### Pandas fast way

In [13]:
df_pandas = pd.read_html(url, attrs = {'class': 'wikitable sortable'},  flavor='bs4', thousands ='.')
df_pandas[0].head()

Unnamed: 0,Bairro[2],Zona administrativa,Área (ha)[2],População (estimativa 2017)[2],Densidade Demográfica (hab./km²),Domicílios particulares[2]
0,Adrianópolis,Centro-Sul,24845,10 459,"3 560,88",3 224
1,Aleixo,Centro-Sul,61834,24 417,"3 340,40",6 101
2,Alvorada,Centro-Oeste,55318,76 392,"11 681,73",18 193
3,Armando Mendes,Leste,30765,33 441,"9 194,86",7 402
4,Betânia,Sul,5251,12 940,"20 845,55",3 119


### Second Case: the data is grouped together under one column that indicates the postal code

In [15]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
content = requests.get(url).text
soup = BeautifulSoup(content, 'html5lib')
table = soup.find('table') # There is just one table in this page

In [16]:
contents = []

# reject empty columns and extract the contents from the paragraph 'p' and the span
for row in table.find_all('td'):
    cell = {}
    if row.span.text == 'Not assigned':  # ignore empty column
        pass
    else:
        cell['PostalCode'] = row.p.text[:3]
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        contents.append(cell)

In [17]:
df = pd.DataFrame(contents)
df.tail()

Unnamed: 0,PostalCode,Borough,Neighborhood
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East TorontoBusiness reply mail Processing Cen...,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."
102,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [18]:
# Changing some values to more comprehensive names
df['Borough']=df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                     'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                     'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                     'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})
df.tail()

Unnamed: 0,PostalCode,Borough,Neighborhood
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto Business,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."
102,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


## Resource: https://medium.com/geekculture/web-scraping-tables-in-python-using-beautiful-soup-8bbc31c5803e

## NEW Amazon Resource: https://www.datacamp.com/community/tutorials/amazon-web-scraping-using-beautifulsoup