## Retrieving Data from the Web

#### Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import re
from pprint import pprint 

#### Fetch the page and parse it with BeautifulSoup

In [2]:
url = "https://en.wikipedia.org/wiki/Madison,_Wisconsin"
# requests waits indefinitely by default; a timeout keeps the notebook from
# hanging if the server never answers
res = requests.get(url, timeout=10)

# NOTE(review): no parser is named, so BeautifulSoup picks the best one that is
# installed; pin one (e.g. "html.parser") if the tree must be reproducible
parsed = BeautifulSoup(res.text)

#### Some operations with BeautifulSoup

In [3]:
# get the <title> tag of the page
parsed.title

# get the name of that tag
parsed.title.name

# get the text held inside the tag
parsed.title.string

# find all 'a' and 'b' tags (a list matches any of the listed tag names)
parsed.find_all(['a', 'b'])

# or using regex (finds all tags whose name matches the given regex,
# here every tag name that starts with 'b')
parsed.find_all(re.compile(r'^b'))
# or use a function
def has_class_no_id(tag):
    '''
        Filter for find_all: accept a tag that carries a 'class' attribute
        but no 'id' attribute.
    '''
    if not tag.has_attr('class'):
        return False
    return not tag.has_attr('id')

parsed.find_all(has_class_no_id)

# get links of all 'a' tags
for link in parsed.find_all('a'):
    href = link.get('href')
    # anchors without an href give None, so check before the substring test
    if href is not None and 'https' in href:
        pass  # replace with print(href) to list the links

# get first element which has a tag 'p' and get its class
parsed.p['class']

# get iterator of children of table
parsed.table.children

# get the first item from the generator of descendants of table
next(parsed.table.descendants)

# get a parent
parsed.p.parent

# get a tree structure of parents
# (.parents walks up through every ancestor; iterating .parent would loop over
# the children of the direct parent instead)
for parent in parsed.p.parents:
    pass  # replace with print(parent.name) to show the chain of ancestors

# get siblings
parsed.p.previous_sibling
parsed.p.next_sibling

# get text
pprint(parsed.table.get_text())

('Madison, WisconsinState capital cityA view of the skyline of the Madison '
 'Isthmus and Lake Mendota from Picnic Point\n'
 'FlagSealLogoNickname(s):\xa0"Madtown", "Mad City", "The City of Four Lakes", '
 '"77 Square Miles Surrounded by Reality"[1]Location of Madison in Dane '
 'County, Wisconsin.Madison, WisconsinLocation in Wisconsin, United States, '
 'and North AmericaShow map of WisconsinMadison, WisconsinMadison, Wisconsin '
 '(the United States)Show map of the United StatesMadison, WisconsinMadison, '
 'Wisconsin (North America)Show map of North AmericaCoordinates: 43°04′29″N '
 '89°23′03″W\ufeff / \ufeff43.07472°N 89.38417°W\ufeff / 43.07472; '
 '-89.38417Coordinates: 43°04′29″N 89°23′03″W\ufeff / \ufeff43.07472°N '
 '89.38417°W\ufeff / 43.07472; -89.38417CountryUnited '
 'StatesStateWisconsinCountyDaneMunicipalityCityFounded1836Chartered1846Incorporated1856Named '
 'forJames MadisonGovernment\xa0•\xa0MayorSatya Rhodes-Conway (D)\xa0•\xa0'
 'BodyMadison Common CouncilArea[2]\

#### Find a table of temperature

In [4]:
def get_temp_table(tag):
    '''
        Filter for find_all: accept a <table> tag whose text contains
        "Climate data for".
    '''
    if tag.name != 'table':
        return False
    return "Climate data for" in tag.get_text()
    
# take the first table on the page accepted by the get_temp_table filter
temp_table = parsed.find_all(get_temp_table)[0]

# .children and .parents are iterated lazily, so materialise them to count
children_num = len(list(temp_table.children))
parents_num = len(list(temp_table.parents))

print("Table title: ", temp_table.title)
print("Number of chidren of this table: ", children_num)
print("Number of parents of this table: ", parents_num)
print("Children's tags: ", [node.name for node in temp_table.children])

Table title:  None
Number of chidren of this table:  2
Number of parents of this table:  8
Children's tags:  [None, 'tbody']


#### Get the temperature as a table

In [5]:
def retrieve_climate_table(url):
    '''
        Download a page and return its "Climate data for" table.

        url -- address of the page to download

        Returns the first table accepted by get_temp_table whose row at
        contents[1].contents[4] is labelled "Record high", or None when the
        response status is not 2xx (the status code is printed) or when no
        such table exists on the page.

        Raises ValueError('Wrong url') when the request itself fails.
    '''

    try:
        # requests waits indefinitely by default; bound the wait
        req = requests.get(url, timeout=10)
    except requests.exceptions.RequestException as err:
        # only request failures are translated, so KeyboardInterrupt and
        # programming errors are no longer swallowed
        raise ValueError('Wrong url') from err

    # anything outside 2xx means the page was not retrieved
    if not 200 <= req.status_code < 300:
        print("Error code: ", req.status_code)
        return None

    soup = BeautifulSoup(req.text)

    for table in soup.find_all(get_temp_table):
        try:
            first_row_label = table.contents[1].contents[4].contents[1].get_text()
        except (IndexError, AttributeError):
            # this table is laid out differently; try the next candidate
            continue
        if "Record high" in first_row_label:
            return table

    # nothing has been found
    return None

# Example: fetch the Los Angeles climate table and display the element at
# contents[1].contents[4], the position retrieve_climate_table inspects for "Record high"
retrieve_climate_table('https://en.wikipedia.org/wiki/Los_Angeles').contents[1].contents[4]

<tr style="text-align: center;">
<th scope="row" style="height: 16px;">Record high °F (°C)
</th>
<td style="background: #FF2C00; color:#000000;">95<br/>(35)
</td>
<td style="background: #FF2C00; color:#000000;">95<br/>(35)
</td>
<td style="background: #FF1F00; color:#000000;">99<br/>(37)
</td>
<td style="background: #FF0300; color:#FFFFFF;">106<br/>(41)
</td>
<td style="background: #FF1100; color:#FFFFFF;">103<br/>(39)
</td>
<td style="background: #DC0000; color:#FFFFFF;">112<br/>(44)
</td>
<td style="background: #EA0000; color:#FFFFFF;">109<br/>(43)
</td>
<td style="background: #FF0300; color:#FFFFFF;">106<br/>(41)
</td>
<td style="background: #CE0000; color:#FFFFFF;">113<br/>(45)
</td>
<td style="background: #F80000; color:#FFFFFF;">108<br/>(42)
</td>
<td style="background: #FF1800; color:#FFFFFF;">100<br/>(38)
</td>
<td style="background: #FF3A00; color:#000000;">92<br/>(33)
</td>
<td style="background: #CE0000; color:#FFFFFF; border-left-width:medium">113<br/>(45)
</td></tr>

#### Return a list of row names of a "Climate data for" table

In [6]:
def list_climate_table_row_names(url):
    '''
        Return the row labels of the "Climate data for" table found at url.

        url -- address of the page to download

        Returns a list with one label per row, trailing whitespace removed,
        or None when retrieve_climate_table finds no table. Rows whose text
        contains "Source" are left out.
    '''

    # use previous function to get the table
    table = retrieve_climate_table(url)

    # if no table was found
    if table is None:
        return None

    list_of_row_names = []

    # rows are read from index 4 of the table's second child onwards
    for row in table.contents[1].contents[4:]:
        # text nodes between the rows are str subclasses and have no cells;
        # the row mentioning "Source" is a footer, not data
        if isinstance(row, str) or "Source" in row.get_text():
            continue
        list_of_row_names.append(row.contents[1].get_text().rstrip())

    return list_of_row_names

# Example: list the row names of the climate table on the Boston page
list_climate_table_row_names('https://en.wikipedia.org/wiki/Boston')

['Record high °F (°C)',
 'Mean maximum °F (°C)',
 'Average high °F (°C)',
 'Daily mean °F (°C)',
 'Average low °F (°C)',
 'Mean minimum °F (°C)',
 'Record low °F (°C)',
 'Average precipitation inches (mm)',
 'Average snowfall inches (cm)',
 'Average precipitation days (≥ 0.01 in)',
 'Average snowy days (≥ 0.1 in)',
 'Average relative humidity (%)',
 'Average dew point °F (°C)',
 'Mean monthly sunshine hours',
 'Percent possible sunshine',
 'Average ultraviolet index']

#### Get the value of a specified row name from a "Climate data for" table

In [7]:
def get_value_from_row(url, row_name, column_name):
    '''
        Return one cell of the "Climate data for" table found at url.

        url         -- address of the page to download
        row_name    -- row label as it reads once trailing whitespace is
                       stripped, e.g. 'Daily mean °F (°C)'
        column_name -- one of 'Jan' ... 'Dec' or 'Year'

        Returns the cell text with trailing whitespace removed. Returns None
        when retrieve_climate_table finds no table. Returns the string
        'No column name' when no row is labelled row_name, or when the row
        exists but column_name is not one of the names listed above.
    '''

    columns = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec', 'Year')
    # Within a row, index 1 holds the label cell and consecutive cells are
    # separated by '\n' text nodes, so the value cells sit at 3, 5, ..., 27.
    column_name_dic = {name: 2 * position + 3 for position, name in enumerate(columns)}

    # use previous function to get the table
    table = retrieve_climate_table(url)

    # if no table was found
    if table is None:
        return None

    # rows are read from index 4 of the table's second child onwards
    for row in table.contents[1].contents[4:]:
        # text nodes between the rows are str subclasses and have no cells;
        # the row mentioning "Source" is a footer, not data
        if isinstance(row, str) or "Source" in row.get_text():
            continue
        if row.contents[1].get_text().rstrip() != row_name:
            continue

        column_index = column_name_dic.get(column_name)
        if column_index is None:
            # unknown column: answer with the sentinel instead of a KeyError
            return 'No column name'
        return row.contents[column_index].get_text().rstrip()

    # no row carries that label (sentinel text kept as-is for existing callers)
    return 'No column name'

# December daily-mean temperature for three cities; the row label differs
# because some pages list °C first and others °F first
wiki_url = 'https://en.wikipedia.org/wiki/'
print(get_value_from_row(f'{wiki_url}Winnipeg', 'Daily mean °C (°F)', 'Dec'))
print(get_value_from_row(f'{wiki_url}Warsaw', 'Daily mean °C (°F)', 'Dec'))
print(get_value_from_row(f'{wiki_url}Los_Angeles', 'Daily mean °F (°C)', 'Dec'))

−13.2(8.2)
−0.7(30.7)
57.8(14.3)
