In [1]:
# import libraries
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import pandas as pd
import re
import Wrangler as Wr
## Wr.extractLines
## Wr.multipleReplace
## Wr.makeDirFile

def importBerkeleyData(file_url):
    """Download one Berkeley Earth trend text file and return it as a DataFrame.

    The file is expected to consist of a header, one blank line, and then
    the data rows.  The two header lines directly above the blank line hold
    the column group names and the per-column names; they are merged into
    single column labels.  The first two labels are used as-is, and the
    code below relies on them being 'Year' and 'Month'.

    Parameters
    ----------
    file_url : str
        URL of the trend text file, passed to ``urllib.request.urlopen``.

    Returns
    -------
    pandas.DataFrame
        One row per data line, holding the merged header columns plus the
        added 'centerMean', 'centerUnc', 'landPercent', 'Date' and
        'Country' columns.

    Raises
    ------
    ValueError
        If no blank line separating header from data is found, or it occurs
        too early to leave room for the two header lines above it.
    """
    # Access URL, then read and decode the file into a list of stripped lines
    with urllib.request.urlopen(file_url) as fhand:
        file = [line.decode('latin-1').strip() for line in fhand]

    # Extract country name.
    # NOTE(review): fixed line index -- depends on the exact file layout.
    countryName = re.sub('%', '', file[4]).strip()

    # Identify index of the first blank line, after which temperature data begins
    c = next((idx for idx, el in enumerate(file) if len(el) == 0), None)
    if c is None or c < 2:
        # Without this check the header/data slices below would silently be
        # taken from the wrong lines (negative indices wrap around).
        raise ValueError(
            'Could not locate the blank line between header and data in '
            + str(file_url)
        )

    # Extract data rows (parsing is delegated to Wrangler.extractLines)
    lst = Wr.extractLines(file[c+1:])

    # Parse main headers and sub headers into respective lists
    h1 = Wr.multipleReplace({'%':'','-y':'Y'}, file[c-2]).split()
    # Raw string: '\s' and '\.' are regex escapes, not Python string escapes
    h2 = re.sub(r'[%\s\.]*', '', file[c-1]).split(',')

    # Consolidate main and sub headers into a single header list: the first
    # two sub headers stand alone, every following pair of sub headers is
    # prefixed with the next main header.
    h_list = [
        (h1[num//2 - 1] + h2[num]) if num > 1 else h2[num]
        for num in range(len(h2))
    ]

    # Extract the mean temperature (and its uncertainty) used as center of
    # the anomalies, written in the file as '<label>: <mean> +/- <unc>'.
    # NOTE(review): fixed line index -- depends on the exact file layout.
    center_parts = re.sub('^.+?:', '', file[49]).split('+/-')
    centerMean = float(center_parts[0].strip())
    centerUnc = float(center_parts[1].strip())

    # Land percentage, written in the file as '<label>: <value>%'.
    # NOTE(review): fixed line index -- depends on the exact file layout.
    landPercent = float(re.findall(r'^.+?:\s(.+)%', file[37])[0].strip())

    # Convert list to DataFrame
    df = pd.DataFrame(lst, columns=h_list)

    # Add center mean and uncertainty to df (same value on every row)
    df['centerMean'] = centerMean
    df['centerUnc'] = centerUnc

    # Add land area percent to df
    df['landPercent'] = landPercent

    # Build a date column from Year/Month, pinned to the first of the month
    df['Date'] = pd.to_datetime(df[['Year', 'Month']].assign(DAY=1))

    # Set the measurement columns (everything after the first two headers)
    # to numeric; unparseable values become NaN
    df[h_list[2:]] = df[h_list[2:]].apply(pd.to_numeric, errors='coerce')

    # Add Country Name column
    df['Country'] = countryName

    return df


In [2]:
# URL of the page that lists every country
list_page = 'http://berkeleyearth.lbl.gov/country-list/'

# Query the website and parse the HTML with Beautiful Soup; the `with`
# block closes the connection as soon as parsing is done
with urllib.request.urlopen(list_page) as page:
    soup = BeautifulSoup(page, 'html.parser')

# Locate the table holding one row per country
table = soup.find('table', attrs={'class': 'table table-condensed table-hover'})
if table is None:
    # Fail with a clear message instead of an AttributeError on None
    raise RuntimeError('Country table not found on ' + list_page)
rows = table.find_all('tr')

# Create a list of each country link (taken from the first cell of each row)
links = []
for tr in rows:
    col = tr.find_all('td')
    if len(col) > 0:
        anchor = col[0].find('a')
        # Skip rows whose first cell carries no usable link
        if anchor is not None and anchor.get('href'):
            links.append(anchor.get('href'))

In [3]:
# Visit every country page and collect the links to its TAVG trend text file
textFileList = []
for countryLink in links:
    # Percent-encode the link, leaving ':', '/', '(' and ')' untouched
    sCountryLink = urllib.parse.quote(countryLink, safe=':/()')
    with urllib.request.urlopen(sCountryLink) as page:
        soup = BeautifulSoup(page, 'html.parser')
    # Keep only anchors whose href ends with the trend file name
    textFileList.extend(
        anchor['href']
        for anchor in soup.find_all('a')
        if anchor.has_attr('href') and str(anchor['href']).endswith('TAVG-Trend.txt')
    )


In [6]:
# Download and parse every trend file, then stack the per-country frames
# into one DataFrame with a fresh 0..n-1 index
countryFrames = [importBerkeleyData(url) for url in textFileList]
df = pd.concat(countryFrames, ignore_index=True)

In [7]:
# Resolve the output location via Wrangler.makeDirFile
# (presumably creates the directory in the Data folder -- confirm in Wrangler)
exFileName = 'Temp_countries'
fullname = Wr.makeDirFile(exFileName)

# Write the combined data as CSV, with a header row and without the index.
# DataFrame.to_csv returns None when given a path, so export_csv is only
# meaningful if fullname is None.
export_csv = df.to_csv(path_or_buf=fullname, header=True, index=False)