In [4]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
from datetime import datetime
from datetime import timedelta
import pandas as pd

# -*- coding: utf-8 -*-

In [5]:
def simple_get(url, timeout=10):
    """
    Fetch `url` with an HTTP GET request and return the response body.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple or None, optional
        Forwarded to `requests.get` as its `timeout` argument. Defaults to
        10 seconds so an unresponsive server cannot block the notebook
        forever; pass None to wait without limit.

    Returns
    -------
    bytes or None
        The raw body (`resp.content`) when `is_good_response` accepts the
        response. None when the response is rejected, or when the request
        raises a `RequestException` (which is reported through `log_error`).
    """
    try:
        # closing() guarantees the streamed connection is released even if
        # reading the body fails part-way through.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Decide whether a response looks like a successful HTML page.

    Parameters
    ----------
    resp : object
        Response-like object exposing `status_code` and a `headers` mapping.

    Returns
    -------
    bool
        True only when the status code is 200 and the Content-Type header
        is present and contains 'html' (case-insensitive). A response
        without a Content-Type header yields False instead of raising.
    """
    # .get() rather than [] so a missing header does not raise KeyError.
    content_type = resp.headers.get('Content-Type')
    if content_type is None:
        return False
    return resp.status_code == 200 and 'html' in content_type.lower()


def log_error(e):
    """
    Report an error by printing it to standard output.

    Error reporting lives in its own function so the mechanism can be
    swapped out in one place.
    """
    message = e
    print(message)
    
def find_between(s, first, last):
    """
    Return the text of `s` that lies between two markers.

    The search uses the first occurrence of `first` and the next occurrence
    of `last` after it. An empty string is returned when either marker
    cannot be found.
    """
    marker_at = s.find(first)
    if marker_at < 0:
        return ""

    begin = marker_at + len(first)
    stop = s.find(last, begin)
    if stop < 0:
        return ""

    return s[begin:stop]
    
def cut_string(s, cut):
    """
    Return everything in `s` after the first occurrence of `cut`.

    The marker itself is dropped. An empty string is returned when `cut`
    does not occur in `s`.
    """
    position = s.find(cut)
    return "" if position < 0 else s[position + len(cut):]

In [37]:
class forecast(object):
    """
    Plain record holding the values of one forecast entry.

    Attributes
    ----------
    max_temp, min_temp
        Maximum and minimum temperature, stored exactly as passed in.
    proc_date
        Stored as passed in.
        NOTE(review): presumably the date the forecast applies to - confirm.
    acc_date
        Stored as passed in.
        NOTE(review): presumably the date the forecast was acquired - confirm.
    """

    # `self` must be the first parameter; without it the instance is bound
    # to `max_temp` and any call with four values raises TypeError.
    def __init__(self, max_temp, min_temp, proc_date, acc_date):
        self.max_temp = max_temp
        self.min_temp = min_temp
        self.proc_date = proc_date
        self.acc_date = acc_date
        
# Scrape the multi-day Berlin forecast from wetter.de and collect it in a
# DataFrame with one row per forecast day.
FORECAST_URL = 'https://www.wetter.de/deutschland/wetter-berlin-18228265/wetterprognose.html'
FORECAST_DAYS = 15  # number of days read from the page
PARTS_PER_DAY = 4   # night, morning, midday, evening (in that order)

data = {}  # not used in this cell; kept in case another cell relies on it
raw_html = simple_get(FORECAST_URL)
if raw_html is None:
    # simple_get returns None on a request error or a rejected response;
    # fail here with a clear message instead of an AttributeError on decode.
    raise RuntimeError('Could not download forecast page: ' + FORECAST_URL)
daily_dict = {}
current_time = pd.Timestamp(datetime.now())

html_string = raw_html.decode("utf-8")

# Markers that surround the values in the page markup.
min_day_string = '<span class="wt-color-temperature-min">'
max_day_string = '<span class="wt-color-temperature-max">'
temp_string = '<div class="forecast-text-temperature wt-font-light">'
closing_string = '&deg;</span>'
closing_string2 = '&deg;</div>'

min_temp = []
max_temp = []
proc_date = []
four_temps = []
four_regen = []  # not filled in this cell
four_wind = []   # not filled in this cell

for day in range(FORECAST_DAYS):
    # find_between only sees the first match, so these lookups depend on
    # html_string having been advanced past the previous day's entries below.
    # Assumes each day's min/max precede its part-of-day temperatures in the
    # markup - TODO confirm against the page.
    min_temp.append(find_between(html_string, min_day_string, closing_string))
    max_temp.append(find_between(html_string, max_day_string, closing_string))
    proc_date.append((current_time + timedelta(days=day)).date())

    for part in range(PARTS_PER_DAY):
        four_temps.append(find_between(html_string, temp_string, closing_string2))
        # Drop everything up to and including the marker just read so the
        # next lookup moves on to the following entry.
        html_string = cut_string(html_string, temp_string)

# Values are kept as the strings extracted from the markup.
daily_dict['min_temp'] = min_temp
daily_dict['max_temp'] = max_temp
daily_dict['proc_date'] = proc_date
daily_dict['city'] = 'Berlin'
daily_dict['aq_time'] = current_time
# four_temps is a flat list with PARTS_PER_DAY consecutive entries per day;
# a strided slice picks the same part of the day for every day.
daily_dict['night_temp'] = four_temps[0::PARTS_PER_DAY]
daily_dict['morning_temp'] = four_temps[1::PARTS_PER_DAY]
daily_dict['midday_temp'] = four_temps[2::PARTS_PER_DAY]
daily_dict['evening_temp'] = four_temps[3::PARTS_PER_DAY]

daily = pd.DataFrame(data=daily_dict)
print(daily)

                      aq_time    city evening_temp max_temp midday_temp  \
0  2018-05-15 10:09:56.812430  Berlin           16       20          19   
1  2018-05-15 10:09:56.812430  Berlin           18       21          19   
2  2018-05-15 10:09:56.812430  Berlin           17       19          18   
3  2018-05-15 10:09:56.812430  Berlin           16       20          19   
4  2018-05-15 10:09:56.812430  Berlin           17       20          19   
5  2018-05-15 10:09:56.812430  Berlin           18       21          20   
6  2018-05-15 10:09:56.812430  Berlin           19       22          21   
7  2018-05-15 10:09:56.812430  Berlin           20       22          20   
8  2018-05-15 10:09:56.812430  Berlin           20       22          19   
9  2018-05-15 10:09:56.812430  Berlin           21       23          19   
10 2018-05-15 10:09:56.812430  Berlin           22       24          20   
11 2018-05-15 10:09:56.812430  Berlin           20       22          19   
12 2018-05-15 10:09:56.81

In [29]:
# city
# hourly/daily
# timestamp acquired
# timestamp - predicte