In [87]:
import csv

urls = [
    'https://www.wetterdienst.de/Deutschlandwetter/Garbsen/Aktuell/',
    'https://www.topweather.net/en/weather-anderten-niedersachsen-de391711/',
    'https://de.euronews.com/wetter/europa/deutschland/langenhagen'
]

with open('urls.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Website', 'URL'])
    for url in urls:
        website = url.split('/')[2]
        writer.writerow([website, url])


In [88]:
import pandas as pd

df = pd.read_csv('urls.csv')
df.head



<bound method NDFrame.head of                Website                                                URL
0  www.wetterdienst.de  https://www.wetterdienst.de/Deutschlandwetter/...
1   www.topweather.net  https://www.topweather.net/en/weather-anderten...
2      de.euronews.com  https://de.euronews.com/wetter/europa/deutschl...>

https://www.wetterdienst.de/Deutschlandwetter/Garbsen/Aktuell/
https://www.topweather.net/en/weather-anderten-niedersachsen-de391711/
https://de.euronews.com/wetter/europa/deutschland/langenhagen

In [20]:
import pandas as pd
import h5py
import numpy as np
import re
import datetime as dt
import os
from bs4 import BeautifulSoup
import requests

In [21]:
filename = 'temp_data.h5'

# If file exists, open it in append mode, otherwise create it
if os.path.isfile(filename):
    f = h5py.File(filename, 'a')
else:
    f = h5py.File(filename, 'w')

# Read URLs from the CSV file using pandas
df = pd.read_csv('urls.csv')
urls = df['URL']

# Loop over the websites and scrape the temperature and timestamp data
for i, url in enumerate(urls):
    # Send a request to the website and get the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Get the current temperature
    if i == 0:  # For the first website
        temp = soup.find(class_='vorhersage_schrift2', id='temp_1').text.split('\n')[2]
        temp = re.split('[:°]', temp)
        temp = temp[1]
    elif i == 1:  # For the second website
        temp = soup.find('span', {'data-unit': 'c'}).text
        temp = re.split('°', temp)
        temp = temp[0]
    else:  # For the third website
        temp = soup.find('span', class_='c-current-weather__temperature unit_C ltr').text.strip().split('\n')
        temp = temp[0]

    # Convert temperature to float
    temp = np.array([temp]).astype(float)

    # Get the current date and time
    timestamp = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Save the temperature and timestamp data to the HDF5 file
    with h5py.File(filename, "a") as f:
        # Check if group already exists
        group_name = url.split("//")[-1].split("/")[0]
        if group_name in f:
            group = f[group_name]
        else:
            group = f.create_group(group_name)

        # Check if dataset already exists
        if "temperature" in group:
            dset_temp = group["temperature"]
        else:
            dset_temp = group.create_dataset("temperature", data=temp)

        if "timestamp" in group:
            dset_timestamp = group["timestamp"]
        else:
            dset_timestamp = group.create_dataset("timestamp", data=np.string_(timestamp), dtype=h5py.special_dtype(vlen=str))

In [34]:
import h5py
import numpy as np

# Open the HDF5 file for reading
with h5py.File('temp_data.h5', 'r') as f:
    # Iterate over all groups in the file
    for group_name in f.keys():
        # Print the group name
        print(f"Group: {group_name}")
        
        # Get the group object
        group = f[group_name]
        
        # Iterate over all datasets in the group
        for dataset_name in group.keys():
            # Print the dataset name
            print(f"\tDataset: {dataset_name}")
            
            # Get the dataset object
            dataset = group[dataset_name]
            
            # Read the dataset into a NumPy array
            data = np.array(dataset)
            
            # Print the data
            print(f"\t\tData: {data}")

Group: de.euronews.com
	Dataset: temperature
		Data: [18.]
	Dataset: timestamp
		Data: b'2023-05-15 11:22:46'
Group: www.topweather.net
	Dataset: temperature
		Data: [14.7]
	Dataset: timestamp
		Data: b'2023-05-15 11:22:45'
Group: www.wetterdienst.de
	Dataset: temperature
		Data: [16.6]
	Dataset: timestamp
		Data: b'2023-05-15 11:22:45'


In [35]:
import numpy as np
import pandas as pd
import plotly.express as px
import h5py

# read data out of h5py file and create visualization
with h5py.File('temp_data.h5', 'r') as hdf:
    # retrieve datasets
    wetterdienst_timestamp = np.array(hdf.get("www.wetterdienst.de/timestamp")).astype('datetime64', copy=False)
    wetterdienst_temperature = np.array(hdf.get("www.wetterdienst.de/temperature"))
    euro_timestamp = np.array(hdf.get("de.euronews.com/timestamp")).astype('datetime64', copy=False)
    euro_temperature = np.array(hdf.get("de.euronews.com/temperature"))
    topweather_timestamp = np.array(hdf.get("www.topweather.net/timestamp")).astype('datetime64', copy=False)
    topweather_temperature = np.array(hdf.get("www.topweather.net/temperature"))
    
    
    # create dataframes
    wetterdienst_df = pd.DataFrame({'timestamp': wetterdienst_timestamp, 'temperature': wetterdienst_temperature, 'source': 'Wetterdienst'})
    euro_df = pd.DataFrame({'timestamp': euro_timestamp, 'temperature': euro_temperature, 'source': 'Euronews'})
    topweather_df = pd.DataFrame({'timestamp': topweather_timestamp, 'temperature': topweather_temperature, 'source': 'Topweather'})

    # concatenate dataframes
    df = pd.concat([wetterdienst_df, euro_df, topweather_df], axis=0)

    # create plot with title and subtitle
    fig = px.line(df, x='timestamp', y='temperature', color='source', title='Temperature Development by Source', 
                  labels={'timestamp': 'Timestamp', 'temperature': 'Temperature'},
                  template='plotly_white')
    fig.update_layout(
        title={
            'text': 'Temperature Development by Source',
            'font': {'size': 20},
            'x': 0.5,
            'xanchor': 'center'},
        xaxis_title='Timestamp',
        yaxis_title='Temperature in C°'
    )
    fig.show()


In [32]:
import h5py
import numpy as np
import pandas as pd
import requests
import re
import datetime as dt
import os

filename = 'temp_data.h5'

# If file exists, open it in append mode, otherwise create it
if os.path.isfile(filename):
    f = h5py.File(filename, 'a')
else:
    f = h5py.File(filename, 'w')

# Read URLs from the CSV file using pandas
df = pd.read_csv('urls.csv')
urls = df['URL']

# Loop over the websites and scrape the temperature and timestamp data
for i, url in enumerate(urls):
    # Send a request to the website and get the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Get the current temperature
    if i == 0:  # For the first website
        temp = soup.find(class_='vorhersage_schrift2', id='temp_1').text.split('\n')[2]
        temp = re.split('[:°]', temp)
        temp = temp[1]
    elif i == 1:  # For the second website
        temp = soup.find('span', {'data-unit': 'c'}).text
        temp = re.split('°', temp)
        temp = temp[0]
    else:  # For the third website
        temp = soup.find('span', class_='c-current-weather__temperature unit_C ltr').text.strip().split('\n')
        temp = temp[0]

    # Convert temperature to float
    temp = np.array([temp]).astype(float)

    # Get the current date and time
    timestamp = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Save the temperature and timestamp data to the HDF5 file
    with h5py.File(filename, "a") as f:
        # Check if group already exists
        group_name = url.split("//")[-1].split("/")[0]
        if group_name in f:
            group = f[group_name]
        else:
            group = f.create_group(group_name)

        # Check if dataset already exists
        if "temperature" in group:
            dset_temp = group["temperature"]
            dset_temp.resize(dset_temp.shape[0] + 1, axis=0)
            dset_temp[-1] = temp
        else:
            dset_temp = group.create_dataset("temperature", data=temp)

        if "timestamp" in group:
            dset_timestamp = group["timestamp"]
            dset_timestamp.resize((dset_timestamp.shape[0] + 1), axis=0)
            dset_timestamp[-1] = np.string_(timestamp)
        else:
            dset_timestamp = group.create_dataset("timestamp", data=np.string_(timestamp), dtype=h5py.special_dtype(vlen=str))


IndexError: tuple index out of range

In [36]:
import h5py
import numpy as np
import pandas as pd
import requests
import re
import datetime as dt
import os
import logging

# Set up logging
logging.basicConfig(filename='temp_data.log', level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')

filename = 'temp_data.h5'

# If file exists, open it in append mode, otherwise create it
if os.path.isfile(filename):
    f = h5py.File(filename, 'a')
else:
    f = h5py.File(filename, 'w')

# Read URLs from the CSV file using pandas
df = pd.read_csv('urls.csv')
urls = df['URL']

# Loop over the websites and scrape the temperature and timestamp data
for i, url in enumerate(urls):
    # Send a request to the website and get the HTML content
    response = requests.get(url)
    html_content = response.content

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Get the current temperature
    if i == 0:  # For the first website
        temp = soup.find(class_='vorhersage_schrift2', id='temp_1').text.split('\n')[2]
        temp = re.split('[:°]', temp)
        temp = temp[1]
    elif i == 1:  # For the second website
        temp = soup.find('span', {'data-unit': 'c'}).text
        temp = re.split('°', temp)
        temp = temp[0]
    else:  # For the third website
        temp = soup.find('span', class_='c-current-weather__temperature unit_C ltr').text.strip().split('\n')
        temp = temp[0]

    # Convert temperature to float
    temp = np.array([temp]).astype(float)

    # Get the current date and time
    timestamp = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Save the temperature and timestamp data to the HDF5 file
    with h5py.File(filename, "a") as f:
        # Check if group already exists
        group_name = url.split("//")[-1].split("/")[0]
        if group_name in f:
            group = f[group_name]
        else:
            group = f.create_group(group_name)

        # Check if dataset already exists
        if "temperature" in group:
            dset_temp = group["temperature"]
            dset_temp.resize(dset_temp.shape[0] + 1, axis=0)
            dset_temp[-1] = temp
        else:
            dset_temp = group.create_dataset("temperature", data=temp, chunks=True, maxshape=(None,))

        if "timestamp" in group:
            dset_timestamp = group["timestamp"]
            dset_timestamp.resize(dset_timestamp.shape[0] + 1, axis=0)
            dset_timestamp[-1] = np.string_(timestamp)
        else:
            dset_timestamp = group.create_dataset("timestamp", data=np.string_(timestamp), chunks=True, maxshape=(None,), dtype=h5py.special_dtype(vlen=str))
    
    # Log the data saved to the file
    logging.info(f"{timestamp} - {url} - Temperature: {temp[0]}")
    
# Log the end of the script
logging.info("Temperature data saved to HDF5 file.")


TypeError: Only chunked datasets can be resized