In [24]:
from tqdm import tqdm
import requests
import zipfile
import os
from io import BytesIO
import pandas as pd
from tabulate import tabulate
import chardet
import itertools

In [25]:
EXTRACT_DIR = './data/raw'
YEARS = range(2010, 2026)
MONTHS = [f"{i:02d}" for i in range(1, 13)]
PROCEEDS_DIR = './data/processed'

In [26]:
def download_file(url, out_path):
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()

    
        with open(out_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
            
        return True
    except Exception as e:
        return False
        
def handle_zip(zip_path, rm_zip=True):
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            extract_path = os.path.splitext(zip_path)[0]
            zip_ref.extractall(extract_path)
        if rm_zip:
            os.remove(zip_path)
        return True
    except Exception as e:
        return False
        

In [27]:
if not os.path.exists(EXTRACT_DIR):
    os.makedirs(EXTRACT_DIR, exist_ok=True)
    total_iterations = len(YEARS) * len(MONTHS)
    
    for y, m in tqdm(itertools.product(YEARS, MONTHS), total=total_iterations, desc="Download progress"):
            uri = f"https://danepubliczne.imgw.pl/pl/datastore/getfiledown/Arch/Telemetria/Meteo/{y}/Meteo_{y}-{m}.zip"
            filename = uri.split('/')[-1]
            out_path = os.path.join(EXTRACT_DIR, filename)
            if download_file(uri, out_path):

                if handle_zip(out_path, rm_zip=True) == False:
                    uri = f"https://danepubliczne.imgw.pl/pl/datastore/getfiledown/Arch/Telemetria/Meteo/{y}/Meteo_{y}-{m}.ZIP"
                    out_path = os.path.join(EXTRACT_DIR, filename, "_2")
                    if download_file(uri, out_path):
                        handle_zip(out_path, rm_zip=True) == False

Download progress: 100%|██████████| 192/192 [07:21<00:00,  2.30s/it]


In [28]:
paths = []
for root, dirs, files in os.walk(EXTRACT_DIR):
    for file in files:
        if file.endswith('.zip'):
            os.remove(os.path.join(root, file))

In [29]:
os.makedirs(EXTRACT_DIR, exist_ok=True)

paths = []
for root, dirs, files in os.walk(EXTRACT_DIR):
    for file in files:
        if file.endswith('.csv'):
            full_path = os.path.join(root, file)
            paths.append(full_path)
            
print(f"\nZnaleziono {len(paths)} plików CSV.")


Znaleziono 1771 plików CSV.


In [None]:
headers = ["Station ID", "Param", "Datetime", "Value"]
KOD_WROCLAW_STRACHOWICE = 351160424
KOD_WROCLAW = 351160425

raw_dfs=[]
for file in tqdm(paths, desc="Wczytywanie plików CSV"):
    df = pd.read_csv(file, sep=';', header=None, names=headers, encoding='UTF-8', index_col=False, low_memory=False)
    df = df[(df['Station ID'] == KOD_WROCLAW_STRACHOWICE) | (df['Station ID'] == KOD_WROCLAW)]
    raw_dfs.append(df)

  df = pd.read_csv(file, sep=';', header=None, names=headers, encoding='UTF-8', index_col=False, low_memory=False)
Wczytywanie plików CSV: 100%|██████████| 1771/1771 [12:34<00:00,  2.35it/s]

+--------------+---------+------------+---------+
| Station ID   | Param   | Datetime   | Value   |
|--------------+---------+------------+---------|
+--------------+---------+------------+---------+





In [31]:
all_data = pd.concat(raw_dfs, ignore_index=True)
all_data.reset_index(drop=True, inplace=True)

del raw_dfs

print(f"Rozmiar połączonego DataFrame: {all_data.shape}")
print(tabulate(all_data.head(), headers='keys', tablefmt='psql'))

Rozmiar połączonego DataFrame: (6557694, 4)
+----+--------------+---------+------------------+---------+
|    |   Station ID | Param   | Datetime         | Value   |
|----+--------------+---------+------------------+---------|
|  0 |    351160424 | B00300S | 2010-11-01 00:00 | 12,2    |
|  1 |    351160424 | B00300S | 2010-11-01 01:00 | 12,4    |
|  2 |    351160424 | B00300S | 2010-11-01 02:00 | 11,7    |
|  3 |    351160424 | B00300S | 2010-11-01 03:00 | 10,1    |
|  4 |    351160424 | B00300S | 2010-11-01 04:00 | 8,6     |
+----+--------------+---------+------------------+---------+


In [32]:
all_data['Value'] = pd.to_numeric(all_data['Value'], errors='coerce')
all_data_pivot = all_data.pivot_table(index=['Station ID', 'Datetime'], columns='Param', values='Value').reset_index()
print(f"Rozmiar DataFrame po pivotowaniu: {all_data_pivot.shape}")
print(tabulate(all_data_pivot.describe(), headers='keys', tablefmt='psql'))

Rozmiar DataFrame po pivotowaniu: (1038222, 13)
+-------+--------------+---------------+--------------+--------------+-------------+---------------+-----------------+--------------+--------------+------------+---------------+-----------+
|       |   Station ID |       B00202A |      B00300S |      B00305A |     B00604S |       B00606S |         B00608S |      B00702A |      B00703A |    B00714A |       B00802A |   B00910A |
|-------+--------------+---------------+--------------+--------------+-------------+---------------+-----------------+--------------+--------------+------------+---------------+-----------|
| count |  1.03822e+06 |   1.02809e+06 | 220272       | 221700       | 2946        | 96208         | 283058          | 234814       | 403247       | 5470       |   1.03749e+06 | 149       |
| mean  |  3.5116e+08  | 201.535       |     11.3746  |     11.2665  |    0.657502 |     0.0186679 |      0.00529573 |      2.42255 |      4.18669 |   12.2274  |  74.4413      |  11.9866  |
| 

In [33]:
params_dic = {
    "B00300S": "Temperatura powietrza (oficjalna)",
    "B00305A": "Temperatura gruntu (czujnik)",
    "B00202A": "Kierunek wiatru (czujnik)",
    "B00702A": "Średnia prędkość wiatru czujnik 10 minut",
    "B00703A": "Prędkość maksymalna (czujnik)",
    "B00608S": "Suma opadu 10 minutowego",
    "B00604S": "Suma opadu dobowego",
    "B00606S": "Suma opadu godzinowego",
    "B00802A": "Wilgotność względna powietrza (czujnik)",
    "B00714A": "Największy poryw w okresie 10min ze stacji Synoptycznej",
    "B00910A": "Zapas wody w śniegu (obserwator)"
}

all_data_pivot.rename(columns=params_dic, inplace=True)
all_data_pivot['Datetime'] = pd.to_datetime(all_data_pivot['Datetime'], format='%Y-%m-%d %H:%M')
all_data_pivot.sort_values(by=['Station ID', 'Datetime'], inplace=True)
all_data_pivot.reset_index(drop=True, inplace=True)

print("OPIS")
print(tabulate(all_data_pivot.describe(), headers='keys', tablefmt='psql'))

print("GŁOWA")
print(tabulate(all_data_pivot.head(), headers='keys', tablefmt='psql'))

OPIS
+-------+--------------+-------------------------------+-----------------------------+-------------------------------------+--------------------------------+-----------------------+--------------------------+----------------------------+--------------------------------------------+---------------------------------+-----------------------------------------------------------+-------------------------------------------+------------------------------------+
|       |   Station ID | Datetime                      |   Kierunek wiatru (czujnik) |   Temperatura powietrza (oficjalna) |   Temperatura gruntu (czujnik) |   Suma opadu dobowego |   Suma opadu godzinowego |   Suma opadu 10 minutowego |   Średnia prędkość wiatru czujnik 10 minut |   Prędkość maksymalna (czujnik) |   Największy poryw w okresie 10min ze stacji Synoptycznej |   Wilgotność względna powietrza (czujnik) |   Zapas wody w śniegu (obserwator) |
|-------+--------------+-------------------------------+-----------------------

In [34]:
os.makedirs(PROCEEDS_DIR, exist_ok=True)
YEARS = all_data_pivot['Datetime'].dt.year.unique()
print(f"YEARS: {YEARS}")

for year in tqdm(YEARS):
    fin_path = os.path.join(PROCEEDS_DIR, f"meteo_wroclaw_{year}.csv")
    df_year = all_data_pivot[all_data_pivot['Datetime'].dt.year == year]
    df_year.to_csv(fin_path, index=False, encoding='UTF-8')

YEARS: [2010 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024
 2025]


100%|██████████| 15/15 [00:08<00:00,  1.86it/s]
