In [361]:
import re
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as soup
from typing import List, Union

In [362]:
# Base URL under which get_month_data looks up '<year><month>/index.html' pages.
# NOTE: the value ends with a trailing slash; code that appends path segments
# to it should take that into account.
BASE_URL_DST_FINAL = 'https://wdc.kugi.kyoto-u.ac.jp/dst_final/'

In [363]:
def get_month_data(year: str, month: str) -> str:
    """
    Download one monthly page and return the text of its first <pre> element.

    The page URL is built as '<BASE_URL_DST_FINAL>/<year><month>/index.html'.

    Args:
        year: four-digit year, e.g. '2000' (YYYY).
        month: month number, e.g. '05' (MM); a single-digit value is
            left-padded with a zero.

    Returns:
        The raw text contained in the first <pre> tag of the page.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
        IndexError: if the downloaded page contains no <pre> element.
    """
    # The base URL may already end with '/'; strip it before joining so the
    # final URL does not contain a double slash.
    base_url = BASE_URL_DST_FINAL.rstrip('/')
    url = f"{base_url}/{year}{str(month).zfill(2)}/index.html"

    # A timeout keeps the notebook from hanging forever on a stalled server.
    response = requests.get(url=url, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    page = soup(response.text, "html.parser")
    pre_tag = page.find("pre")
    if pre_tag is None:
        # IndexError is kept because that is what indexing an empty result
        # list raised before; the message now says what went wrong.
        raise IndexError(f"no <pre> element found at {url}")
    return pre_tag.text

In [364]:
def clean_month_data_text(data_text: str, header_lines: int = 6) -> List[str]:
    """
    Split the raw page text into lines and drop empty lines and the header.

    Args:
        data_text: multi-line text, e.g. the value returned by get_month_data.
        header_lines: number of leading non-empty lines to discard before the
            data rows start (default 6).

    Returns:
        The remaining non-empty lines, in their original order. The list is
        empty when the text has `header_lines` non-empty lines or fewer.
    """
    # Only strictly empty strings are dropped; lines made of whitespace are
    # kept and therefore count towards the header.
    non_empty_lines = [line for line in data_text.split('\n') if line != '']
    return non_empty_lines[header_lines:]

In [365]:
def clean_single_line(line: str) -> List[int]:
    """
    Parse one fixed-width data line into a list of integers.

    Layout handled by the slicing below: a 2-character prefix (presumably the
    day of the month -- confirm against the page format), followed by blocks
    of 33 characters. Each block is one leading character plus eight
    4-character integer fields. Trailing characters that do not fill a whole
    33-character block are ignored.

    Args:
        line: a single data line.

    Returns:
        The integer values in order of appearance (24 values for a line
        holding three complete blocks; an empty list if there is no complete
        block).

    Raises:
        ValueError: if a 4-character field is not a valid integer.
    """
    # Skip the 2-character prefix and cut the rest into 33-character blocks.
    blocks = re.findall(r'.{33}', line[2:])
    # Drop the first character of each block, leaving 32 = 8 * 4 characters.
    payload = ''.join(block[1:] for block in blocks)
    # Every value occupies exactly 4 characters; int() ignores the padding
    # spaces around the digits.
    return [int(field) for field in re.findall(r'.{4}', payload)]

def clean_multiples_lines(lines: List[str]) -> List[List[int]]:
    """
    Parse every data line with clean_single_line.

    Args:
        lines: data lines, one string per row.

    Returns:
        One list of integers per input line, in the same order as `lines`.
    """
    return [clean_single_line(line) for line in lines]


In [366]:
def generate_df(trusted_data: List[List[int]]) -> pd.DataFrame:
    """
    Build a DataFrame from the parsed rows.

    Columns are labelled 1 to 24 and the row index is shifted so that it
    starts at 1 instead of 0.

    Args:
        trusted_data: one list of values per row.

    Returns:
        DataFrame with columns 1..24 and a 1-based row index.
    """
    column_labels = [label for label in range(1, 25)]
    frame = pd.DataFrame(data=trusted_data, columns=column_labels)
    # Move the default 0-based index to a 1-based one.
    one_based_index = frame.index + 1
    frame.index = one_based_index
    return frame

In [369]:
# Year and month that get_month_data interpolates into the page URL.
year = '2000'
month = '05'

# Pipeline: download the page text, drop empty/header lines, then parse
# each remaining line into a list of integers.
month_data = get_month_data(year=year, month=month)
cleaned_data = clean_month_data_text(data_text=month_data)
trusted_data = clean_multiples_lines(lines=cleaned_data)


In [371]:
# Tabulate the parsed rows: one DataFrame row per parsed line, columns 1..24.
df = generate_df(trusted_data=trusted_data)

In [372]:
df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,15,16,17,18,19,20,21,22,23,24
1,-7,-14,-14,-16,-13,-12,-16,-17,-16,-12,...,-13,-20,-24,-23,-28,-31,-32,-27,-25,-23
2,-24,-32,-29,-23,-15,-14,-14,-19,-20,-11,...,-22,-33,-32,-33,-34,-35,-37,-35,-32,-29
3,-23,-21,-21,-19,-16,-15,-13,-9,-9,-4,...,8,9,1,-19,-19,-29,-34,-35,-28,-27
4,-29,-31,-32,-30,-22,-17,-16,-12,-8,-6,...,-16,-12,-13,-12,-15,-18,-18,-15,-14,-14
5,-16,-17,-14,-11,-7,-10,-10,-13,-12,-12,...,-14,-12,-10,-10,-12,-14,-12,-8,-5,-8
6,-15,-20,-22,-24,-28,-29,-25,-24,-23,-18,...,-12,-9,-9,-6,-5,-4,-4,-4,-3,-6
7,-10,-10,-7,-6,-5,-5,-1,0,2,5,...,4,3,2,4,6,6,7,9,13,9
8,7,8,13,16,19,21,26,29,25,22,...,24,23,22,23,20,16,17,12,10,10
9,9,8,5,6,7,10,15,12,3,-1,...,-7,-3,-2,-5,-8,-11,-7,-7,-12,-19
10,-21,-23,-18,-13,-16,-12,-9,-9,-8,-8,...,-11,-12,-13,-13,-12,-9,-4,-1,-1,0
