In [1]:
import re
import requests
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup as soup
from typing import List, Union, Dict

In [2]:
# Root URL of the "dst_final" pages. get_month_data appends
# "<year><month>/index.html" to it to reach the page of a single month.
BASE_URL_DST_FINAL = 'https://wdc.kugi.kyoto-u.ac.jp/dst_final/'

In [150]:
def get_month_data(year: str, month: str, timeout: float = 30.0) -> str:
    """
    Download the page of one month and return the text of its first <pre> element.

    year [str]: YYYY (e.g. '2000')
    month [str]: MM, zero padded (e.g. '07')
    timeout [float]: seconds to wait for the server before giving up

    Returns [str]: raw text found inside the first <pre> tag of the page.

    Raises:
        requests.RequestException: connection problem, timeout or HTTP error status
        IndexError: the page has no <pre> element
    """
    # The base URL may already end with '/'; strip it so the final URL does
    # not contain a double slash.
    url = f"{BASE_URL_DST_FINAL.rstrip('/')}/{year}{month}/index.html"
    ans = requests.get(url=url, timeout=timeout)
    # Stop on 4xx/5xx responses instead of trying to parse an error page.
    ans.raise_for_status()
    data = soup(ans.text, "html.parser")
    pre_blocks = data.find_all("pre")
    if not pre_blocks:
        raise IndexError(f"no <pre> data block found at {url}")
    return pre_blocks[0].text

In [154]:
def clean_month_data_text(data_text: str, header_lines: int = 6) -> List[str]:
    """
    Split the raw page text into its data lines.

    data_text [str]: raw text of one monthly page
    header_lines [int]: how many leading non-blank lines are skipped as header
        (default 6, the value this notebook has always used)

    Returns [List[str]]: the remaining non-blank lines, in their original order
        and with their internal spacing untouched.
    """
    # Blank and whitespace-only lines carry no data; removing them before
    # counting keeps the header offset independent of how the page is spaced.
    # splitlines() also copes with '\r\n' line endings.
    non_blank_lines = [line for line in data_text.splitlines() if line.strip()]
    return non_blank_lines[header_lines:]


def clean_single_line(line: str) -> List[int]:
    """
    Parse one fixed-width data line into its integer values.

    line [str]: a 2-character leading field followed by blocks of 33
        characters; each block is 1 separator character plus eight
        4-character values (three blocks give the 24 values of a day).

    Returns [List[int]]: the values of every complete block, in order.
        A trailing block shorter than 33 characters is ignored.

    Raises ValueError if a 4-character field cannot be converted to int.
    """
    # Skip the 2-character leading field, then cut the rest into complete
    # blocks of 33 characters.
    blocks = re.findall('.{33}', line[2:])
    # The first character of each block is a separator, not part of a value.
    values_text = ''.join(block[1:] for block in blocks)
    # What is left is a run of 4-character fields; int() ignores the padding spaces.
    return [int(field) for field in re.findall('.{4}', values_text)]


def clean_multiples_lines(lines: List[str]):
    """
    Parse every data line of a month.

    lines [List[str]]: data lines, one entry per line of the page

    Returns a list with one entry per input line, in the same order; each
    entry is the list of integers that clean_single_line produced for it.
    """
    return [clean_single_line(raw_line) for raw_line in lines]

In [155]:
def generate_df(trusted_data: List[List[int]], year: str, month: str) -> pd.DataFrame:
    """
    Build the monthly table: one row per day, one column per hour.

    trusted_data [List[List[int]]]: one list of 24 values per day, in day order
    year [str]: YYYY
    month [str]: MM

    Returns [DataFrame] indexed by day of month (1..n) with columns
        1..24:   the hourly values
        date:    datetime of the day
        dst_min: smallest of the 24 hourly values of that day
    """
    hour_columns = list(range(1, 25))
    df = pd.DataFrame(trusted_data, columns=hour_columns)
    # Row k of the input is day k+1: shift the default 0-based index so the
    # index label is the day of the month.
    df.index = df.index + 1
    # Parse all dates in a single call; the resulting DatetimeIndex is
    # assigned by position, so it lines up with the 1-based index.
    df['date'] = pd.to_datetime(
        [f"{year}-{month}-{day}" for day in df.index], format='%Y-%m-%d'
    )
    # Row-wise minimum over the hourly columns only ('date' is excluded).
    df['dst_min'] = df[hour_columns].min(axis=1)
    return df

In [158]:
def make_classification(df: pd.DataFrame, classification_rules: Dict[str, List[int]]):
    """
    Add a 'classification' column to df, in place, based on each row's 'dst_min'.

    df [DataFrame]: must contain a 'dst_min' column; any index is accepted
    classification_rules [Dict[str, List[int]]]: category name -> collection of
        the dst_min values that belong to that category. Rules are tried in
        dict order and the first one that contains the value wins.
        example:
            {
                'fraca':            np.array(range(-31, -51, -1)),
                'moderada':         np.array(range(-51, -101, -1)),
                'intensa':          np.array(range(-101, -251, -1)),
                'super_intensa':    np.array(range(-251, -1001, -1)),
            }

    Rows whose dst_min matches no rule are left as NaN. Returns None.
    """
    dst_min = df['dst_min']
    # object dtype lets category names live next to NaN; writing strings into
    # a float column (what `= np.nan` creates) is deprecated in recent pandas.
    labels = pd.Series(np.nan, index=df.index, dtype=object)
    for category, index_range in classification_rules.items():
        # Only rows that are still unlabelled can be claimed, which keeps the
        # "first matching rule wins" behaviour.
        matches = dst_min.isin(list(index_range)) & labels.isna()
        labels.loc[matches] = category
    df['classification'] = labels

In [177]:
def remove_storms_by_date(df: pd.DataFrame, dates: List[str]):
    """
    Return a copy of df without the rows whose 'date' is one of the given dates.

    df: DataFrame
        columns:
            date pd.datetime
        any index is accepted; df itself is not modified
    dates: list[str]
        format: YYYY-MM-DD

    Returns a new DataFrame with the remaining rows, re-indexed from 0.
    """
    dates_to_remove = pd.to_datetime(list(dates), format='%Y-%m-%d')
    # The mask is built from df['date'] itself, so it carries df's own index
    # and always aligns, whatever that index is.
    keep = ~df['date'].isin(dates_to_remove)
    return df[keep].reset_index(drop=True)

In [161]:
def plot_dst_graph(df):
    """
    Plot every cell of df as one continuous line, reading the table row by row.

    df [DataFrame]: every column is plotted, so pass only the columns that
        should appear in the curve.

    Each row contributes df.columns.size consecutive points and x advances by
    1/df.columns.size per point, so the last value of the k-th row sits at x == k.
    """
    # high resolution: one point per cell
    points_per_row = df.columns.size
    total_points = points_per_row * len(df)
    axis_x = [(position + 1) / points_per_row for position in range(total_points)]

    axis_y = [
        value
        for row_number in range(len(df))
        for value in df.iloc[row_number].to_list()
    ]

    plt.plot(axis_x, axis_y)

    # low resolution alternative: one point per row (the row minimum)
    # df.min(axis=1).plot()

In [163]:
# Storm classes used by make_classification, based on dst_min (nT):
#   fraca         (weak)           -30 nT > Dst >=  -50 nT
#   moderada      (moderate)       -50 nT > Dst >= -100 nT
#   intensa       (intense)       -100 nT > Dst >= -250 nT
#   super_intensa (super intense) -250 nT > Dst
#
# The parsed values are integers, so each class is listed as the explicit set
# of integers inside its interval. 'super_intensa' has no lower limit in the
# definition above; -1000 is only a practical floor for the lookup.
# (Kept as comments rather than a string bound to `_`, because assigning `_`
# at notebook top level overrides IPython's last-output variable.)
classification_rules = {
    'fraca':            np.arange(-31, -51, -1),
    'moderada':         np.arange(-51, -101, -1),
    'intensa':          np.arange(-101, -251, -1),
    'super_intensa':    np.arange(-251, -1001, -1),
}

In [29]:
# Month to analyse. Both values are strings because get_month_data pastes
# them straight into the page URL.
year = '2000'
month = '07'

# Pipeline: download the raw page text -> keep only the data lines ->
# parse every line into a list of integers.
month_data = get_month_data(year=year, month=month)
cleaned_data = clean_month_data_text(data_text=month_data)
trusted_data = clean_multiples_lines(lines=cleaned_data)


In [178]:
# One row per day (index = day of month): hourly columns 1..24 plus 'date' and 'dst_min'.
df = generate_df(trusted_data=trusted_data, year=year, month=month)

In [179]:
# Adds the 'classification' column to df in place; the call itself returns nothing.
make_classification(df, classification_rules)        


In [180]:
# Days (YYYY-MM-DD) to drop from the table; passed to remove_storms_by_date below.
storms_to_remove_by_date = [
    '2000-07-01',
    '2000-07-02',
    '2000-07-03',
    '2000-07-29',
    '2000-07-30'
]

In [181]:
# New frame without the listed days. remove_storms_by_date re-indexes the
# result from 0, so index labels no longer equal the day of the month.
df_filtered = remove_storms_by_date(df, storms_to_remove_by_date)

In [184]:
# Show only rows without missing values, i.e. the days that received a
# classification (unclassified days hold NaN in 'classification').
df_filtered.dropna()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,18,19,20,21,22,23,24,date,dst_min,classification
9,-7,-5,-1,0,0,5,8,6,3,16,...,-8,-15,-25,-38,-43,-41,-37,2000-07-13,-43,fraca
11,-34,-26,-26,-33,-29,-18,-19,-22,-33,-39,...,-46,-43,-61,-198,-289,-281,-281,2000-07-15,-289,super_intensa
12,-301,-301,-279,-260,-233,-209,-197,-179,-175,-166,...,-126,-125,-124,-121,-117,-117,-117,2000-07-16,-301,super_intensa
13,-114,-112,-107,-101,-94,-87,-81,-76,-68,-71,...,-54,-48,-45,-42,-41,-40,-41,2000-07-17,-114,intensa
14,-41,-36,-33,-38,-36,-36,-41,-37,-37,-40,...,-32,-27,-17,-14,-23,-26,-32,2000-07-18,-42,fraca
15,-34,-36,-34,-35,-34,-30,-24,-19,-20,-25,...,-2,4,2,-3,-7,-21,-29,2000-07-19,-36,fraca
16,-27,-30,-48,-63,-72,-71,-64,-78,-84,-93,...,-73,-60,-61,-56,-56,-59,-62,2000-07-20,-93,moderada
17,-60,-57,-52,-51,-49,-44,-41,-36,-34,-35,...,-41,-39,-35,-31,-29,-23,-22,2000-07-21,-60,moderada
18,-26,-27,-30,-34,-33,-31,-26,-25,-31,-39,...,-63,-57,-51,-45,-47,-44,-39,2000-07-22,-63,moderada
19,-41,-37,-32,-38,-39,-36,-24,-20,-22,-18,...,-42,-41,-44,-47,-58,-68,-63,2000-07-23,-68,moderada
