### Library imports

In [3]:
import numpy as np
import seaborn as sns
import pandas as pd
from scipy.special import expit
from scipy.stats import zscore
from pandas.errors import SettingWithCopyWarning
import os
import io
from os import listdir
from os import system
import subprocess
from functools import reduce

from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
import statsmodels.tsa.stattools as ts
from pandas.plotting import autocorrelation_plot
%matplotlib inline
from matplotlib import pyplot as plt
import statsmodels.api as sm

import warnings
# Ignore FutureWarning, UserWarning and pandas' SettingWithCopyWarning
# from this point on.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)




### Auxiliary functions

In [1]:
def set_num_rows_cols(nr, nc):
    """Set how many rows and columns pandas shows when displaying a frame.

    Args:
        nr: Value for the ``display.max_rows`` option.
        nc: Value for the ``display.max_columns`` option.
    """
    display_limits = {'display.max_columns': nc, 'display.max_rows': nr}
    for option_name, limit in display_limits.items():
        pd.set_option(option_name, limit)

def set_size_plot(x, y):
    """Apply a seaborn theme whose ``figure.figsize`` is ``(x, y)``."""
    figure_rc = {'figure.figsize': (x, y)}
    sns.set_theme(rc=figure_rc)
    
def getSigmoid():
    """Scatter-plot the logistic sigmoid on ``[0, 1)`` sampled every 0.02.

    ``scipy.special.expit`` already computes ``1 / (1 + exp(-x))``, so it is
    used directly as the sigmoid.
    """
    arr = np.arange(0, 1, 0.02)

    def sigmoid(x):
        # expit(x) is the logistic function itself; wrapping it in another
        # 1 / (1 + ...) does not yield the sigmoid curve.
        return expit(x)

    sns.scatterplot(x=arr, y=sigmoid(arr))

def get_files(n=1, lazy=True):
    """Collect header text and data of the CSV files in ``data/estacoes_solares/2023/``.

    The folder is resolved relative to the current working directory.

    Args:
        n: Maximum number of files to process. Any falsy value (``0`` or
            ``None``) processes every CSV file found.
        lazy: When true, the data slot holds the relative path of the file;
            otherwise the file is parsed into a ``DataFrame``.

    Returns:
        dict mapping the fourth ``_``-separated token of each file name to a
        two-item list ``[head, data]``, where ``head`` is the text of the
        first 8 lines of the file decoded as ISO-8859-1.

    Raises:
        IndexError: If a file name has fewer than four ``_``-separated tokens.
    """
    base_path = "data/estacoes_solares/2023/"
    header_lines = 8
    folder = os.path.join(os.getcwd(), base_path)

    # Sorted so that ``n`` always selects the same files; the order returned
    # by listdir is arbitrary.
    list_files = sorted(x for x in listdir(folder) if ".csv" in x.lower())
    selected = list_files[:n] if n else list_files

    dict_data = {}
    for file_name in selected:
        code = file_name.split("_")[3]

        # Read the leading lines in-process instead of spawning ``head``;
        # newline="" keeps the original line endings untouched.
        with open(os.path.join(folder, file_name),
                  encoding="ISO-8859-1", newline="") as fh:
            head = "".join(line for _, line in zip(range(header_lines), fh))

        if lazy:
            data = base_path + file_name
        else:
            # Same parsing options as load(): skip only the leading lines
            # captured in ``head`` so the next row is used as the column header.
            data = pd.read_csv(base_path + file_name, sep=";",
                               encoding="ISO-8859-1", skiprows=header_lines)

        dict_data[code] = [head, data]

    return dict_data

def getDictToRenameDataFrame(list_columns):
    """Map each original column label to a normalised snake_case label.

    Every label is lower-cased and then passed through a fixed sequence of
    substring replacements (spaces, brackets, unit symbols, dots, commas).

    Args:
        list_columns: Iterable of string column labels.

    Returns:
        dict whose keys are the original labels and whose values are the
        normalised labels.
    """
    # Applied strictly in this order: later rules clean up the leftovers
    # (such as doubled underscores) produced by earlier ones.
    substitutions = (
        (" ", "_"),
        ("(", "_"),
        (")", ""),
        ("/", ""),
        ("²", "2"),
        ("°", ""),
        ("%", "perc"),
        ("._", "_"),
        (".", "_"),
        ("__", "_"),
        ("_-_", "_"),
        (",_", "_"),
    )

    def normalise(label):
        cleaned = str.lower(label)
        for old, new in substitutions:
            cleaned = cleaned.replace(old, new)
        return cleaned

    normalised_labels = [normalise(label) for label in list_columns]
    return dict(zip(list_columns, normalised_labels))

def cols_standardization(df):
    """Return a copy of ``df`` with normalised column labels.

    Labels are normalised with ``getDictToRenameDataFrame``; afterwards a
    column named ``radiacao_kjm2`` (if present) is renamed to ``radiacao``.
    """
    normalised_names = getDictToRenameDataFrame(df.columns)
    renamed = df.rename(columns=normalised_names)
    return renamed.rename(columns={"radiacao_kjm2": "radiacao"})

def hour_transform(n):
    """Format an ``HMM``/``HHMM`` hour value as an ``HH:MM:00`` string.

    The text form of ``n`` is left-padded with zeros to four characters, so
    ``0`` becomes ``"00:00:00"``, ``30`` becomes ``"00:30:00"`` and ``100``
    becomes ``"01:00:00"``.

    Args:
        n: Hour value, e.g. the integer ``1300`` or the string ``"1300"``.

    Returns:
        The formatted time string, or ``None`` when the text form of ``n``
        is empty or longer than four characters.
    """
    digits = str(n)
    if not 0 < len(digits) <= 4:
        return None

    # Zero-padding covers one- and two-character values, which previously
    # fell through every branch.
    padded = digits.zfill(4)
    return f"{padded[:2]}:{padded[2:]}:00"

def create_datetime_feature(df):
    """Derive timestamp columns from ``data_medicao`` and ``hora_medicao``.

    ``df`` is modified in place: ``hora_medicao`` is reformatted with
    ``hour_transform`` and the columns ``data_hora_str``, ``data_hora``,
    ``data``, ``data_str`` and ``hora`` are added. Dates are parsed with the
    format ``%d-%m-%Y``.

    Returns:
        A new frame without the ``data_hora_str``, ``data_medicao`` and
        ``hora_medicao`` columns.
    """
    formatted_hours = df["hora_medicao"].apply(hour_transform)
    df["hora_medicao"] = formatted_hours
    df["data_hora_str"] = df["data_medicao"] + " " + formatted_hours

    stamp_text = df["data_hora_str"]
    df["data_hora"] = pd.to_datetime(stamp_text, format="%d-%m-%Y %H:%M:%S")
    df["data"] = pd.to_datetime(df["data_medicao"], format="%d-%m-%Y")
    # The date part is everything before the first whitespace of the stamp.
    df["data_str"] = stamp_text.str.split().str[0]
    df["hora"] = df["data_hora"].dt.hour

    helper_columns = ["data_hora_str", "data_medicao", "hora_medicao"]
    return df.drop(columns=helper_columns)
    
def transform_datetime(df):
    """Build ``data_medicao`` and ``hora_medicao`` from ``data`` and ``hora_utc``.

    ``df`` is modified in place: ``data_medicao`` is ``data`` with ``/``
    replaced by ``-``, and ``hora_medicao`` is ``hora_utc`` with the
    ``" UTC"`` suffix removed, cast to ``int32``.

    Returns:
        A new frame without the ``data`` and ``hora_utc`` columns.
    """
    df["data_medicao"] = df["data"].str.replace('/', '-', regex=False)

    hour_text = df["hora_utc"].astype("str")
    hour_text = hour_text.str.replace(' UTC', '', regex=False)
    df["hora_medicao"] = hour_text.astype('int32')

    return df.drop(columns=["data", "hora_utc"])
    
def create_split_date_features(df):
    """Add ``dia``, ``mes`` and ``ano`` columns taken from ``data_hora``.

    ``df`` is modified in place and also returned.
    """
    stamps = df["data_hora"].dt
    date_parts = (("dia", "day"), ("mes", "month"), ("ano", "year"))
    for column, part in date_parts:
        df[column] = getattr(stamps, part)

    return df

def create_category(column, df):
    """Bin ``df[column]`` into four quartile classes labelled ``A`` to ``D``.

    The bin edges are the minimum, the three quartiles and the maximum
    reported by ``describe()``. The edges are printed before binning.

    Args:
        column: Name of the numeric column to categorise.
        df: Frame holding that column.

    Returns:
        Categorical Series produced by ``pd.cut`` with ``include_lowest=True``.
    """
    labels = ["A", "B", "C", "D"]

    # Select the edges by label instead of by position in describe()'s
    # output, and copy them so the adjustment below never writes into a
    # read-only view.
    edge_names = ["min", "25%", "50%", "75%", "max"]
    summary = df[column].describe()
    classes = summary.loc[edge_names].to_numpy(dtype="float64", copy=True)

    if classes[1] == 0:
        # Nudge a zero first quartile upwards; presumably this keeps it from
        # coinciding with a minimum of 0, which pd.cut rejects as a
        # duplicate edge.
        classes[1] = classes[1] + 0.1
    print(classes)

    return pd.cut(x=df[column],
                  bins=classes,
                  labels=labels,
                  include_lowest=True)

def removeNulls(df, col):
    """Return the rows of ``df`` whose ``col`` value is not null."""
    has_value = df[col].notna()
    return df.loc[has_value]

def pre_processing(raw_df):
    """Run the full cleaning pipeline on a raw station frame.

    Steps, in order: column standardisation, date/hour column
    transformation, datetime feature creation, day/month/year split,
    removal of rows with a null ``radiacao`` and type conversion.

    Returns:
        The processed frame.
    """
    frame_steps = (
        cols_standardization,
        transform_datetime,
        create_datetime_feature,
        create_split_date_features,
    )

    df = raw_df
    for step in frame_steps:
        df = step(df)

    without_nulls = removeNulls(df, "radiacao")
    return change_types(without_nulls)

def reduce_df(list_df, key):
    """Merge all frames in ``list_df`` left to right into a single frame.

    Args:
        list_df: Sequence of frames to merge.
        key: Column name(s) passed as ``on=`` to ``pd.merge``; when falsy,
            ``pd.merge`` is called without ``on``.
    """
    merge_options = {"on": key} if key else {}

    def merge_pair(left, right):
        return pd.merge(left, right, **merge_options)

    return reduce(merge_pair, list_df)

def load(path):
    """Read a ``;``-separated, ISO-8859-1 encoded CSV, skipping its first 8 lines."""
    read_options = {"sep": ";", "encoding": "ISO-8859-1", "skiprows": 8}
    return pd.read_csv(path, **read_options)

def get_perc_nulls(df):
    """Return, per column, the percentage of rows of ``df`` that are null."""
    null_counts = df.isna().sum()
    total_rows = len(df)
    return null_counts.div(total_rows).mul(100)

def get_percentils(df, col):
    """Print every percentile of ``df[col]`` from 0 to 100, one per line.

    Each line shows the quantile level as text followed by the value of
    ``df[col].quantile`` at that level.
    """
    for percent in range(101):
        # Levels below 100 are written with two decimals ("0.07", "0.42");
        # the final level is written as "1.0".
        level_text = "1.0" if percent == 100 else f"0.{percent:02d}"
        print(level_text, df[col].quantile(float(level_text)))

def plot_by_col(df, col_grouped, col_target):
    """Line-plot the mean of ``col_target`` for each value of ``col_grouped``."""
    subset = df[[col_grouped, col_target]]
    averaged = subset.groupby([col_grouped]).mean().reset_index()

    sns.lineplot(data=averaged, x=averaged[col_grouped], y=averaged[col_target])

def plot_by_range(df, col_x, col_y, dt_start, dt_end):
    """Line-plot ``col_y`` against ``col_x`` for rows with ``col_x`` in ``[dt_start, dt_end]``.

    The rows are selected with ``filter_between``, so both bounds are
    inclusive.
    """
    window = filter_between(df, col_x, dt_start, dt_end)

    sns.lineplot(data=window, x=window[col_x], y=window[col_y])

def filter_between(df, col, value_1, value_2):
    """Return the rows of ``df`` where ``value_1 <= df[col] <= value_2``."""
    # Series.between is inclusive on both ends by default.
    in_range = df[col].between(value_1, value_2)
    return df.loc[in_range]

def set_plot_size(x, y):
    """Apply a seaborn theme whose ``figure.figsize`` is ``(x, y)``."""
    size_rc = {'figure.figsize': (x, y)}
    sns.set_theme(rc=size_rc)

def change_types(df):
    """Convert the decimal-comma text columns of ``df`` to ``float64``, in place.

    Every column except ``data_str`` whose dtype is object or string has its
    commas replaced by dots and is then cast to ``float64``. Other columns
    are left untouched. A frame without a ``data_str`` column is accepted.

    Args:
        df: Frame to convert; it is modified in place.

    Returns:
        The same ``df`` object.
    """
    is_object = pd.api.types.is_object_dtype
    is_string = pd.api.types.is_string_dtype

    for name in list(df.columns):
        if name == "data_str":
            continue

        column = df[name]
        # Checking only for the dtype name "object" would skip columns that
        # pandas stores with a dedicated string dtype.
        if is_object(column) or is_string(column):
            df[name] = column.str.replace(",", ".", regex=False).astype('float64')

    return df

def treat_columns(df):
    """Rename the main measurement columns to short names and drop the rest.

    Columns listed in the rename table receive short names; the auxiliary
    max/min, dew-point, wind-direction and gust columns are then removed.

    Returns:
        A new frame; ``df`` itself is not modified.

    Raises:
        KeyError: If any of the columns to drop is missing from ``df``.
    """
    short_names = {
        'precipitação_total_horário_mm': 'precipitacao',
        'pressao_atmosferica_ao_nivel_da_estacao_horaria_mb': 'press_atmo',
        'temperatura_do_ar_bulbo_seco_horaria_c': 'temperatura',
        'umidade_relativa_do_ar_horaria_perc': 'umidade',
        'vento_velocidade_horaria_ms': 'vento_velocidade_horaria',
    }

    discarded = [
        'pressão_atmosferica_max_na_hora_ant_aut_mb',
        'pressão_atmosferica_min_na_hora_ant_aut_mb',
        'temperatura_do_ponto_de_orvalho_c',
        'temperatura_máxima_na_hora_ant_aut_c',
        'temperatura_mínima_na_hora_ant_aut_c',
        'temperatura_orvalho_max_na_hora_ant_aut_c',
        'temperatura_orvalho_min_na_hora_ant_aut_c',
        'umidade_rel_max_na_hora_ant_aut_perc',
        'umidade_rel_min_na_hora_ant_aut_perc',
        'vento_direção_horaria_gr__gr',
        'vento_rajada_maxima_ms',
    ]

    renamed = df.rename(columns=short_names)
    return renamed.drop(columns=discarded)

def my_autocov(df, col, interval=1):
    """Compute the autocovariance of ``df[col]`` at lag ``interval`` and plot the pairs.

    The series is paired with itself shifted by ``interval`` positions; each
    side is centred on its own mean and the mean product of the deviations
    is returned. The lagged pairs are also drawn as a scatter plot.

    Args:
        df: Frame holding the series.
        col: Name of the numeric column to analyse.
        interval: Non-negative lag between paired observations. A lag of
            ``0`` pairs the series with itself.

    Returns:
        The autocovariance as a float (``nan`` when no pairs exist).

    Raises:
        ValueError: If ``interval`` is negative.
    """
    if interval < 0:
        raise ValueError("interval must be non-negative")

    serie = df[col].to_numpy()

    # Slice by explicit length: ``serie[:-interval]`` would be empty for a
    # lag of 0.
    n_pairs = max(len(serie) - interval, 0)
    xt = serie[:n_pairs]
    xt1 = serie[interval:]

    dev_xt = xt - xt.sum(axis=0) / n_pairs
    dev_xt1 = xt1 - xt1.sum(axis=0) / n_pairs

    df_new = pd.DataFrame(data={"xt": xt, "xt1": xt1})
    sns.scatterplot(data=df_new, x="xt", y="xt1")

    return dev_xt.dot(dev_xt1) / n_pairs

def plot_distrib_horario(df):
    """Bar-plot the mean ``radiacao`` for each hour of the day.

    Only the ``data_hora`` and ``radiacao`` columns are read; ``df`` is not
    modified.

    Args:
        df: Frame with a datetime ``data_hora`` column and a ``radiacao``
            column.
    """
    # assign() works on a fresh frame, so nothing is written into a slice
    # of the caller's data.
    hourly_mean = (
        df[["data_hora", "radiacao"]]
        .assign(hora=lambda frame: frame["data_hora"].dt.hour)
        [["hora", "radiacao"]]
        .groupby(["hora"])
        .mean()
        .reset_index()
    )

    sns.barplot(data=hourly_mean, x="hora", y="radiacao")
   