# Divisão dos Dados do P2MOD em Treino e Teste

In [1]:
import pandas as pd
import os
import warnings
import numpy as np
from sklearn.preprocessing import normalize

warnings.filterwarnings('ignore')

In [2]:
path = "/content/drive/MyDrive/Monografia/Dados Paranaguá/P2MOD/PARQUET"
data = {}

for file in os.listdir(path):
    if file.endswith('.parquet'):
        df = pd.read_parquet(os.path.join(path, file))
        data[file.split('.')[0]] = df

for key, df in data.items():
    print(f"{key}: {df.shape}")

ODAS_Corrente: (113094, 13)
Cattalini_Corrente: (273245, 7)
ODAS_Meteorologia: (104792, 4)
Cattalini_Meteorologia: (594571, 4)
Cattalini_Maregrafo: (273223, 2)
Porto_Maregrafo: (204582, 2)
Porto_Astronomica: (275851, 2)
Porto_Harmonico: (137926, 2)


# ODAS Corrente

In [3]:
odas_corrente = data['ODAS_Corrente'][['Datetime', 'Velocidade 1 [m/s]', 'Direção 1 [°N]']]
odas_corrente.columns = ['datetime', 'cs', 'cd']
odas_corrente['sin_cd'] = np.sin(np.deg2rad(odas_corrente['cd']))
odas_corrente['cos_cd'] = np.cos(np.deg2rad(odas_corrente['cd']))
odas_corrente.drop(columns=['cd'], inplace=True)

X = odas_corrente[['cs']]
X = normalize(X, axis=0)
odas_corrente[['cs']] = X

odas_corrente

Unnamed: 0,datetime,cs,sin_cd,cos_cd
0,2019-04-25 19:30:00+00:00,0.000784,-0.139519,-0.990219
1,2019-04-25 19:40:00+00:00,0.000998,-0.254939,-0.966957
2,2019-04-25 19:50:00+00:00,0.000926,-0.178802,-0.983885
3,2019-04-25 20:00:00+00:00,0.000428,-0.175539,-0.984473
4,2019-04-25 20:10:00+00:00,0.000570,-0.284685,-0.958621
...,...,...,...,...
113089,2021-12-08 14:20:00+00:00,0.014180,0.324413,-0.945916
113090,2021-12-08 14:30:00+00:00,0.008622,0.618134,0.786073
113091,2021-12-08 14:40:00+00:00,0.010973,0.707354,-0.706860
113092,2021-12-08 14:50:00+00:00,0.002850,-0.473166,-0.880973


# Cattalini Corrente

In [4]:
cattalini_corrente = data['Cattalini_Corrente'][['Datetime', 'Velocidade 1 [m/s]', 'Direção 1 [°N]']]
cattalini_corrente.columns = ['datetime', 'cs', 'cd']
cattalini_corrente['sin_cd'] = np.sin(np.deg2rad(cattalini_corrente['cd']))
cattalini_corrente['cos_cd'] = np.cos(np.deg2rad(cattalini_corrente['cd']))
cattalini_corrente.drop(columns=['cd'], inplace=True)

X = cattalini_corrente[['cs']]
X = normalize(X, axis=0)
cattalini_corrente

Unnamed: 0,datetime,cs,sin_cd,cos_cd
0,2019-04-25 19:30:00+00:00,0.046300,-0.813304,0.581839
1,2019-04-25 19:35:00+00:00,0.051444,-0.800731,0.599024
2,2019-04-25 19:40:00+00:00,0.036011,-0.783260,0.621695
3,2019-04-25 19:45:00+00:00,0.025722,-0.597625,0.801776
4,2019-04-25 19:50:00+00:00,0.020578,-0.984103,0.177600
...,...,...,...,...
273240,2021-12-08 14:40:00+00:00,0.221211,0.991872,-0.127238
273241,2021-12-08 14:45:00+00:00,0.241789,0.997637,0.068712
273242,2021-12-08 14:50:00+00:00,0.308667,0.999998,-0.002094
273243,2021-12-08 14:55:00+00:00,0.354967,0.999799,0.020070


# ODAS Meteorologia

In [5]:
odas_meteorologia = data['ODAS_Meteorologia'][['Datetime', 'Velocidade do Vento [m/s]', 'Direção do Vento [°N]']]
odas_meteorologia.columns = ['datetime', 'ws', 'wd']

odas_meteorologia['sin_wd'] = np.sin(np.deg2rad(odas_meteorologia['wd']))
odas_meteorologia['cos_wd'] = np.cos(np.deg2rad(odas_meteorologia['wd']))
odas_meteorologia.drop(columns=['wd'], inplace=True)

X = odas_meteorologia[['ws']]
X = normalize(X, axis=0)
odas_meteorologia[['ws']] = X

odas_meteorologia

Unnamed: 0,datetime,ws,sin_wd,cos_wd
0,2019-04-25 19:30:00+00:00,0.001502,0.956407,-0.292038
1,2019-04-25 19:40:00+00:00,0.001069,0.951164,-0.308685
2,2019-04-25 19:50:00+00:00,0.000952,0.951164,-0.308685
3,2019-04-25 20:00:00+00:00,0.000853,0.891165,-0.453679
4,2019-04-25 20:10:00+00:00,0.000605,0.485115,-0.874450
...,...,...,...,...
104787,2021-12-08 14:20:00+00:00,0.003781,0.325898,-0.945405
104788,2021-12-08 14:30:00+00:00,0.003513,0.422935,-0.906160
104789,2021-12-08 14:40:00+00:00,0.003552,0.358694,-0.933455
104790,2021-12-08 14:50:00+00:00,0.003631,0.325898,-0.945405


# Cattalini Meteorologia

In [6]:
cattalini_meteorologia = data['Cattalini_Meteorologia'][['Datetime', 'Velocidade do Vento [m/s]', 'Direção do Vento [°N]']]
cattalini_meteorologia.columns = ['datetime', 'ws', 'wd']

cattalini_meteorologia['sin_wd'] = np.sin(np.deg2rad(cattalini_meteorologia['wd']))
cattalini_meteorologia['cos_wd'] = np.cos(np.deg2rad(cattalini_meteorologia['wd']))

cattalini_meteorologia.drop(columns=['wd'], inplace=True)

X = cattalini_meteorologia[['ws']]
X = normalize(X, axis=0)
cattalini_meteorologia[['ws']] = X

cattalini_meteorologia

Unnamed: 0,datetime,ws,sin_wd,cos_wd
0,2019-04-25 19:40:00+00:00,0.000610,0.951164,-0.308685
1,2019-04-25 19:50:00+00:00,0.000542,0.951164,-0.308685
2,2019-04-25 20:00:00+00:00,0.000486,0.891165,-0.453679
3,2019-04-25 20:10:00+00:00,0.000345,0.485115,-0.874450
4,2019-04-25 20:20:00+00:00,0.000314,0.544932,-0.838480
...,...,...,...,...
594566,2021-12-08 14:56:00+00:00,0.001823,0.573576,-0.819152
594567,2021-12-08 14:57:00+00:00,0.001842,0.529919,-0.848048
594568,2021-12-08 14:58:00+00:00,0.001858,0.484810,-0.874620
594569,2021-12-08 14:59:00+00:00,0.001827,0.422618,-0.906308


# Cattalini Marégrafo

In [7]:
cattalini_maregrafo = data['Cattalini_Maregrafo'][['Datetime', 'Altura [m]']]
cattalini_maregrafo.columns = ['datetime', 'ssh']

X = cattalini_maregrafo[['ssh']]
X = normalize(X, axis=0)
cattalini_maregrafo[['ssh']] = X

cattalini_maregrafo

Unnamed: 0,datetime,ssh
0,2019-04-25 19:30:00+00:00,0.002344
1,2019-04-25 19:35:00+00:00,0.002360
2,2019-04-25 19:40:00+00:00,0.002360
3,2019-04-25 19:45:00+00:00,0.002360
4,2019-04-25 19:50:00+00:00,0.002360
...,...,...
273218,2021-12-08 14:40:00+00:00,0.002806
273219,2021-12-08 14:45:00+00:00,0.002758
273220,2021-12-08 14:50:00+00:00,0.002695
273221,2021-12-08 14:55:00+00:00,0.002647


# Porto Marégrafo

In [8]:
porto_maregrafo = data['Porto_Maregrafo'][['Datetime', 'Altura [m]']]
porto_maregrafo.columns = ['datetime', 'ssh']

X = porto_maregrafo[['ssh']]
X = normalize(X, axis=0)
porto_maregrafo[['ssh']] = X

porto_maregrafo

Unnamed: 0,datetime,ssh
0,2019-04-25 19:39:13+00:00,0.002626
1,2019-04-25 19:49:13+00:00,0.002626
2,2019-04-25 19:59:13+00:00,0.002626
3,2019-04-25 20:09:13+00:00,0.002626
4,2019-04-25 20:19:13+00:00,0.002607
...,...,...
204577,2021-12-08 14:26:16+00:00,0.003395
204578,2021-12-08 14:31:16+00:00,0.003357
204579,2021-12-08 14:36:16+00:00,0.003301
204580,2021-12-08 14:41:16+00:00,0.003226


In [27]:
import joblib
from sklearn.preprocessing import Normalizer

# Cria o transformador Normalizer com norma L2
transformer = Normalizer(norm='l2')

# Aplica a normalização na coluna 'ssh'
X = transformer.fit_transform(porto_maregrafo[['ssh']].T)

# Salva o transformador em um arquivo
joblib.dump(transformer, 'normalizer_l2.joblib')

# Exibe o DataFrame normalizado
porto_maregrafo[['ssh']] = X.T
porto_maregrafo

Unnamed: 0,datetime,ssh
0,2019-04-25 19:39:13+00:00,0.002626
1,2019-04-25 19:49:13+00:00,0.002626
2,2019-04-25 19:59:13+00:00,0.002626
3,2019-04-25 20:09:13+00:00,0.002626
4,2019-04-25 20:19:13+00:00,0.002607
...,...,...
204577,2021-12-08 14:26:16+00:00,0.003395
204578,2021-12-08 14:31:16+00:00,0.003357
204579,2021-12-08 14:36:16+00:00,0.003301
204580,2021-12-08 14:41:16+00:00,0.003226


# Porto Astronômico

In [None]:
porto_astronomica = data['Porto_Astronomica'][['Datetime', 'Altura [m]']]
porto_astronomica.columns = ['datetime', 'ssh']

X = porto_astronomica[['ssh']]
X = normalize(X, axis=0)
porto_astronomica[['ssh']] = X

porto_astronomica

Unnamed: 0,datetime,ssh
0,2019-04-25 19:30:00+00:00,0.002375
1,2019-04-25 19:35:00+00:00,0.002390
2,2019-04-25 19:40:00+00:00,0.002403
3,2019-04-25 19:45:00+00:00,0.002414
4,2019-04-25 19:50:00+00:00,0.002423
...,...,...
275846,2021-12-08 14:40:00+00:00,0.001641
275847,2021-12-08 14:45:00+00:00,0.001652
275848,2021-12-08 14:50:00+00:00,0.001662
275849,2021-12-08 14:55:00+00:00,0.001670


# Porto Harmônico

In [None]:
porto_harmonico = data['Porto_Harmonico']
porto_harmonico.columns = ['datetime', 'ssh']

X = porto_harmonico[['ssh']]
X = normalize(X, axis=0)
porto_harmonico[['ssh']] = X

porto_harmonico

Unnamed: 0,datetime,ssh
0,2019-04-25 19:30:00+00:00,0.000876
1,2019-04-25 19:40:00+00:00,0.000887
2,2019-04-25 19:50:00+00:00,0.000915
3,2019-04-25 20:00:00+00:00,0.000955
4,2019-04-25 20:10:00+00:00,0.001005
...,...,...
137921,2021-12-08 14:20:00+00:00,0.003785
137922,2021-12-08 14:30:00+00:00,0.003688
137923,2021-12-08 14:40:00+00:00,0.003581
137924,2021-12-08 14:50:00+00:00,0.003467


# Sort

In [None]:
code_data = {
    'odas_corrente' : odas_corrente,
    'cattalini_corrente' : cattalini_corrente,
    'odas_meteorologia' : odas_meteorologia,
    'cattalini_meteorologia' : cattalini_meteorologia,
    'cattalini_maregrafo' : cattalini_maregrafo,
    'porto_maregrafo' : porto_maregrafo,
    'porto_astronomica' : porto_astronomica,
    'porto_harmonico' : porto_harmonico,
}

In [None]:
for key, df in code_data.items():
    df['datetime'] = pd.to_datetime(df['datetime'])
    df = df.sort_values(by='datetime')
    code_data[key] = df

# Crop

In [None]:
# Datas de início e fim
start_date = pd.to_datetime("2019-04-25 19:30:00+00:00")
end_date = pd.to_datetime("2021-12-08 15:00:00+00:00")

percent_80_date = start_date + (end_date - start_date) * 0.8
print("Data correspondente a 80% do intervalo:", percent_80_date)

Data correspondente a 80% do intervalo: 2021-05-31 01:30:00+00:00


In [None]:
start_date = pd.to_datetime("2019-04-25 19:30:00+00:00")
end_date = pd.to_datetime("2021-12-08 15:00:00+00:00")
crop_date = pd.to_datetime("2021-05-31 01:30:00+00:00")

In [None]:
train_data = {}
test_data = {}

for key, df in code_data.items():
    train_data[key] = df[df['datetime'] < percent_80_date]
    test_data[key] = df[df['datetime'] >= percent_80_date]

In [None]:
train_path = '/content/drive/MyDrive/Monografia/Dados Paranaguá/train_test_p2mod/train'
test_path = '/content/drive/MyDrive/Monografia/Dados Paranaguá/train_test_p2mod/test'
general_path = '/content/drive/MyDrive/Monografia/Dados Paranaguá/train_test_p2mod'

for key, df in train_data.items():
    df.to_parquet(os.path.join(train_path, key + '.parquet'), index=False)

for key, df in test_data.items():
    df.to_parquet(os.path.join(test_path, key + '.parquet'), index=False)

for key, df in code_data.items():
    df.to_parquet(os.path.join(general_path, key + '.parquet'), index=False)