In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import os
import time
from scipy.fft import fft, ifft
from scipy.interpolate import interp1d
from statsmodels.tsa.stattools import acf,pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Long-form (Portuguese) labels for the 17 hourly measurement columns.
renamed_columns = [
    'precipitacao total,horario (mm)',
    'pressao atmosferica ao nivel da estacao (mb)',
    'pressao atmosferica max. na hora ant. (aut) (mb)',
    'pressao atmosferica min. na hora ant. (aut) (mb)',
    'radiation (kj/m2)',
    'temperatura do ar - bulbo seco (°c)',
    'temperatura do ponto de orvalho (°c)',
    'temperatura maxima na hora ant. (aut) (°c)',
    'temperatura minima na hora ant. (aut) (°c)',
    'temperatura orvalho max. na hora ant. (aut) (°c)',
    'temperatura orvalho min. na hora ant. (aut) (°c)',
    'umidade rel. max. na hora ant. (aut) (%)',
    'umidade rel. min. na hora ant. (aut) (%)',
    'umidade relativa do ar, horaria (%)',
    'vento direcao horaria (gr) (° (gr))',
    'vento rajada maxima (m/s)',
    'vento velocidade horaria (m/s)',
]

# English labels: date/hour, the measurement columns, then station metadata.
# NOTE(review): two entries are still (partly) Portuguese; they are kept
# verbatim because they may be used as column keys elsewhere.
renamed_columns_en = [
    'date',
    'hour',
    'total precipitation (mm)',
    'pressao atmosferica ao nivel da estacao (mb)',
    'atmospheric pressure max. in the previous hour (mb)',
    'atmospheric pressure min. in the previous hour (mb)',
    'radiation (kj/m2)',
    'air temperature - dry bulb (°c)',
    'dew point temperature (°c)',
    'max. temperature in the previous hour (°c)',
    'min. temperature in the previous hour (°c)',
    'dew temperature max. in the previous hour (°c)',
    'dew temperature min. in the previous hour (°c)',
    'relative humidity max. in the previous hour (%)',
    'relative humidity min. in the previous hour (%)',
    'air relative humidity (%)',
    'wind direction (° (gr))',
    'wind rajada maxima (m/s)',
    'wind speed (m/s)',
    'region',
    'state',
    'station',
    'station_code',
    'latitude',
    'longitude',
    'height',
]

# Short column names assigned to the working frame. The first two entries name
# the index columns produced by reset_index(); the rest follow the CSV order.
abbreviation = [
    'level_0', 'index',
    'date', 'hour',
    'prcp', 'stp', 'smax', 'smin', 'gbrd',
    'temp', 'dewp', 'tmax', 'tmin', 'dmax', 'dmin',
    'hmax', 'hmin', 'hmdy',
    'wdct', 'gust', 'wdsp',
    'regi', 'prov', 'wsnm', 'inme',
    'lat', 'lon', 'elvt',
]

# The 17 measurement columns only (same names as abbreviation[4:21]).
abb_1 = [
    'prcp', 'stp', 'smax', 'smin', 'gbrd',
    'temp', 'dewp', 'tmax', 'tmin', 'dmax', 'dmin',
    'hmax', 'hmin', 'hmdy',
    'wdct', 'gust', 'wdsp',
]


def clean_na(df, na_value = -9999):
    """Replace the missing-value sentinel and forward-fill the gaps.

    Steps:
      1. Every cell equal to ``na_value`` becomes NaN.
      2. Rows before the first non-missing entry of the column at position 1
         are dropped. If that column has no valid entry, no rows are dropped.
      3. Remaining NaNs are forward-filled from the previous row. NaNs that
         precede the first valid value of a column cannot be filled this way
         and are left as NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame; must have at least two columns (``IndexError`` otherwise).
    na_value : scalar, default -9999
        Sentinel marking a missing observation.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    df = df.replace(to_replace=na_value, value=np.nan)

    # Trim by position, not by label: `df[label:]` slices positionally on an
    # integer index, which drops the wrong rows unless the index is the
    # default 0..n-1 range.
    valid_mask = df.iloc[:, 1].notna().to_numpy()
    if valid_mask.any():
        df = df.iloc[int(valid_mask.argmax()):]

    # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
    return df.ffill()

def make_dataset(df):
    """Clean ``df`` with ``clean_na`` and return it with a fresh integer index.

    ``reset_index()`` is called without ``drop=True``, so the previous index
    is kept as an extra leading column in the returned frame.
    """
    cleaned = clean_na(df)
    dataset = cleaned.reset_index()
    return dataset

In [5]:
# Read the raw CSV (path is relative to the notebook's working directory).
bdf=pd.read_csv('north.csv')
# Replace sentinels / forward-fill, then reset the index; reset_index() adds the
# old index as a new leading column.
df=make_dataset(bdf)
# Assign the short column names; raises ValueError if the frame does not have
# exactly len(abbreviation) columns.
df.columns=abbreviation
df.head()

  df = df.fillna(method='ffill')


Unnamed: 0,level_0,index,date,hour,prcp,stp,smax,smin,gbrd,temp,...,wdct,gust,wdsp,regi,prov,wsnm,inme,lat,lon,elvt
0,0,0,2000-05-09,00:00,,,,,,,...,,,,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
1,1,1,2000-05-09,01:00,,,,,,,...,,,,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
2,2,2,2000-05-09,02:00,,,,,,,...,,,,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
3,3,3,2000-05-09,03:00,,,,,,,...,,,,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
4,4,4,2000-05-09,04:00,,,,,,,...,,,,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25


In [12]:
# Forward-fill cannot reach NaNs that precede a column's first valid reading,
# so fill whatever is still missing in the measurement columns with that
# column's mean.
# Assign the result back to the frame instead of calling
# `df[x].fillna(..., inplace=True)`: that chained in-place call acts on a
# temporary Series and does not update `df` under pandas Copy-on-Write.
df[abb_1] = df[abb_1].fillna(df[abb_1].mean())

# Drop the leftover index column. errors='ignore' keeps the cell re-runnable
# (a second run would otherwise raise KeyError because the column is gone).
df = df.drop(columns='index', errors='ignore')
df.head()

Unnamed: 0,date,hour,prcp,stp,smax,smin,gbrd,temp,dewp,tmax,...,wdct,gust,wdsp,regi,prov,wsnm,inme,lat,lon,elvt
0,2000-05-09,00:00,0.292075,992.553134,992.900493,992.211504,856.820534,26.517484,21.325548,27.174332,...,155.188355,3.662307,1.282836,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
1,2000-05-09,01:00,0.292075,992.553134,992.900493,992.211504,856.820534,26.517484,21.325548,27.174332,...,155.188355,3.662307,1.282836,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
2,2000-05-09,02:00,0.292075,992.553134,992.900493,992.211504,856.820534,26.517484,21.325548,27.174332,...,155.188355,3.662307,1.282836,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
3,2000-05-09,03:00,0.292075,992.553134,992.900493,992.211504,856.820534,26.517484,21.325548,27.174332,...,155.188355,3.662307,1.282836,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25
4,2000-05-09,04:00,0.292075,992.553134,992.900493,992.211504,856.820534,26.517484,21.325548,27.174332,...,155.188355,3.662307,1.282836,N,AM,MANAUS,A101,-3.103333,-60.016389,61.25


In [None]:
# Convert the calendar date to seconds since the Unix epoch and the time of day
# to seconds since midnight.
# The dtype guards make the cell safe to re-run: once converted the columns are
# numeric, and parsing them again would read the float seconds as nanoseconds
# (date) or fail on the '%H:%M' format (hour).
# NOTE(review): assumes the dates are timezone-naive, as in the displayed rows.
if not pd.api.types.is_numeric_dtype(df['date']):
    date_parsed = pd.to_datetime(df['date'])
    df['date'] = (date_parsed - pd.Timestamp('1970-01-01')) / pd.Timedelta(seconds=1)
if not pd.api.types.is_numeric_dtype(df['hour']):
    hour_parsed = pd.to_datetime(df['hour'], format='%H:%M')
    df['hour'] = hour_parsed.dt.hour * 3600 + hour_parsed.dt.minute * 60

# One-hot encode the categorical station descriptors.
cat_cols = ['regi', 'prov', 'wsnm', 'inme']
encoder = OneHotEncoder(sparse_output=False)
arr_encoded = encoder.fit_transform(df[cat_cols])

# Reuse df's index for the dummy columns: concat(axis=1) aligns on index
# labels, so a fresh 0..n-1 index would misalign rows (and introduce NaNs)
# whenever df is not indexed 0..n-1.
onehot = pd.DataFrame(
    arr_encoded,
    columns=encoder.get_feature_names_out(cat_cols),
    index=df.index,
)

# Final frame: original columns minus the raw categoricals, plus the dummies.
df_encoded = pd.concat([df.drop(columns=cat_cols), onehot], axis=1)
df_encoded.head()

In [None]:
# Standardise every feature to zero mean / unit variance.
# Changes from the original per-column loop:
#   * StandardScaler requires 2-D input; passing a single Series (`df[col]`)
#     raises "Expected 2D array, got 1D array instead".
#   * `df` still contains the raw string columns (regi, prov, wsnm, inme), which
#     cannot be scaled; `df_encoded` is the numeric, one-hot encoded frame.
#     NOTE(review): scaling df_encoded is assumed to be the intent - confirm.
#   * Fitting once on the whole frame keeps all per-column statistics on
#     `scaler` (mean_, scale_), so the transform can be inverted or reused.
# The result goes into a new frame so the cell can be re-run safely.
scaler = StandardScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(df_encoded),
    columns=df_encoded.columns,
    index=df_encoded.index,
)
df_scaled.head()