In [1]:
# data manipulation
import pandas as pd
import numpy as np
import scipy.io.arff

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [207]:
# Load the dataset: `data` holds the records, `meta` describes the ARFF attributes.
data, meta = scipy.io.arff.loadarff('data/electricity-normalized.arff')
df = pd.DataFrame(data)

# Object columns hold byte-strings; decode them to str one column at a time
# (avoids reshaping the whole frame with a stack()/unstack() round trip).
for col in df.select_dtypes([object]).columns:
    df[col] = df[col].str.decode('utf-8')

# Fix the dtypes: every attribute the ARFF header declares as nominal
# becomes a pandas categorical.
nominal_cols = [col for col, dtype in zip(meta.names(), meta.types()) if dtype == 'nominal']
df = df.astype({col: 'category' for col in nominal_cols})

df.head()

Unnamed: 0,date,day,period,nswprice,nswdemand,vicprice,vicdemand,transfer,class
0,0.0,2,0.0,0.056443,0.439155,0.003467,0.422915,0.414912,UP
1,0.0,2,0.021277,0.051699,0.415055,0.003467,0.422915,0.414912,UP
2,0.0,2,0.042553,0.051489,0.385004,0.003467,0.422915,0.414912,UP
3,0.0,2,0.06383,0.045485,0.314639,0.003467,0.422915,0.414912,UP
4,0.0,2,0.085106,0.042482,0.251116,0.003467,0.422915,0.414912,DOWN


## Criação de novas colunas a partir das colunas originais

In [204]:
# Timestamp assigned to the first row. The rows are assumed to be contiguous,
# evenly spaced samples -- NOTE(review): confirm against the dataset docs.
start_date = '1996-05-07'
samples_daily = 48
sample_count = len(df)
interval = pd.Timedelta(days=1) / samples_daily

# Make `day` a plain integer column. Going through str + to_numeric accepts
# categorical/str input as well as an already-numeric column (cell re-run).
df['day'] = pd.to_numeric(df['day'].astype(str)).astype('int64')

# Reconstruct one timestamp per row from the start date, then keep only the
# month; the timestamps themselves are not part of the final frame.
timestamps = pd.date_range(start=start_date, periods=sample_count, freq=interval)
df['month'] = np.asarray(timestamps.month, dtype='int64')

# Days 6 and 7 are treated as the weekend.
df['is_weekend'] = df['day'].isin([6, 7])

# The small constant in the denominator prevents a division by zero.
df['price_ratio'] = df['vicprice'] / (df['nswprice'] + 0.00001)

# Label each row by comparing vicprice with the mean of the previous day's
# samples. shift(1) excludes the current row from its own average; the first
# row has no history, so its average is NaN and the label falls back to 'DOWN'.
vic_rolling_average = (
    df['vicprice']
    .shift(1)
    .rolling(window=samples_daily, min_periods=0)
    .mean()
)
df['vic_class'] = np.where(df['vicprice'] > vic_rolling_average, 'UP', 'DOWN')

# Final column layout; any column not listed here is dropped.
new_order = [
    'date',
    'day',
    'period',
    'month',
    'is_weekend',
    'nswprice',
    'nswdemand',
    'vicprice',
    'vicdemand',
    'transfer',
    'price_ratio',
    'vic_class',
    'class',
]

df = df[new_order]

df.head(100)

Unnamed: 0,date,day,period,month,is_weekend,nswprice,nswdemand,vicprice,vicdemand,transfer,price_ratio,vic_class,class
0,0.000000,2,0.000000,5,False,0.056443,0.439155,0.003467,0.422915,0.414912,0.061414,DOWN,UP
1,0.000000,2,0.021277,5,False,0.051699,0.415055,0.003467,0.422915,0.414912,0.067048,DOWN,UP
2,0.000000,2,0.042553,5,False,0.051489,0.385004,0.003467,0.422915,0.414912,0.067322,DOWN,UP
3,0.000000,2,0.063830,5,False,0.045485,0.314639,0.003467,0.422915,0.414912,0.076206,DOWN,UP
4,0.000000,2,0.085106,5,False,0.042482,0.251116,0.003467,0.422915,0.414912,0.081592,DOWN,DOWN
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000044,3,1.000000,5,False,0.054642,0.449866,0.003467,0.422915,0.414912,0.063438,DOWN,UP
96,0.000088,4,0.000000,5,False,0.054642,0.448230,0.003467,0.422915,0.414912,0.063438,DOWN,UP
97,0.000088,4,0.021277,5,False,0.054642,0.429485,0.003467,0.422915,0.414912,0.063438,DOWN,UP
98,0.000088,4,0.042553,5,False,0.051489,0.399286,0.003467,0.422915,0.414912,0.067322,DOWN,UP


## One-hot encoding da coluna "day"

In [208]:
# Expand `day` into one indicator column per value, then give the generated
# `day_<n>` columns readable `is_<weekday>` names (day 1 maps to Monday).
weekday_names = [
    'monday',
    'tuesday',
    'wednesday',
    'thursday',
    'friday',
    'saturday',
    'sunday',
]
day_column_names = {
    f'day_{number}': f'is_{name}'
    for number, name in enumerate(weekday_names, start=1)
}

df = pd.get_dummies(df, columns=['day'], prefix='day').rename(columns=day_column_names)

df.head(100)

Unnamed: 0,date,period,nswprice,nswdemand,vicprice,vicdemand,transfer,class,is_monday,is_tuesday,is_wednesday,is_thursday,is_friday,is_saturday,is_sunday
0,0.000000,0.000000,0.056443,0.439155,0.003467,0.422915,0.414912,UP,False,True,False,False,False,False,False
1,0.000000,0.021277,0.051699,0.415055,0.003467,0.422915,0.414912,UP,False,True,False,False,False,False,False
2,0.000000,0.042553,0.051489,0.385004,0.003467,0.422915,0.414912,UP,False,True,False,False,False,False,False
3,0.000000,0.063830,0.045485,0.314639,0.003467,0.422915,0.414912,UP,False,True,False,False,False,False,False
4,0.000000,0.085106,0.042482,0.251116,0.003467,0.422915,0.414912,DOWN,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000044,1.000000,0.054642,0.449866,0.003467,0.422915,0.414912,UP,False,False,True,False,False,False,False
96,0.000088,0.000000,0.054642,0.448230,0.003467,0.422915,0.414912,UP,False,False,False,True,False,False,False
97,0.000088,0.021277,0.054642,0.429485,0.003467,0.422915,0.414912,UP,False,False,False,True,False,False,False
98,0.000088,0.042553,0.051489,0.399286,0.003467,0.422915,0.414912,UP,False,False,False,True,False,False,False
