Tratar CSV

In [4]:
import datetime
import pandas as pd
import re

def corrigir_diferenca_tempo(dataframe):
    dataframe['datetime'] = pd.to_datetime(dataframe['time'], unit='s')

    time_diff = dataframe['datetime'].diff()
    time_diff = time_diff.fillna(pd.Timedelta(seconds=0))
    time_diff = time_diff.apply(lambda x: pd.Timedelta(minutes=1) if x.total_seconds() == 1 else x)

    dataframe['datetime'] = dataframe['datetime'] + time_diff.cumsum()

    dataframe['datetime'] = dataframe['datetime'].apply(lambda dt: dt.replace(second=0))
    
    return dataframe

def timestamp_para_datahora(timestamp):
    datahora = datetime.datetime.fromtimestamp(timestamp)
    return datahora

def merge_dataframe(df, group_size=60):
    # Cria o dataframe auxiliar
    df_merged = pd.DataFrame()

    # Loop pelos grupos de tamanho "group_size"
    for i in range(0, len(df), group_size):
        # Seleciona o grupo atual
        df_group = df.iloc[i:i+group_size,:]

        # Loop pelas colunas do grupo
        for col in df_group.columns:

            # Verifica se a coluna é a datetime
            if col == 'datetime':
                # Armazena o valor da primeira linha da coluna
                col_value = df_group[col].iloc[0]

                # Ajusta apenas a hora mantendo os minutos
                col_value = col_value.replace(minute=0, second=0)

            else:
                # Armazena o valor da primeira linha da coluna
                col_value = df_group[col].iloc[0]

                # Loop pelas linhas da coluna
                for j in range(1, len(df_group)):

                    # Verifica se o valor atual é maior ou menor que o valor armazenado
                    if df_group[col].iloc[j] > col_value:
                        # Soma a diferença na variável armazenada
                        col_value += df_group[col].iloc[j] - col_value
                        
                    elif df_group[col].iloc[j] < col_value:
                        # Soma a diferença na variável armazenada
                        col_value -= col_value - df_group[col].iloc[j]

            # Insere a coluna e valor no dataframe auxiliar
            df_merged.loc[i//group_size, col] = col_value

    return df_merged



def formatar_dataframe(dataframe: pd.DataFrame) -> pd.DataFrame:
    # Converter a coluna "time" para valores numéricos e filtrar os valores nulos e não numéricos
    dataframe['time'] = pd.to_numeric(dataframe['time'], errors='coerce')
    dataframe = dataframe[~dataframe['time'].isna()]

    # Converter a coluna "time" para timestamp e criar a coluna "datetime"
    dataframe = corrigir_diferenca_tempo(dataframe)

    # Apagar a coluna "time", "summary", "icon", "cloudCover"
    dataframe = dataframe.drop(columns=['time', 'summary', 'icon', 'cloudCover'])

    # Ordena alfabeticamente as colunas
    dataframe = dataframe.sort_index(axis=1)

    return merge_dataframe(dataframe)

Salvar CSV Tratado

In [5]:
import pandas as pd

filepath = '../Files/HomeCTratado3.csv'
dataframe = formatar_dataframe(pd.read_csv('../Files/HomeC.csv', delimiter=',', low_memory=False))

dataframe.to_csv(filepath, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['datetime'] = pd.to_datetime(dataframe['time'], unit='s')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['datetime'] = dataframe['datetime'] + time_diff.cumsum()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['datetime'] = dataframe['datetime'].apply(lambda dt: dt.repl

Treinar modelo

In [2]:
from sklearn.ensemble import IsolationForest
import pandas as pd

#Abrir CSV ja tratado
df = pd.read_csv('../Files/HomeCTratado.csv', delimiter=',', low_memory=False)

#Colunas que não envolvem KW
exclude_columns = ['apparentTemperature','datetime','dewPoint','humidity','precipIntensity','precipProbability','pressure','temperature','visibility','windBearing','windSpeed']

#Salvar todos os dados do dataframe menos as colunas acima
X = df[[column for column in list(df.columns) if column not in exclude_columns]]

#Definir padrões pro algoritmo
isolation_forest = IsolationForest(n_estimators=100, contamination='auto')

#Treinar o algoritmo
isolation_forest.fit(X)

#Achar as anomalias
y_pred = isolation_forest.predict(X)

#Adicionar nova coluna dizendo se é ou não anomalia
df['anomaly'] = y_pred

#Dataframe somente com anomalias
anomaly = df.loc[df['anomaly'] == -1]

anomaly

Unnamed: 0,Barn [kW],Dishwasher [kW],Fridge [kW],Furnace 1 [kW],Furnace 2 [kW],Garage door [kW],Home office [kW],House overall [kW],Kitchen 12 [kW],Kitchen 14 [kW],...,humidity,precipIntensity,precipProbability,pressure,temperature,use [kW],visibility,windBearing,windSpeed,anomaly
18,0.035883,0.000050,0.593800,0.486150,0.658983,0.012333,0.041833,5.359650,0.000617,0.000650,...,0.70,0.0000,0.00,1013.62,32.87,5.359650,8.06,273.0,9.14,-1
23,0.034200,1.231283,0.136100,0.492267,0.680150,0.011467,0.051483,2.908333,0.000467,0.000833,...,0.60,0.0000,0.00,1014.19,30.04,2.908333,10.00,275.0,8.02,-1
24,0.034550,1.366750,0.003850,0.493400,0.071683,0.011017,0.051950,2.094050,0.000933,0.001317,...,0.60,0.0000,0.00,1014.24,29.56,2.094050,10.00,273.0,8.04,-1
41,0.493183,0.000017,0.037283,0.316250,0.676550,0.012267,0.090567,2.772533,0.000850,0.000067,...,0.57,0.0000,0.00,1014.28,33.22,2.772533,10.00,244.0,6.59,-1
66,0.029383,0.000083,0.017833,0.021367,0.653133,0.013300,0.171517,2.207400,0.000633,0.000233,...,0.69,0.0000,0.00,1010.23,32.80,2.207400,10.00,217.0,4.14,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8219,0.029017,1.349867,0.122150,0.150917,0.641850,0.011883,0.041233,3.015117,0.000467,0.000717,...,0.64,0.0000,0.00,1023.87,34.59,3.015117,10.00,310.0,8.74,-1
8220,0.029283,1.346433,0.124800,0.017933,0.073583,0.011533,0.041617,2.313217,0.000617,0.001100,...,0.67,0.0000,0.00,1024.11,32.41,2.313217,9.62,306.0,5.83,-1
8246,0.028950,1.345633,0.003833,0.476500,0.073367,0.010867,0.041183,1.784033,0.000900,0.001300,...,0.73,0.0047,0.20,1022.22,36.61,1.784033,5.55,197.0,10.79,-1
8247,0.028883,1.336417,0.003317,0.634250,0.072383,0.010817,0.556333,2.668050,0.000983,0.001683,...,0.87,0.0164,0.57,1021.29,35.34,2.668050,3.64,186.0,8.85,-1
