ALGORITMO PER ANOMALY DETECTION

In [55]:
import numpy as np 
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
 

from sklearn.ensemble import RandomForestRegressor


In [56]:
# Read the two plant CSV files; both paths are relative to the working directory.
p1 = pd.read_csv('first_plant.csv')
p2 = pd.read_csv('second_plant.csv')

In [57]:
def anomaly_detection(data, tolerance=0.30, test_size=0.2, n_estimators=100, random_state=42):
    """Flag held-out rows whose daily yield deviates from a model prediction.

    A random forest regressor is fitted on a random split of ``data`` to
    predict ``daily_yield``. Each row of the held-out split whose actual
    ``daily_yield`` lies outside ``prediction ± tolerance * |prediction|``
    is reported as an anomaly. Only the held-out rows are scored; rows used
    for training never appear in the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``source_key_x``, ``dc_power``, ``date_time`` (in the
        ``%Y-%m-%d %H:%M:%S`` format), ``daily_yield``, ``total_yield``,
        ``ac_power``, ``ambient_temperature``, ``module_temperature`` and
        ``irradiation``. Every column other than ``dc_power``,
        ``daily_yield`` and ``total_yield`` is fed to the model as a
        feature. The frame itself is not modified.
    tolerance : float, default 0.30
        Relative half-width of the accepted band around the prediction.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    n_estimators : int, default 100
        Number of trees in the random forest.
    random_state : int, default 42
        Seed used for both the split and the forest.

    Returns
    -------
    pandas.DataFrame
        One row per anomaly, renumbered from 0, with the columns
        ``date_time``, ``ac_power``, ``ambient_temperature``,
        ``module_temperature``, ``irradiation``, ``energy_prediction``,
        ``daily_yield`` and ``inverter_name``. Empty (same columns) when
        nothing is flagged.

    Raises
    ------
    ValueError
        If ``tolerance`` is negative.
    """
    if tolerance < 0:
        raise ValueError("tolerance must be non-negative")

    # Feature columns that are copied into the returned report.
    report_features = ['date_time', 'ac_power', 'ambient_temperature',
                       'module_temperature', 'irradiation']

    # One-hot encode the inverter identifier; dc_power is not used as a feature.
    df_encoded = pd.get_dummies(data, columns=['source_key_x']).drop(columns=['dc_power'])

    # The model needs numeric input: parse the timestamps and express them as
    # seconds since 1970-01-01 (naive datetimes are treated as UTC).
    parsed_time = pd.to_datetime(df_encoded['date_time'], format='%Y-%m-%d %H:%M:%S')
    df_encoded['date_time'] = (parsed_time - pd.Timestamp('1970-01-01')) / pd.Timedelta(seconds=1)

    # Features and target (the target to predict is the daily yield).
    X = df_encoded.drop(columns=['daily_yield', 'total_yield'])
    y = df_encoded['daily_yield']

    # Split the original inverter labels together with X and y so that every
    # held-out row keeps its label; this avoids reverse-engineering the name
    # from the one-hot columns afterwards.
    # NOTE(review): the split is random, not chronological -- confirm that is
    # intended for this time-indexed data.
    X_train, X_test, y_train, y_test, _, inverter_test = train_test_split(
        X, y, data['source_key_x'],
        test_size=test_size, random_state=random_state)

    rf_model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    rf_model.fit(X_train, y_train)

    predicted = rf_model.predict(X_test)
    actual = y_test.to_numpy()

    # Accepted band around each prediction. Using |prediction| keeps the band
    # well-formed (upper >= lower) even if a prediction is negative.
    margin = np.abs(predicted) * tolerance
    is_anomaly = (actual > predicted + margin) | (actual < predicted - margin)

    # Assemble the report on a fresh copy so the split output is not mutated.
    report = X_test[report_features].copy()
    report['date_time'] = pd.to_datetime(report['date_time'], unit='s')
    report['energy_prediction'] = predicted
    report['daily_yield'] = actual
    # Labels are rendered as strings; a missing inverter id is reported as None.
    report['inverter_name'] = [str(name) if pd.notna(name) else None
                               for name in inverter_test]

    return report.loc[is_anomaly].reset_index(drop=True)
    
# Run the detector on the first plant; as the last expression of the cell,
# the returned anomaly table is rendered as the cell output.
anomaly_detection(data=p1)

Unnamed: 0,date_time,ac_power,ambient_temperature,module_temperature,irradiation,energy_prediction,daily_yield,inverter_name
0,2020-06-06 06:00:00,13.914286,20.426455,18.254061,0.011280,0.685357,0.285714,ZnxXDlPa8U1GXgE
1,2020-05-15 21:15:00,0.000000,22.119360,20.632086,0.000000,1241.220000,0.000000,ZnxXDlPa8U1GXgE
2,2020-06-01 06:00:00,8.475000,21.278394,19.965544,0.008307,0.350357,0.625000,adLQvlD726eNBSB
3,2020-05-16 06:00:00,5.350000,21.923237,20.256814,0.006206,0.086607,0.000000,WRmjgnKYAwPKWDb
4,2020-05-15 21:30:00,0.000000,22.221250,20.758700,0.000000,383.070000,0.000000,VHMLBKoKgIrUVDU
...,...,...,...,...,...,...,...,...
114,2020-06-06 23:45:00,0.000000,22.123830,20.200745,0.000000,5873.000000,2936.500000,bvBOhCH3iADSZry
115,2020-05-17 06:00:00,0.000000,22.693113,21.416339,0.004785,0.002500,0.000000,VHMLBKoKgIrUVDU
116,2020-06-11 23:45:00,0.000000,22.883351,20.635208,0.000000,5631.523750,2421.857143,wCURE6d3bPkepu2
117,2020-05-25 23:45:00,0.000000,22.384967,19.720331,0.000000,8848.870000,4429.500000,wCURE6d3bPkepu2
