In [1]:
# Notebook-wide imports and one-off session configuration.
# NOTE(review): json, numpy, matplotlib, the sklearn helpers and XGBRegressor
# are not referenced in the cells visible here — confirm before pruning.
import os

import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

from dotenv import load_dotenv
# Presumably loads MLFLOW_URL / MLFLOW_EXPERIMENT_NAME (read further down via
# os.getenv / os.environ) from a local .env file — verify the file exists.
load_dotenv()

import mlflow

# os.getenv returns None when MLFLOW_URL is unset, so a missing variable is
# passed on as None here instead of failing loudly.
mlflow.set_tracking_uri(os.getenv('MLFLOW_URL'))

import warnings
# Blanket filter: every warning category is silenced for the whole session.
warnings.filterwarnings('ignore')

from xgboost import XGBRegressor

In [33]:
def load_failures() -> pd.DataFrame:
    """Read the 2016 and 2017 failure logs and return them as one table.

    The two semicolon-separated CSV files are stacked, ``Timestamp`` is parsed
    and becomes the sorted index, every row belonging to turbine ``'T09'`` is
    discarded, and ``Turbine_ID`` is turned from a label such as ``'T07'`` into
    the integer that follows its first character.
    """
    sources = (
        '../data/raw/htw-failures-2016.csv',
        '../data/raw/htw-failures-2017.csv',
    )
    yearly_logs = [pd.read_csv(path, sep=';') for path in sources]

    failures = pd.concat(yearly_logs, axis=0).reset_index(drop=True)
    failures['Timestamp'] = pd.to_datetime(failures['Timestamp'])
    failures = failures.set_index('Timestamp').sort_index()

    kept = failures[failures['Turbine_ID'] != 'T09']
    # 'T07' -> 7: drop the leading letter and parse the remainder.
    turbine_numbers = kept['Turbine_ID'].apply(lambda label: int(label[1:]))
    return kept.assign(Turbine_ID=turbine_numbers)

def trb_per_failures(since: str = '2017-06-01') -> dict:
    """Map every failed component to the turbines on which it failed recently.

    Args:
        since: Inclusive lower bound on the failure timestamp. The default,
            ``'2017-06-01'``, is the cut-off this notebook previously had
            hard-coded, so existing calls behave exactly as before.

    Returns:
        ``{component: [turbine_id, ...]}``. Every component that appears
        anywhere in the failure log gets a key; its list is empty when none of
        its failures happened on or after ``since``.
    """
    failures = load_failures()
    # The date filter is the same for every component, so apply it only once.
    recent = failures[failures.index >= since]
    return {
        comp: recent.loc[recent['Component'] == comp, 'Turbine_ID'].unique().tolist()
        # Iterate over the *unfiltered* components so that components without
        # a recent failure are still reported (with an empty list).
        for comp in failures['Component'].unique()
    }

def load_costs() -> pd.DataFrame:
    """Read the wind-farm cost table and index it by ``Component``."""
    cost_table = pd.read_csv('../data/raw/HTW_Costs.csv')
    return cost_table.set_index('Component')

In [169]:
# Resolve the experiment named in the environment and pull all of its runs
# tagged MODULE = 'hackathon' into a DataFrame.
experiment_name = os.environ['MLFLOW_EXPERIMENT_NAME']
matching_experiments = mlflow.search_experiments(filter_string=f"name = '{experiment_name}'")
if not matching_experiments:
    # Previously this surfaced as a bare "list index out of range".
    raise LookupError(
        f"No MLflow experiment named {experiment_name!r} was found; "
        "check MLFLOW_EXPERIMENT_NAME and the tracking URI."
    )
experiment_id = matching_experiments[0].experiment_id
# search_runs documents experiment_ids as a list of ids.
runs = mlflow.search_runs(experiment_ids=[experiment_id], filter_string="tags.MODULE = 'hackathon'")

In [170]:
# Highest `metrics.total_cost` reached under each MLflow run name,
# listed alphabetically by run name.
run_names = runs['tags.mlflow.runName']
ind_model = pd.DataFrame(
    [
        [name, runs.loc[run_names == name, 'metrics.total_cost'].max()]
        for name in run_names.unique()
    ],
    columns=['run', 'total_cost'],
)
ind_model = ind_model.sort_values(by='run').reset_index(drop=True)
print(ind_model)

                                       run    total_cost
0         hackathon_trb11_Hyd_Oil_Temp_Avg  15233.101852
1   hackathon_trb1_HVTrafo_Phase1_Temp_Avg  27718.217593
2   hackathon_trb1_HVTrafo_Phase2_Temp_Avg  32934.884259
3   hackathon_trb1_HVTrafo_Phase3_Temp_Avg  30351.550926
4        hackathon_trb6_Gear_Bear_Temp_Avg  59430.864198
5         hackathon_trb6_Gear_Oil_Temp_Avg  79875.308642
6          hackathon_trb6_Hyd_Oil_Temp_Avg   7835.331790
7        hackathon_trb7_Gen_Bear2_Temp_Avg  14244.135802
8         hackathon_trb7_Gen_Bear_Temp_Avg  14292.746914
9       hackathon_trb7_Gen_Phase1_Temp_Avg  27932.986111
10      hackathon_trb7_Gen_Phase2_Temp_Avg  27932.986111
11      hackathon_trb7_Gen_Phase3_Temp_Avg  27932.986111
12    hackathon_trb7_Gen_SlipRing_Temp_Avg  37432.986111
13         hackathon_trb7_Hyd_Oil_Temp_Avg  13360.200617


In [171]:
costs = load_costs()
replacement_cost = costs["Replacement_Cost"]
repair_cost = costs["Repair_Cost"]

# Per component: what is saved when a failure is repaired instead of replaced.
for comp in costs.index:
    print(f"Maximum possible benefit of {comp} model: ", replacement_cost.loc[comp] - repair_cost.loc[comp])

print('-'*50)
# Every component is counted once through the sums; HYDRAULIC_GROUP is added
# two more times (three in total).
# NOTE(review): presumably because three turbines have a hydraulic-group
# failure in the evaluated period — confirm against trb_per_failures().
hydraulic = "HYDRAULIC_GROUP"
maximum_possible_benefit = (
    replacement_cost.sum()
    + replacement_cost.loc[hydraulic]*2
    - repair_cost.sum()
    - repair_cost.loc[hydraulic]*2
)
print("Maximum possible benefit of hackathon models: ", maximum_possible_benefit)

Maximum possible benefit of GEARBOX model:  80000
Maximum possible benefit of GENERATOR model:  45000
Maximum possible benefit of GENERATOR_BEARING model:  17500
Maximum possible benefit of TRANSFORMER model:  46500
Maximum possible benefit of HYDRAULIC_GROUP model:  17000
--------------------------------------------------
Maximum possible benefit of hackathon models:  240000


In [172]:
# Sensor columns whose models are taken to cover each component.
targets = {
    'GENERATOR': [
        'Gen_Phase1_Temp_Avg',
        'Gen_Phase2_Temp_Avg',
        'Gen_Phase3_Temp_Avg',
        'Gen_SlipRing_Temp_Avg',
    ],
    'HYDRAULIC_GROUP': ['Hyd_Oil_Temp_Avg'],
    'GENERATOR_BEARING': ['Gen_Bear_Temp_Avg', 'Gen_Bear2_Temp_Avg'],
    'TRANSFORMER': ['HVTrafo_Phase1_Temp_Avg', 'HVTrafo_Phase2_Temp_Avg', 'HVTrafo_Phase3_Temp_Avg'],
    'GEARBOX': ['Gear_Oil_Temp_Avg', 'Gear_Bear_Temp_Avg'],
}

# Read the failure log once; calling trb_per_failures() inside the loop
# re-parsed both CSV files on every iteration.
failed_turbines = trb_per_failures()

# For every (component, turbine) pair with a failure, keep the best-scoring
# run among the models trained on that component's sensor columns.
best_rows = []
for component, turbine_ids in failed_turbines.items():
    # `turbine_id` rather than `id`, which would shadow the builtin for the
    # rest of the kernel session.
    for turbine_id in turbine_ids:
        candidates = runs[
            (runs['tags.col'].isin(targets[component]))
            & (runs['tags.trb_num'] == str(turbine_id))
        ]
        if candidates.empty:
            # No model was trained for this pair; leave it out of the summary.
            continue
        best_label = candidates['metrics.total_cost'].idxmax()
        best_rows.append([
            component,
            turbine_id,
            candidates['metrics.total_cost'].max(),
            candidates['tags.col'][best_label],
        ])

df = pd.DataFrame(best_rows, columns=['Component', 'Turbine_ID', 'Cost', 'Variable'])
df['Turbine_ID'] = df['Turbine_ID'].astype(int)
df = df.sort_values(by=['Turbine_ID', 'Component']).reset_index(drop=True)

# Ceiling per row (replacement minus repair cost), computed once and reused.
max_benefit = df.Component.map((costs['Replacement_Cost'] - costs['Repair_Cost']).to_dict())
# Share of the ceiling that the best model achieved, in percent.
df['Model performance'] = (1 - (max_benefit - df.Cost) / max_benefit) * 100
print(df)
print('-'*50)
print('Total', df['Cost'].sum())
print('Percentage of total cost respect to maximum possible benefit', round(df['Cost'].sum()/maximum_possible_benefit*100, 2), '%')


           Component  Turbine_ID          Cost                 Variable  \
0        TRANSFORMER           1  32934.884259  HVTrafo_Phase2_Temp_Avg   
1            GEARBOX           6  79875.308642        Gear_Oil_Temp_Avg   
2    HYDRAULIC_GROUP           6   7835.331790         Hyd_Oil_Temp_Avg   
3          GENERATOR           7  37432.986111    Gen_SlipRing_Temp_Avg   
4  GENERATOR_BEARING           7  14292.746914        Gen_Bear_Temp_Avg   
5    HYDRAULIC_GROUP           7  13360.200617         Hyd_Oil_Temp_Avg   
6    HYDRAULIC_GROUP          11  15233.101852         Hyd_Oil_Temp_Avg   

   Model performance  
0          70.827708  
1          99.844136  
2          46.090187  
3          83.184414  
4          81.672840  
5          78.589415  
6          89.606481  
--------------------------------------------------
Total 200964.56018518523
Percentage of total cost respect to maximum possible benefit 83.74 %


In [173]:
# For every run name pick one winning run: the one with the highest
# `metrics.total_cost`, with ties broken towards the cheaper model.
#   winners: run name -> artifact URI of the winning run
#   res:     run name -> {param column: value} of the winning run
winners = {}
res = {}
for key in ind_model.run.values:
    results = runs[(runs['tags.mlflow.runName'] == key)]
    results = results[results['metrics.total_cost'] == results['metrics.total_cost'].max()]
    param_columns = [x for x in results.columns if 'params' in x]
    aux = results[['run_id', 'artifact_uri', 'metrics.total_cost'] + param_columns]
    print('Model name: ', key, '->', aux['metrics.total_cost'].mean())

    if aux.empty:
        # Happens when no run under this name has a usable total_cost (the
        # max is NaN); the former code died here with an IndexError.
        print('Winner:  none (no run with a valid total_cost)')
        print('')
        continue

    if len(aux) > 1:
        # Tie-break: prefer high resample_time, then low n_estimators.
        # MLflow returns params as strings, so compare them numerically;
        # a plain string sort ranks '12' < '24' < '6'.
        # NOTE(review): the previous sort used ascending=[True, False], the
        # opposite of the stated intent. This follows the stated intent, so
        # tied winners can change — confirm this is what was meant.
        # With a single varying column (or none) the same stable sort simply
        # keeps the existing order for the constant column(s).
        aux = aux.sort_values(
            by=['params.resample_time', 'params.n_estimators'],
            ascending=[False, True],
            key=pd.to_numeric,
        )

    print('Winner: ', aux['run_id'].iloc[0])
    winners[key] = aux['artifact_uri'].iloc[0]
    res[key] = aux[param_columns].iloc[0].to_dict()
    print('')

Model name:  hackathon_trb11_Hyd_Oil_Temp_Avg -> 15233.101851851852
Winner:  d094dc2b1e3a4321b1c1d4feb081e8c9

Model name:  hackathon_trb1_HVTrafo_Phase1_Temp_Avg -> 27718.21759259259
Winner:  04cc0fe4ceb742d8aabd207a5ab4a080

Model name:  hackathon_trb1_HVTrafo_Phase2_Temp_Avg -> 32934.88425925926
Winner:  e95002c393024e0f9ba0011c30e944d9

Model name:  hackathon_trb1_HVTrafo_Phase3_Temp_Avg -> 30351.550925925927
Winner:  753c538aa6864acda09a5ca1d803cf1c

Model name:  hackathon_trb6_Gear_Bear_Temp_Avg -> 59430.86419753086
Winner:  bf402daeb786441ea74051c233d92cd0

Model name:  hackathon_trb6_Gear_Oil_Temp_Avg -> 79875.30864197531
Winner:  62283ff8f772494a88a1ded2ba6002a9

Model name:  hackathon_trb6_Hyd_Oil_Temp_Avg -> 7835.331790123458
Winner:  4b06be1124534f4eb4a0dc230d3b2187

Model name:  hackathon_trb7_Gen_Bear2_Temp_Avg -> 14244.135802469133
Winner:  b111556f579a44c28023c5211c4db7b3

Model name:  hackathon_trb7_Gen_Bear_Temp_Avg -> 14292.746913580244
Winner:  35e81a01f66d4c349c7c4

In [174]:
# Copy each winning run's `check.png` artifact into the reports folder,
# named after the run.
winners_dir = "../reports/winners"
# image.save() does not create missing folders; make sure the target exists
# so the cell also works on a fresh checkout.
os.makedirs(winners_dir, exist_ok=True)
for winner, artifact_uri in winners.items():
    image = mlflow.artifacts.load_image(artifact_uri + "/check.png")
    image.save(f"{winners_dir}/{winner}.png")

In [175]:
# One column per winning run name, one row per `params.*` entry; persisted so
# the chosen hyper-parameters can be reloaded elsewhere.
df = pd.DataFrame.from_dict(res, orient='columns')
df.to_parquet(path='../data/processed/best_xgb_models_parameters.parquet')

In [176]:
# Display the current `df` as the cell output.
df

Unnamed: 0,hackathon_trb11_Hyd_Oil_Temp_Avg,hackathon_trb1_HVTrafo_Phase1_Temp_Avg,hackathon_trb1_HVTrafo_Phase2_Temp_Avg,hackathon_trb1_HVTrafo_Phase3_Temp_Avg,hackathon_trb6_Gear_Bear_Temp_Avg,hackathon_trb6_Gear_Oil_Temp_Avg,hackathon_trb6_Hyd_Oil_Temp_Avg,hackathon_trb7_Gen_Bear2_Temp_Avg,hackathon_trb7_Gen_Bear_Temp_Avg,hackathon_trb7_Gen_Phase1_Temp_Avg,hackathon_trb7_Gen_Phase2_Temp_Avg,hackathon_trb7_Gen_Phase3_Temp_Avg,hackathon_trb7_Gen_SlipRing_Temp_Avg,hackathon_trb7_Hyd_Oil_Temp_Avg
params.min_child_weight,3,5,1,5,5,3,5,5,1,5,5,5,1,5
params.center,median,median,median,median,median,median,median,median,median,median,median,median,median,median
params.num_dev,3,3,3,3,3,3,3,3,3,3,3,3,3,3
params.n_estimators,500,500,50,50,500,100,100,50,50,100,100,100,50,200
params.tree_method,gpu_hist,gpu_hist,gpu_hist,gpu_hist,gpu_hist,gpu_hist,gpu_hist,gpu_hist,gpu_hist,gpu_hist,gpu_hist,gpu_hist,gpu_hist,gpu_hist
params.deviation,mad,std,std,mad,std,std,mad,mad,std,std,std,std,std,mad
params.learning_rate,0.01,0.01,0.05,0.01,0.01,0.1,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01
params.max_depth,5,7,9,3,7,3,7,9,3,9,9,9,5,3
params.resample_time,24,12,6,24,24,12,6,24,6,6,6,6,6,12
