In [None]:
import math
from datetime import datetime, timedelta, timezone

import pandas as pd
from sqlalchemy import MetaData
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session, registry

from NextVisionML.util import get_engine

In [None]:
# SQLAlchemy setup
# The engine is obtained from the project helper.
engine = get_engine()
# Reflect the tables that already exist in the database into `metadata`.
metadata = MetaData()
metadata.reflect(bind=engine)
# Build automapped ORM classes on top of the reflected metadata.
Base = automap_base(metadata=metadata)
Base.prepare(autoload_with=engine)
# NOTE(review): mapper_registry is not referenced in any other visible cell — confirm it is still needed.
mapper_registry = registry()

In [None]:
# Loading data
# All raw CSV files are read from one directory with ";" as the field separator.
# The directory is spelled once so every path is built the same way
# (several of the previous literals contained an accidental "//").
DATA_DIR = "../data/init"

failures_2016 = pd.read_csv(f"{DATA_DIR}/failures-2016.csv", sep=";")
failures_2017 = pd.read_csv(f"{DATA_DIR}/failures-2017.csv", sep=";")
metmast_2016 = pd.read_csv(f"{DATA_DIR}/metmast-2016.csv", sep=";")
metmast_2017 = pd.read_csv(f"{DATA_DIR}/metmast-2017.csv", sep=";")
signals_2016 = pd.read_csv(f"{DATA_DIR}/signals-2016.csv", sep=";")
signals_2017 = pd.read_csv(f"{DATA_DIR}/signals-2017.csv", sep=";")

In [None]:
# Combine the signals of both years
signals = pd.concat([signals_2016, signals_2017])

# Distinct values of the "Turbine_ID" column in the combined signals
turbine_names = signals["Turbine_ID"].unique()

def create_df_for_each_turbine(signals):
    """Split a signal table into one chronologically sorted frame per turbine.

    The turbines are taken from the ``signals`` argument itself, so the
    function works for any frame and does not rely on notebook-level state.

    Parameters
    ----------
    signals : pd.DataFrame
        Must contain the columns "Turbine_ID" and "Timestamp".

    Returns
    -------
    list of pd.DataFrame
        One frame per distinct "Turbine_ID" (in order of first appearance),
        each sorted by "Timestamp" with a fresh 0..n-1 index. An empty input
        yields an empty list.
    """
    turbine_dfs = []

    for turbine in signals["Turbine_ID"].unique():
        turbine_df = (
            signals[signals["Turbine_ID"] == turbine]
            .sort_values("Timestamp")
            .reset_index(drop=True)
        )
        turbine_dfs.append(turbine_df)

    return turbine_dfs

turbine_dfs = create_df_for_each_turbine(signals)

# Combine both years of met-mast data and sort chronologically
metmast = pd.concat([metmast_2016, metmast_2017])
metmast = metmast.sort_values("Timestamp")

# Drop broken met data
metmast = metmast.drop(
    columns=["Min_Winddirection2", "Max_Winddirection2", "Avg_Winddirection2", "Var_Winddirection2"]
)

# Fill met data: forward fill first, then backward fill for gaps at the start.
# ffill()/bfill() replace the deprecated fillna(method=...) spelling.
metmast = metmast.ffill().bfill()

failures = pd.concat([failures_2016, failures_2017])

# Merging
def JoinMetamast(df:pd.DataFrame, metmast_df=None):
    """Fill gaps in a turbine frame and left-join met-mast data on "Timestamp".

    Parameters
    ----------
    df : pd.DataFrame
        Turbine signals; must contain a "Timestamp" column.
    metmast_df : pd.DataFrame, optional
        Met-mast table to join. When omitted, the notebook-level ``metmast``
        frame is used.

    Returns
    -------
    pd.DataFrame
        ``df`` with the met-mast columns appended. Missing values are
        forward- then backward-filled both before and after the join, so
        timestamps without a met-mast match inherit neighbouring values.
    """
    if metmast_df is None:
        # Fall back to the met-mast table prepared at notebook level.
        metmast_df = metmast
    df = df.ffill().bfill()
    df = pd.merge(df, metmast_df, on="Timestamp", how="left")
    return df.ffill().bfill()

# Join the met-mast data onto every turbine frame and stack the results.
merged = [JoinMetamast(turbine_df) for turbine_df in turbine_dfs]
merged_df = pd.concat(merged)

# Select the failure records used for labelling. Despite the variable name,
# the selected components are "GENERATOR_BEARING" and "GENERATOR"
# (bearing rows first, then generator rows).
l = [
    failures[failures["Component"] == component]
    for component in ("GENERATOR_BEARING", "GENERATOR")
]

failures_gearbox = pd.concat(l).reset_index(drop=True)

In [None]:
# Util functions
def get_round_minute_diff(datetime_in: datetime) -> timedelta:
    """Return the offset that moves ``datetime_in`` to the nearest 10-minute mark.

    Only the minute field is considered; seconds and microseconds are not
    touched. Ties (minutes ending in 5) always round up. The built-in
    ``round(x, -1)`` is deliberately not used because it rounds half to even
    (5 -> 0 but 15 -> 20), which made the direction depend on the hour part.

    Parameters
    ----------
    datetime_in : datetime
        Any object exposing a ``minute`` attribute.

    Returns
    -------
    timedelta
        Between -4 and +5 minutes.
    """
    minute = datetime_in.minute
    rounded_minute = ((minute + 5) // 10) * 10
    return timedelta(minutes=rounded_minute - minute)

def convert_round_minute_to_time(datetime_in: datetime) -> datetime:
    """Return ``datetime_in`` shifted so that its minute is a multiple of ten.

    Rounding past minute 55 rolls over into the next hour (and day) because
    the shift is applied as a ``timedelta``.
    """
    return datetime_in + get_round_minute_diff(datetime_in)

In [None]:
def insert_error(row, failure_turbine, failure_ts:datetime, days_per_class, target_name, risk_levels, data_result):
    """Label one data row relative to one failure and append it to ``data_result``.

    A copy of ``row`` is always appended. The copy's ``target_name`` entry is
    changed only when the row is still ``"empty"``, belongs to
    ``failure_turbine`` and lies at most ``5 * days_per_class`` days before
    (or less than one minute after) ``failure_ts``. The window is split into
    five bands of ``days_per_class`` days each:

    * more than 4 bands before the failure -> ``risk_levels[0]``
    * more than 3 bands                    -> ``risk_levels[1]``
    * more than 2 bands                    -> ``risk_levels[2]``
    * more than 1 band                     -> ``risk_levels[3]``
    * otherwise (up to and including exactly one band) -> ``risk_levels[4]``

    Parameters
    ----------
    row : mapping with ``copy()``
        Must provide "Timestamp" (ISO-8601 string), "Turbine_ID" and ``target_name``.
    failure_turbine : str
        Turbine the failure belongs to.
    failure_ts : datetime
        Failure time; must be comparable with the parsed row timestamp.
    days_per_class : int or float
        Width of one risk band in days.
    target_name : str
        Key of the label entry.
    risk_levels : sequence
        Five labels ordered from furthest to closest to the failure.
    data_result : list
        Output list; the (possibly relabelled) copy of ``row`` is appended.
    """
    row_ = row.copy()
    data_ts = datetime.fromisoformat(str(row["Timestamp"]))
    time_before_failure = failure_ts - data_ts

    in_window = (
        row[target_name] == "empty"
        and str(row["Turbine_ID"]) == failure_turbine
        and timedelta(minutes=-1) < time_before_failure <= timedelta(days=days_per_class * 5)
    )
    if in_window:
        if time_before_failure > timedelta(days=days_per_class * 4):
            row_[target_name] = risk_levels[0]
        elif time_before_failure > timedelta(days=days_per_class * 3):
            row_[target_name] = risk_levels[1]
        elif time_before_failure > timedelta(days=days_per_class * 2):
            row_[target_name] = risk_levels[2]
        elif time_before_failure > timedelta(days=days_per_class * 1):
            row_[target_name] = risk_levels[3]
        else:
            # Plain else: a row exactly one band before the failure used to
            # match neither ">" nor "<" and was left unlabelled.
            row_[target_name] = risk_levels[4]
    data_result.append(row_)
    
failures_gearbox = failures_gearbox.sort_values(by='Timestamp').reset_index(drop=True)
data = merged_df.copy()
data = data.sort_values(by='Timestamp').reset_index(drop=True)

target_name = "Risk Level"
risk_levels = ["low", "low-med", "medium", "med-high", "high",]
days_per_class = 9
data[target_name] = "empty"

# Parse the row timestamps and turbine ids once; they do not change while labelling.
data_ts = pd.to_datetime(data["Timestamp"], utc=True)
data_turbine = data["Turbine_ID"].astype(str)

# Failures are processed in chronological order. A row that has already been
# labelled by an earlier failure is never relabelled by a later one.
for _, failure in failures_gearbox.iterrows():
    failure_turbine = str(failure["Turbine_ID"])
    # Move the failure time to a minute that is a multiple of ten.
    failure_ts = convert_round_minute_to_time(
        pd.to_datetime(str(failure["Timestamp"]), utc=True)
    )
    time_before_failure = failure_ts - data_ts

    # Rows of the failing turbine from 5 * days_per_class days before the
    # failure up to (less than) one minute after it.
    in_window = (
        (data[target_name] == "empty")
        & (data_turbine == failure_turbine)
        & (time_before_failure > timedelta(minutes=-1))
        & (time_before_failure <= timedelta(days=days_per_class * 5))
    )

    # Start with the band closest to the failure, then overwrite with the
    # lower-risk label for every row further out. A row exactly
    # days_per_class days before the failure therefore stays in the closest
    # band instead of remaining unlabelled.
    data.loc[in_window, target_name] = risk_levels[4]
    for bands_before in (1, 2, 3, 4):
        further_out = in_window & (
            time_before_failure > timedelta(days=days_per_class * bands_before)
        )
        data.loc[further_out, target_name] = risk_levels[4 - bands_before]

In [None]:
# Rows still carrying the "empty" placeholder get the label 'low'.
data[target_name] = data[target_name].replace("empty", "low")

In [None]:
# Summary statistics of the label column
data[target_name].describe()

In [None]:
# labeled_df refers to the same object as data (no copy is made)
labeled_df  = data

In [None]:
# Number of rows per risk level
labeled_df[target_name].value_counts()

In [None]:
# Number of rows per turbine
labeled_df["Turbine_ID"].value_counts()

In [None]:
# Rows of failures_gearbox that belong to turbine "T06"
failures_gearbox[failures_gearbox["Turbine_ID"] == "T06"]

In [None]:
# Plot the risk level of all labelled rows as an ordinal code
# (position of the label in risk_levels: 0 for the first, 4 for the last).
risk_level_codes = {level: code for code, level in enumerate(risk_levels)}

filtered_df = labeled_df.copy()
# Recode only the label column. Assigning through a boolean row mask
# (df[mask] = k) would overwrite every column of the matching rows.
filtered_df[target_name] = filtered_df[target_name].map(risk_level_codes)
filtered_df[target_name].plot()

In [None]:
# Rows whose "Timestamp" is at or after the cut-off literal form the test set,
# all earlier rows the training set.
# NOTE(review): the literal uses day "00" of July 2017, so if "Timestamp" holds
# ISO strings the whole of July already lands in the test set, whereas the
# previous comment said "from August 2017" — confirm the intended cut-off.
split_criterion_reg = labeled_df["Timestamp"].ge("2017-07-00T00:00:00+00:00")

test_gearbox = labeled_df.loc[split_criterion_reg].reset_index(drop=True)
train_gearbox = labeled_df.loc[~split_criterion_reg].reset_index(drop=True)

In [None]:
# Plot the risk level of the test rows as an ordinal code
# (position of the label in risk_levels: 0 for the first, 4 for the last).
risk_level_codes = {level: code for code, level in enumerate(risk_levels)}

filtered_df = test_gearbox.copy()
# Recode only the label column. Assigning through a boolean row mask
# (df[mask] = k) would overwrite every column of the matching rows.
filtered_df[target_name] = filtered_df[target_name].map(risk_level_codes)
filtered_df[target_name].plot()

In [None]:
# Preview the first training rows; a bare last expression renders the frame
# with the notebook's rich table display instead of plain text.
train_gearbox.head()

In [None]:
from utils.create_object import create_object
from utils.create_objects import create_objects

In [None]:
# Shared context handed to every create_object / create_objects call.
context = dict()
context["base"] = Base
context["session"] = Session(bind=engine)

# Every entry created below is attached to this data_meta row via its id,
# not via a hard-coded 1, so the notebook also works when the table already
# contains other rows.
data_meta = create_object(context, "data_meta", with_commit=True)

# One datapoint mapping per data split.
groupings = ["train", "test"]
datapoint_mappings = {}
for grouping_val in groupings:
    datapoint_mappings[grouping_val] = create_object(context, "datapoint_mappings", with_commit=True,
        data_meta_id = data_meta.id,
        grouping = grouping_val
    )

# One label entry per label column.
label_names = ["Risk Level"]
label_entries = {}
for label in label_names:
    label_entries[label] = create_object(context, "label", with_commit=True,
        data_meta_id = data_meta.id,
        name = label,
        description = "TBD"
    )

# One categorical entry per distinct label value found in the training split.
label_values = {}
for label in label_names:
    label_values[label] = {}
    label_values_ = train_gearbox[label].unique()
    for label_value in label_values_:
        label_values[label][label_value] = create_object(context, "label_categorical", with_commit=True,
            label_id = label_entries[label].id,
            category = label_value,
            description = "TBD"
        )

# Classify the columns: the time column, filter columns, and everything else
# that is not a label is treated as a feature.
time_name = "Timestamp"
filter_names = ["Turbine_ID"]
meta_info_names = filter_names + [time_name]
feature_names = [feature for feature in train_gearbox.columns if feature not in label_names and feature not in meta_info_names]
aggregated_meta_feature_list = meta_info_names + feature_names
feature_entries = {}
filter_entries = {}
type_ = "data"
for name in aggregated_meta_feature_list:
    if name==time_name:
        time = create_object(context, "time", with_commit=True,
            data_meta_id = data_meta.id,
            name = name,
            description = "TBD"
        )
    elif name in filter_names:
        # Stored directly in the lookup dict; no temporary named `filter`,
        # which would shadow the builtin for the rest of the notebook.
        filter_entries[name] = create_object(context, "filter", with_commit=True,
            data_meta_id = data_meta.id,
            name = name,
            description = "TBD"
        )
    else:
        feature_entries[name] = create_object(context, "feature", with_commit=True,
            data_meta_id = data_meta.id,
            name = name,
            type_ = type_,
            description = "TBD"
        )

def process_row(row, context, datapoint_id, mapping_id, feature_entries, filter_entries):
    """Persist one labelled data row.

    Creates, in this order: the datapoint, its class label, its feature
    values and its filter values. Besides its parameters the function reads
    the notebook-level names ``time_name``, ``label_names`` and
    ``label_entries``.

    Parameters
    ----------
    row : pd.Series
        One data row; the time column must hold a string whose first 19
        characters match "%Y-%m-%dT%H:%M:%S".
    context : dict
        Context passed through to ``create_object`` / ``create_objects``.
    datapoint_id : int
        Id given to the new datapoint.
    mapping_id : int
        Id of the datapoint mapping the row belongs to.
    feature_entries : dict
        Column name -> feature entry (object with an ``id``).
    filter_entries : dict
        Column name -> filter entry (object with an ``id``).
    """
    # Only the first 19 characters are parsed, i.e. anything after the
    # seconds (such as a UTC offset) is ignored.
    row_datetime = datetime.strptime(row[time_name][:19], "%Y-%m-%dT%H:%M:%S")
    create_object(
        context,
        "datapoint",
        with_commit=True,
        id=datapoint_id,
        datapoint_mappings_id=mapping_id,
        datetime=row_datetime,
    )

    # Collect filter and feature payloads in column order.
    filter_values = []
    for column, value in row.items():
        if column in filter_entries:
            filter_values.append(
                {'datapoint_id': datapoint_id, 'filter_id': filter_entries[column].id, 'value': value}
            )

    feature_values = []
    for column, value in row.items():
        if column in feature_entries:
            feature_values.append(
                {'datapoint_id': datapoint_id, 'feature_id': feature_entries[column].id, 'value': float(value)}
            )

    # Class label of the row (first configured label column).
    primary_label = label_names[0]
    create_object(
        context,
        "datapoint_class_label",
        with_commit=True,
        datapoint_id=datapoint_id,
        label_id=label_entries[primary_label].id,
        value=row[primary_label],
    )

    # Batch create the feature values, then the filter values.
    create_objects(context, "datapoint_feature_value", feature_values)
    create_objects(context, "datapoint_filter", filter_values)

def apply_to_row(row, args):
    """Row callback for ``DataFrame.apply(axis=1)``: store the row, commit periodically.

    Parameters
    ----------
    row : pd.Series
        Data row; ``row.name`` must be convertible to ``int``.
    args : int-like
        Id of the datapoint mapping the row belongs to.
    """
    mapping_id = int(args)
    row_number = int(row.name)
    # The datapoint id combines the mapping id and the row number; each
    # mapping gets its own block of 1,000,000 ids so the ids do not overlap.
    datapoint_id = (mapping_id - 1) * 1000000 + row_number + 1
    process_row(row, context, datapoint_id, mapping_id, feature_entries, filter_entries)
    # Commit every 5000 rows (but not on row 0) to prevent memory overflow.
    if row_number != 0 and row_number % 5000 == 0:
        context["session"].commit()
# Main loop: store the test split first, then the training split.
for grouping, dataset in (("test", test_gearbox), ("train", train_gearbox)):
    # Every row of the split is handed to apply_to_row together with the id
    # of the split's datapoint mapping.
    dataset.apply(apply_to_row, axis=1, args=(datapoint_mappings[grouping].id,))

# Final commit for all remaining datapoints.
context["session"].commit()
