In [None]:
import math
from datetime import datetime, timedelta, timezone

import pandas as pd
from sqlalchemy import MetaData
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session, registry

from utils.handle_engine import get_engine

In [None]:
# SQLAlchemy setup
# get_engine is a project helper; presumably it returns a SQLAlchemy Engine — TODO confirm
engine = get_engine()
metadata = MetaData()
# Read the table definitions that already exist in the database into `metadata`
metadata.reflect(bind=engine)
# Build an automap base on top of the reflected metadata; it is stored in context["base"] further down
Base = automap_base(metadata=metadata)
Base.prepare(autoload_with=engine)
# NOTE(review): mapper_registry is not referenced anywhere else in the visible notebook — confirm it is needed
mapper_registry = registry()

In [None]:
# Load the raw CSV exports (semicolon-separated), one file per data set and year.
failures_2016, failures_2017 = (
    pd.read_csv(csv_path, sep=";")
    for csv_path in ("../data/failures-2016.csv", "../data/failures-2017.csv")
)
metmast_2016, metmast_2017 = (
    pd.read_csv(csv_path, sep=";")
    for csv_path in ("../data/metmast-2016.csv", "../data/metmast-2017.csv")
)
signals_2016, signals_2017 = (
    pd.read_csv(csv_path, sep=";")
    for csv_path in ("../data/signals-2016.csv", "../data/signals-2017.csv")
)

In [None]:
# Combine the signals of both years
signals = pd.concat([signals_2016, signals_2017])

# Distinct turbine identifiers found in the combined signal table
turbine_names = signals["Turbine_ID"].unique()

def create_df_for_each_turbine(signals):
    """Split the signal table into one frame per turbine.

    Args:
        signals: DataFrame with at least the columns "Turbine_ID" and
            "Timestamp".

    Returns:
        A list with one DataFrame per distinct "Turbine_ID" value (in order of
        first appearance in ``signals``). Each frame is sorted by "Timestamp"
        and carries a fresh 0..n-1 index.
    """
    # Take the turbine list from the argument itself instead of a module-level
    # variable, so the result always matches the frame that was passed in.
    return [
        signals[signals["Turbine_ID"] == turbine]
        .sort_values("Timestamp")
        .reset_index(drop=True)
        for turbine in signals["Turbine_ID"].unique()
    ]

turbine_dfs = create_df_for_each_turbine(signals)

# Combine the met-mast data of both years and sort it by timestamp
metmast = pd.concat([metmast_2016, metmast_2017])
metmast = metmast.sort_values("Timestamp")

# Drop broken met data
metmast = metmast.drop(["Min_Winddirection2", "Max_Winddirection2", "Avg_Winddirection2", "Var_Winddirection2"], axis=1)

# Fill met data: forward-fill first, then back-fill whatever is still missing
# at the start. ffill()/bfill() replace the deprecated fillna(method=...) form.
metmast = metmast.ffill()
metmast = metmast.bfill()
# Remaining NaN count (only shown when this is the last expression of a cell)
metmast.isna().sum().sum()

failures = pd.concat([failures_2016, failures_2017])

#Mergen
def JoinMetamast(df:pd.DataFrame, met=None):
    """Left-join met-mast data onto a turbine frame and fill missing values.

    Gaps in ``df`` are forward- and then back-filled, the met-mast columns are
    attached with a left merge on "Timestamp", and the merged frame is filled
    the same way again so that timestamps without a met-mast row take over the
    neighbouring values.

    Args:
        df: Turbine signal frame containing a "Timestamp" column.
        met: Met-mast frame containing a "Timestamp" column. Defaults to the
            module-level ``metmast`` when omitted, which keeps existing
            single-argument calls working.

    Returns:
        The merged and gap-filled DataFrame.
    """
    if met is None:
        met = metmast
    filled = df.ffill().bfill()
    joined = pd.merge(filled, met, on="Timestamp", how="left")
    # The previous full-frame NaN count was computed and thrown away on every
    # call; it had no effect on the result and has been removed.
    return joined.ffill().bfill()

# Attach the met-mast data to every turbine frame and stack the results
merged = [JoinMetamast(turbine_df) for turbine_df in turbine_dfs]
merged_df = pd.concat(merged)

# Keep only gearbox failures. The index is renumbered through a chained call
# rather than an in-place reset on a filtered slice of `failures`.
failures_gearbox = failures[failures["Component"] == "GEARBOX"].reset_index(drop=True)

In [None]:
#Util Functions
def get_round_minute_diff(datetime_in: datetime) -> timedelta:
    """Return the offset that moves the minute of ``datetime_in`` to the nearest multiple of 10.

    Only the minute field is inspected; seconds and microseconds are ignored.
    Ties (minutes ending in 5) always round up. The built-in ``round(n, -1)``
    rounds integers half-to-even (5 -> 0, 15 -> 20, 25 -> 20), which made the
    direction of a tie depend on the tens digit.

    Args:
        datetime_in: The timestamp to inspect.

    Returns:
        A timedelta between -4 and +5 minutes.
    """
    minute = datetime_in.minute
    rounded_minute = (minute + 5) // 10 * 10
    return timedelta(minutes=rounded_minute - minute)

def convert_round_minute_to_time(datetime_in: datetime) -> datetime:
    """Shift ``datetime_in`` by the minute offset from ``get_round_minute_diff``.

    Only whole minutes are added or subtracted, so the seconds and
    microseconds of the input are carried over unchanged.
    """
    return datetime_in + get_round_minute_diff(datetime_in)

In [None]:
days_lookback = 90  # length of the labelled window before each failure, in days
mins_per_class = 24 * 60 / 10  # number of 10-minute steps per day (144.0); GetClass divides the step index by this
ten_mins_of_n_days = int(24 * 60 * days_lookback / 10)  # number of 10-minute steps inside the look-back window
target_name = "Class"  # column name of the label produced by the parameterless create_failure_list

def GetClass(i:int)->int:
    """Map a 10-minute step index to its class number.

    The index is divided by the module-level ``mins_per_class`` and rounded
    down, so consecutive runs of ``mins_per_class`` steps share one class.
    """
    classes_elapsed = i / mins_per_class
    return math.floor(classes_elapsed)

def create_failure_list() -> pd.DataFrame:
    """Build the numeric class labels for the look-back window of every gearbox failure.

    For each row of the module-level ``failures_gearbox`` the failure timestamp
    is rounded with ``convert_round_minute_to_time`` and then walked backwards
    in 10-minute steps for ``ten_mins_of_n_days`` steps. Every step produces one
    record: turbine id, ISO-8601 timestamp tagged as UTC, and ``GetClass(step)``.

    Returns:
        DataFrame with the columns "Turbine_ID", "Timestamp" and ``target_name``.
    """
    failure_list = []
    # Walk over the failure ROWS. Iterating a DataFrame directly yields its
    # column labels, so enumerate(failures_gearbox) ran once per column and
    # handled only as many failures as the frame has columns.
    failure_rows = zip(failures_gearbox["Turbine_ID"], failures_gearbox["Timestamp"])
    for raw_turbine_id, raw_failure_ts in failure_rows:
        turbine_id = str(raw_turbine_id)
        failure_datetime = datetime.fromisoformat(str(raw_failure_ts))
        rounded_datetime = convert_round_minute_to_time(failure_datetime)
        for j in range(ten_mins_of_n_days):
            new_datetime = rounded_datetime - timedelta(minutes=j * 10)
            # replace() overwrites the tzinfo with UTC; it does not convert the time
            datetime_formated = new_datetime.replace(tzinfo=timezone.utc)
            failure_list.append([turbine_id, datetime_formated.isoformat(), GetClass(j)])
    # NOTE(review): if two failures of one turbine lie closer together than the
    # look-back window, the same (Turbine_ID, Timestamp) key is emitted twice
    # and a later left merge duplicates those rows — confirm against the data.
    return pd.DataFrame(failure_list, columns=["Turbine_ID", "Timestamp", target_name])

# Build the numeric class labels for all gearbox failures
failure_df_class  = create_failure_list()
# Join the labels onto the feature data set. The merge keys include the
# turbine id so that every label stays attached to its own turbine.
labeled_df = merged_df.merge(
    failure_df_class,
    on=["Turbine_ID", "Timestamp"],
    how="left",
).reset_index(drop=True)

In [None]:
def create_failure_list(classes: list[str], days_per_class: int, target_name: str) -> pd.DataFrame:
    """Build named class labels for the look-back window of every gearbox failure.

    The window before each failure in the module-level ``failures_gearbox`` is
    ``len(classes) * days_per_class`` days long and is walked backwards in
    10-minute steps. ``classes[0]`` labels the ``days_per_class`` days directly
    before the failure, ``classes[1]`` the block before that, and so on.

    Args:
        classes: Class names, ordered from nearest to farthest from the failure.
        days_per_class: Number of days covered by each class.
        target_name: Name of the label column in the result.

    Returns:
        DataFrame with the columns "Turbine_ID", "Timestamp" and ``target_name``;
        timestamps are ISO-8601 strings tagged as UTC.
    """
    days_lookback = len(classes) * days_per_class
    ten_mins_of_n_days = int(24 * 60 * days_lookback / 10)
    class_span = timedelta(days=days_per_class)
    failure_list = []
    # Walk over the failure ROWS. Iterating a DataFrame directly yields its
    # column labels, so enumerate(failures_gearbox) ran once per column and
    # handled only as many failures as the frame has columns.
    failure_rows = zip(failures_gearbox["Turbine_ID"], failures_gearbox["Timestamp"])
    for raw_turbine_id, raw_failure_ts in failure_rows:
        turbine_id = str(raw_turbine_id)
        failure_datetime = datetime.fromisoformat(str(raw_failure_ts))
        rounded_datetime = convert_round_minute_to_time(failure_datetime)
        for j in range(ten_mins_of_n_days):
            delta = timedelta(minutes=j * 10)
            # Floor division finds the block with lower bound <= delta < upper
            # bound directly, instead of rescanning every step once per class.
            current_class = classes[delta // class_span]
            new_datetime = rounded_datetime - delta
            # replace() overwrites the tzinfo with UTC; it does not convert the time
            datetime_formated = new_datetime.replace(tzinfo=timezone.utc)
            failure_list.append([turbine_id, datetime_formated.isoformat(), current_class])

    failure_df = pd.DataFrame(failure_list, columns=["Turbine_ID", "Timestamp", target_name])

    return failure_df

class_target_name = "Risk Level"
# NOTE(review): create_failure_list assigns classes[i] to the i-th block of
# days_per_class days counted backwards from the failure, so the first entry
# ("low") labels the 18 days directly before a failure — confirm this order.
risk_levels = ["low", "high", "med-high", "medium", "low-med"]
days_per_class = 18

failure_df_multiclass = create_failure_list(
    classes=risk_levels,
    days_per_class=days_per_class,
    target_name=class_target_name,
)
# Join the risk-level labels onto the already labelled frame, keyed per turbine
labeled_df = labeled_df.merge(
    failure_df_multiclass,
    on=["Turbine_ID", "Timestamp"],
    how="left",
).reset_index(drop=True)

In [None]:
# Rows that received no label from the left merges get the default labels.
# Assign the filled column back: an in-place fillna on a column selected with
# df[col] acts on a temporary object and does not update the frame under
# pandas Copy-on-Write.
labeled_df[class_target_name] = labeled_df[class_target_name].fillna("low")
labeled_df[target_name] = labeled_df[target_name].fillna(90)

In [None]:
# Show how many rows carry each value of the numeric class label
labeled_df[target_name].value_counts()

In [None]:
# All data from August 2017 onwards goes into the test set
# NOTE(review): "2017-08-00" is not a valid calendar day; the comparison presumably
# works because "Timestamp" holds ISO-8601 strings that are compared as text — confirm
split_criterion_reg = labeled_df["Timestamp"] >= "2017-08-00T00:00:00+00:00"

test_gearbox = labeled_df[split_criterion_reg].reset_index(drop=True)
train_gearbox = labeled_df[~split_criterion_reg].reset_index(drop=True)

In [None]:
# Preview the training split; a bare last expression uses the notebook's rich
# table display instead of the plain-text dump produced by print()
train_gearbox.head()

In [None]:
from utils.create_object import create_object
from utils.create_objects import create_objects

In [None]:
# Shared state handed to the project helpers create_object / create_objects
context = dict()
context["base"] = Base
context["session"] = Session(bind=engine)

# NOTE(review): created without with_commit=True, unlike the objects below — confirm this is intended
data_meta = create_object(context, "data_meta")

# One "datapoint_mappings" object per data split, keyed by the split name
groupings = ["train", "test"]
datapoint_mappings = {}
for i, grouping_val in enumerate(groupings):
    # NOTE(review): data_meta_id is hard-coded to 1 instead of being read from data_meta — confirm
    datapoint_mappings[grouping_val] = create_object(context, "datapoint_mappings", with_commit=True,
        data_meta_id = 1,
        grouping = grouping_val
    )

# One "label" object per label column of the data set, keyed by the column name
label_names = ["Class", "Risk Level"]
label_entries = {}
for i, label in enumerate(label_names):
    # NOTE(review): data_meta_id is hard-coded to 1 instead of being read from data_meta — confirm
    label_entries[label] = create_object(context, "label", with_commit=True,
        data_meta_id = 1,
        name = label,
        description = "TBD"
    )

# Split the columns of the training frame into time column, filter columns and feature columns
time_name = "Timestamp"
filter_names = ["Turbine_ID"]
meta_info_names = filter_names + [time_name]
# Every column that is neither a label nor meta information counts as a feature
feature_names = [feature for feature in train_gearbox.columns if feature not in label_names and feature not in meta_info_names]
aggregated_meta_feature_list = meta_info_names + feature_names
# NOTE(review): this comprehension only copies the list; it changes neither order nor content
aggregated_meta_feature_list = [feature for feature in aggregated_meta_feature_list]
feature_entries = {}
filter_entries = {}
# Create one metadata object per column: "time" for the timestamp column,
# "filter" for the filter columns and "feature" for everything else
for i, name in enumerate(aggregated_meta_feature_list):
    type_ = "data"
    if name==time_name:
        time = create_object(context, "time", with_commit=True,
            data_meta_id = 1,
            name = name,
            description = "TBD"
        )
    elif name in filter_names:
        # NOTE(review): this assignment rebinds the builtin name `filter` for the rest of the notebook
        filter = create_object(context, "filter", with_commit=True,
            data_meta_id = 1,
            name = name,
            description = "TBD"
        )
        filter_entries[name] = filter
    else:
        feature = create_object(context, "feature", with_commit=True,
            data_meta_id = 1,
            name = name,
            type_ = type_,
            description = "TBD"
        )
        feature_entries[name] = feature

def process_row(row, context, datapoint_id, mapping_id, feature_entries, filter_entries):
    """Create the datapoint for one row together with its labels, features and filters.

    Args:
        row: One row of the labelled frame; read via ``row[...]`` and ``row.items()``.
        context: Dict handed through to ``create_object`` / ``create_objects``.
        datapoint_id: Id assigned to the new datapoint.
        mapping_id: Id of the "datapoint_mappings" object the datapoint belongs to.
        feature_entries: Maps feature column names to objects exposing ``.id``.
        filter_entries: Maps filter column names to objects exposing ``.id``.

    The module-level ``time_name``, ``label_names`` and ``label_entries`` are
    read as well.
    """
    # Only the first 19 characters ("YYYY-MM-DDTHH:MM:SS") are parsed, so any
    # trailing UTC offset in the string is ignored.
    timestamp_text = row[time_name][:19]
    create_object(
        context,
        "datapoint",
        id=datapoint_id,
        datapoint_mappings_id=mapping_id,
        datetime=datetime.strptime(timestamp_text, "%Y-%m-%dT%H:%M:%S"),
    )

    # Collect the filter values of this row
    filter_records = []
    for column, cell in row.items():
        if column in filter_entries:
            filter_records.append(
                {'datapoint_id': datapoint_id, 'filter_id': filter_entries[column].id, 'value': cell}
            )

    # Collect the feature values of this row, converted to float
    feature_records = []
    for column, cell in row.items():
        if column in feature_entries:
            feature_records.append(
                {'datapoint_id': datapoint_id, 'feature_id': feature_entries[column].id, 'value': float(cell)}
            )

    # First label column -> "datapoint_rul_label", second -> "datapoint_class_label"
    rul_label_name = label_names[0]
    create_object(context, "datapoint_rul_label", datapoint_id=datapoint_id,
                  label_id=label_entries[rul_label_name].id, value=row[rul_label_name])
    class_label_name = label_names[1]
    create_object(context, "datapoint_class_label", datapoint_id=datapoint_id,
                  label_id=label_entries[class_label_name].id, value=row[class_label_name])

    # Batch-create the feature values, then the filter values
    create_objects(context, "datapoint_feature_value", feature_records)
    create_objects(context, "datapoint_filter", filter_records)

def apply_to_row(row, args):
    """Row callback for ``DataFrame.apply(axis=1)``: store one row and commit periodically.

    Args:
        row: The row being processed; ``row.name`` is its index label.
        args: Id of the "datapoint_mappings" object for the current split.
    """
    mapping_id = int(args)
    row_index = int(row.name)
    # Datapoint ids are offset by 1,000,000 per mapping and start at 1.
    # NOTE(review): ids of different mappings stay disjoint only while a
    # split has at most 1,000,000 rows.
    datapoint_id = (mapping_id - 1) * 1000000 + row_index + 1
    process_row(row, context, datapoint_id, mapping_id, feature_entries, filter_entries)
    # Commit after every 5000th row (never on row 0) to limit pending session state
    if row_index != 0 and row_index % 5000 == 0:
        context["session"].commit()
# Main loop: store every row of both splits
for grouping, dataset in {"test": test_gearbox, "train": train_gearbox}.items():
    # Apply the function to each row; the id of the split's mapping object is passed as the extra argument
    dataset.apply(apply_to_row, args=(datapoint_mappings[grouping].id,), axis=1)

# Commit all remaining datapoints added since the last periodic commit
context["session"].commit()
