In [None]:
import math
from datetime import datetime, timedelta, timezone

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyodbc
import scipy
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from pandas import DataFrame
from sklearn.base import BaseEstimator
from sklearn.feature_selection import VarianceThreshold, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (balanced_accuracy_score, f1_score,
                             mean_squared_error, precision_score, recall_score)
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sqlalchemy import (Column, Date, Float, ForeignKey, Integer, LargeBinary,
                        MetaData, Numeric, String, Table, create_engine, func,
                        inspect, select)
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Session, mapper, registry

from utils.handle_engine import get_engine

In [None]:
#Loading Data
def _read_raw_csv(path: str) -> pd.DataFrame:
    """Read one semicolon-separated raw export into a DataFrame."""
    return pd.read_csv(path, sep=";")


# One file per year for the failure log, the met-mast data and the turbine signals.
failures_2016 = _read_raw_csv("../data/init/failures-2016.csv")
failures_2017 = _read_raw_csv("../data/init/failures-2017.csv")
metmast_2016 = _read_raw_csv("../data/init/metmast-2016.csv")
metmast_2017 = _read_raw_csv("../data/init/metmast-2017.csv")
signals_2016 = _read_raw_csv("../data/init/signals-2016.csv")
signals_2017 = _read_raw_csv("../data/init/signals-2017.csv")

In [None]:
# Combine the signals of both years into one frame
signals = pd.concat([signals_2016, signals_2017])

In [None]:
# Distinct turbine identifiers found in the combined signal data
turbine_names = signals["Turbine_ID"].unique()

In [None]:
def create_df_for_each_turbine(signals):
    """Split the combined signal table into one DataFrame per turbine.

    The turbine IDs are read from ``signals`` itself (in order of first
    appearance) instead of from a module-level variable, so the function works
    for any frame it is given and does not depend on cell execution order.

    Args:
        signals: DataFrame with at least the columns "Turbine_ID" and
            "Timestamp".

    Returns:
        A list with one DataFrame per distinct "Turbine_ID". Each frame is
        sorted by "Timestamp" and has a fresh 0..n-1 index.
    """
    turbine_dfs = []

    for turbine in signals["Turbine_ID"].unique():
        turbine_df = (
            signals[signals["Turbine_ID"] == turbine]
            .sort_values("Timestamp")
            .reset_index(drop=True)
        )
        turbine_dfs.append(turbine_df)

    return turbine_dfs

# One DataFrame per turbine, built from the combined signals
turbine_dfs = create_df_for_each_turbine(signals)

In [None]:
# Combine the met-mast data of both years and sort it by "Timestamp"
metmast = pd.concat([metmast_2016, metmast_2017]).sort_values("Timestamp")

In [None]:
# drop broken met data: all four statistics of the second wind-direction signal
broken_winddirection_columns = [
    "Min_Winddirection2",
    "Max_Winddirection2",
    "Avg_Winddirection2",
    "Var_Winddirection2",
]
metmast = metmast.drop(columns=broken_winddirection_columns)

In [None]:
# Fill met data: forward-fill first, then backward-fill whatever is still
# missing (e.g. gaps at the very start). DataFrame.ffill()/bfill() replace the
# deprecated fillna(method=...) spelling and give the same result.
metmast = metmast.ffill().bfill()
# Cell output: number of missing values that remain after filling
metmast.isna().sum().sum()

In [None]:
# Combine the failure logs of both years into one frame
failures = pd.concat([failures_2016, failures_2017])

In [None]:
# Merge turbine signals with the met-mast data
def JoinMetamast(df:pd.DataFrame):
    """Fill gaps in a turbine frame and attach the met-mast columns to it.

    Missing values are forward-filled and then backward-filled both before and
    after the merge; the second pass covers timestamps that have no matching
    met-mast row. Uses ``ffill()``/``bfill()`` instead of the deprecated
    ``fillna(method=...)``.

    Args:
        df: Signal data with a "Timestamp" column.

    Returns:
        A new DataFrame: ``df`` left-joined with the module-level ``metmast``
        frame on "Timestamp", with gaps filled.
    """
    df = df.ffill().bfill()
    df = pd.merge(df, metmast, on="Timestamp", how="left")
    return df.ffill().bfill()

# Attach the met-mast data to every turbine frame, then stack the results
merged = [JoinMetamast(turbine_df) for turbine_df in turbine_dfs]
merged_df = pd.concat(merged)

In [None]:
# Keep only the gearbox failures and renumber the rows from 0
failures_gearbox = failures[failures["Component"] == "GEARBOX"].reset_index(drop=True)

In [None]:
#Util Functions
def get_round_minute_diff(datetime_in: datetime) -> timedelta:
    """Return the offset that moves the minute of ``datetime_in`` onto the
    nearest multiple of ten.

    Ties (minutes ending in 5) are always rounded up. ``round(minute, -1)``
    rounds ties to the nearest *even* multiple of ten, so :05 and :25 would go
    down while :15 and :35 go up.

    Only the minute field is considered; seconds and microseconds are ignored.

    Args:
        datetime_in: The timestamp to inspect.

    Returns:
        A timedelta between -4 and +5 minutes.
    """
    minute = datetime_in.minute
    rounded_minute = (minute + 5) // 10 * 10
    return timedelta(minutes=rounded_minute - minute)

def convert_round_minute_to_time(datetime_in: datetime) -> datetime:
    """Return ``datetime_in`` shifted by the offset from ``get_round_minute_diff``."""
    return datetime_in + get_round_minute_diff(datetime_in)

In [None]:
# Labelling configuration
days_lookback = 90  # days before each failure that receive a label
mins_per_class = (24 * 60) / 10  # number of 10-minute steps in one day (float)
ten_mins_of_n_days = int(days_lookback * 24 * 60 / 10)  # 10-minute steps in the whole look-back window
target_name = "Class"

def GetClass(i:int)->int:
    """Return how many whole blocks of ``mins_per_class`` steps fit into ``i``.

    With ``i`` counting 10-minute steps back from a failure, the result is the
    class index of that step.
    """
    return int(i // mins_per_class)

def create_failure_list() -> pd.DataFrame:
    """Build one label row per 10-minute step in the look-back window of every
    gearbox failure.

    For each row of ``failures_gearbox`` the failure timestamp is rounded with
    ``convert_round_minute_to_time``; then ``ten_mins_of_n_days`` timestamps are
    generated by stepping back in 10-minute increments, each labelled with
    ``GetClass(step)``.

    The failures are iterated row by row. Iterating a DataFrame directly (as in
    ``enumerate(df)``) yields its column labels, so the number of failures
    processed would depend on the number of columns instead of rows.

    Returns:
        DataFrame with the columns "Turbine_ID", "Timestamp" (ISO-8601 string
        with a UTC offset) and ``target_name``.
    """
    failure_list = []
    failure_rows = zip(failures_gearbox["Turbine_ID"], failures_gearbox["Timestamp"])
    for turbine_id, failure_ts in failure_rows:
        failure_datetime = datetime.fromisoformat(str(failure_ts))
        rounded_datetime = convert_round_minute_to_time(failure_datetime)
        for j in range(ten_mins_of_n_days):
            new_datetime = rounded_datetime - timedelta(minutes=j * 10)
            # The offset is set (not converted) to UTC so the string matches the
            # "+00:00" form used as the merge key.
            datetime_formated = new_datetime.replace(tzinfo=timezone.utc)
            failure_list.append([str(turbine_id), datetime_formated.isoformat(), GetClass(j)])
    return pd.DataFrame(failure_list, columns=["Turbine_ID", "Timestamp", target_name])

failure_df_class = create_failure_list()
# Merge the feature data with the labels. The join key includes the turbine so
# that every label stays attached to the turbine it belongs to.
labeled_df = pd.merge(
    merged_df, failure_df_class, on=["Turbine_ID", "Timestamp"], how="left"
).reset_index(drop=True)

In [None]:
def create_failure_list(classes: list[str], days_per_class: int, target_name: str) -> pd.DataFrame:
    """Build one label row per 10-minute step before every gearbox failure,
    labelled with a named class.

    The look-back window spans ``len(classes) * days_per_class`` days.
    ``classes[0]`` labels the ``days_per_class`` days closest to the failure,
    ``classes[1]`` the next ``days_per_class`` days, and so on.

    The failures are iterated row by row. Iterating a DataFrame directly (as in
    ``enumerate(df)``) yields its column labels, so the number of failures
    processed would depend on the number of columns instead of rows. The class
    of a step is computed directly from its distance to the failure rather
    than by rescanning the whole window once per class.

    Args:
        classes: Class names, ordered from closest to the failure to farthest.
        days_per_class: Width of each class window in days.
        target_name: Name of the label column in the result.

    Returns:
        DataFrame with the columns "Turbine_ID", "Timestamp" (ISO-8601 string
        with a UTC offset) and ``target_name``.
    """
    days_lookback = len(classes) * days_per_class
    ten_mins_of_n_days = int(24 * 60 * days_lookback / 10)
    class_span = timedelta(days=days_per_class)

    failure_list = []
    failure_rows = zip(failures_gearbox["Turbine_ID"], failures_gearbox["Timestamp"])
    for turbine_id, failure_ts in failure_rows:
        failure_datetime = datetime.fromisoformat(str(failure_ts))
        rounded_datetime = convert_round_minute_to_time(failure_datetime)
        for j in range(ten_mins_of_n_days):
            delta = timedelta(minutes=j * 10)
            # Class k covers k * class_span <= delta < (k + 1) * class_span.
            current_class = classes[delta // class_span]
            new_datetime = rounded_datetime - delta
            datetime_formated = new_datetime.replace(tzinfo=timezone.utc)
            failure_list.append([str(turbine_id), datetime_formated.isoformat(), current_class])

    failure_df = pd.DataFrame(failure_list, columns=["Turbine_ID", "Timestamp", target_name])

    return failure_df

class_target_name = "Risk Level"
# NOTE(review): create_failure_list gives classes[0] to the window closest to
# the failure, so "low" currently labels the last 18 days before a failure and
# "high" the 18 days before that. Confirm this ordering is intended.
risk_levels = ["low", "high", "med-high", "medium", "low-med"]
days_per_class = 18

failure_df_multiclass = create_failure_list(risk_levels, days_per_class, class_target_name)
# Attach the risk-level labels per turbine and timestamp, next to the existing labels
labeled_df = pd.merge(
    labeled_df, failure_df_multiclass, on=["Turbine_ID", "Timestamp"], how="left"
).reset_index(drop=True)

In [None]:
# Rows outside every failure window have no label after the left merges; give
# them the default risk level "low" and the class value 90.
# The result is assigned back to the frame: calling fillna(..., inplace=True) on
# a column selection is chained assignment, which does not update labeled_df
# when pandas Copy-on-Write is active.
labeled_df = labeled_df.fillna({class_target_name: "low", target_name: 90})

In [None]:
# Cell output: number of rows per value of the target column
labeled_df[target_name].value_counts()

In [None]:
# All data from August 2017 onwards goes into the test set.
# The cut-off is written as a valid date ("-08-01", not "-08-00") so it keeps
# working if "Timestamp" is ever parsed to datetimes. For a plain string
# comparison (presumably the current case, TODO confirm) both spellings select
# the same rows.
split_criterion_reg = labeled_df["Timestamp"] >= "2017-08-01T00:00:00+00:00"

test_gearbox = labeled_df[split_criterion_reg].reset_index(drop=True)
train_gearbox = labeled_df[~split_criterion_reg].reset_index(drop=True)

In [None]:
# Create the three tables for the training split (features, labels, meta) and
# map a plain Python class onto each of them.
engine = get_engine()
mapper_registry = registry()

# Partition the columns of train_gearbox:
#   label_columns   - the two target columns
#   meta_columns    - object-dtype columns that are not labels
#   feature_columns - everything else
label_columns = list(["Class", "Risk Level"])
feature_columns = list(train_gearbox.columns)
feature_columns = [x for x in feature_columns if x not in label_columns]
meta_columns = list(train_gearbox.select_dtypes(include=['object']).columns)
meta_columns = [x for x in meta_columns if x not in label_columns]
feature_columns = [x for x in feature_columns if x not in meta_columns]

# Define a class to map to the table
class Train_data:
    pass

# Create columns
# Every feature column is declared as Float; 'id' is the autoincrement primary key.
feature_column_types = [Float] * len(feature_columns)
f_columns = [Column('id', Integer, primary_key=True, autoincrement=True)] + [
    Column(name, type) for name, type in zip(feature_columns, feature_column_types)
]

# Create table
metadata = MetaData()
train_data_table = Table('train_data', metadata, *f_columns)
metadata.create_all(engine)

# Map the class imperatively
mapper_registry.map_imperatively(
    class_ = Train_data,
    local_table = train_data_table
)

# Define a class to map to the table
class Train_data_label:
    pass

# Create columns
# The types are paired positionally with label_columns:
# "Class" -> Integer, "Risk Level" -> String.
# Train_data.id can be referenced here because Train_data was mapped above.
label_column_types = list([Integer, String])
l_columns = [Column('id', Integer, primary_key=True, autoincrement=True), 
             Column("train_data_id", Integer, ForeignKey(Train_data.id))] + [
    Column(name, type) for name, type in zip(label_columns, label_column_types)    
]

# Create table
# NOTE(review): a fresh MetaData is created for every table, so the name
# `metadata` only ever refers to the most recently created table.
metadata = MetaData()
train_data_label_table = Table('train_data_label', metadata, *l_columns)
metadata.create_all(engine)

# Map the class imperatively
mapper_registry.map_imperatively(
    class_ = Train_data_label,
    local_table = train_data_label_table
)

# Define a class to map to the table
class Train_data_meta:
    pass

# Create columns
# Every meta column is declared as String.
meta_column_types = [String] * len(meta_columns)
m_columns = [Column('id', Integer, primary_key=True, autoincrement=True), 
             Column("train_data_id", Integer, ForeignKey(Train_data.id))] + [
    Column(name, type) for name, type in zip(meta_columns, meta_column_types)    
]

# Create table
metadata = MetaData()
train_data_meta_table = Table('train_data_meta', metadata, *m_columns)
metadata.create_all(engine)

# Map the class imperatively
mapper_registry.map_imperatively(
    class_ = Train_data_meta,
    local_table = train_data_meta_table
)

In [None]:
# Create the three tables for the test split (features, labels, meta) and map
# a plain Python class onto each of them.
# NOTE(review): engine, mapper_registry, label_columns, feature_columns and
# meta_columns are re-assigned here and replace the values set for the
# training split.
engine = get_engine()
mapper_registry = registry()

# Partition the columns of test_gearbox:
#   label_columns   - the two target columns
#   meta_columns    - object-dtype columns that are not labels
#   feature_columns - everything else
label_columns = list(["Class", "Risk Level"])
feature_columns = list(test_gearbox.columns)
feature_columns = [x for x in feature_columns if x not in label_columns]
meta_columns = list(test_gearbox.select_dtypes(include=['object']).columns)
meta_columns = [x for x in meta_columns if x not in label_columns]
feature_columns = [x for x in feature_columns if x not in meta_columns]

# Define a class to map to the table
class test_data:
    pass

# Create columns
# Every feature column is declared as Float; 'id' is the autoincrement primary key.
feature_column_types = [Float] * len(feature_columns)
f_columns = [Column('id', Integer, primary_key=True, autoincrement=True)] + [
    Column(name, type) for name, type in zip(feature_columns, feature_column_types)
]

# Create table
metadata = MetaData()
test_data_table = Table('test_data', metadata, *f_columns)
metadata.create_all(engine)

# Map the class imperatively
mapper_registry.map_imperatively(
    class_ = test_data,
    local_table = test_data_table
)

# Define a class to map to the table
class test_data_label:
    pass

# Create columns
# The types are paired positionally with label_columns:
# "Class" -> Integer, "Risk Level" -> String.
# test_data.id can be referenced here because test_data was mapped above.
label_column_types = list([Integer, String])
l_columns = [Column('id', Integer, primary_key=True, autoincrement=True), 
             Column("test_data_id", Integer, ForeignKey(test_data.id))] + [
    Column(name, type) for name, type in zip(label_columns, label_column_types)    
]

# Create table
# NOTE(review): a fresh MetaData is created for every table, so the name
# `metadata` only ever refers to the most recently created table.
metadata = MetaData()
test_data_label_table = Table('test_data_label', metadata, *l_columns)
metadata.create_all(engine)

# Map the class imperatively
mapper_registry.map_imperatively(
    class_ = test_data_label,
    local_table = test_data_label_table
)

# Define a class to map to the table
class test_data_meta:
    pass

# Create columns
# Every meta column is declared as String.
meta_column_types = [String] * len(meta_columns)
m_columns = [Column('id', Integer, primary_key=True, autoincrement=True), 
             Column("test_data_id", Integer, ForeignKey(test_data.id))] + [
    Column(name, type) for name, type in zip(meta_columns, meta_column_types)    
]

# Create table
metadata = MetaData()
test_data_meta_table = Table('test_data_meta', metadata, *m_columns)
metadata.create_all(engine)

# Map the class imperatively
mapper_registry.map_imperatively(
    class_ = test_data_meta,
    local_table = test_data_meta_table
)

In [None]:
# Print how many rows train_data currently holds. The context manager closes
# the session when the block exits instead of leaving it open.
with Session(bind=engine) as session:
    print(session.query(Train_data).count())

In [None]:
#Start Session
session = Session(bind=engine)


def _build_split_objects(frame, data_cls, label_cls, meta_cls, fk_attr, id_offset):
    """Turn every row of ``frame`` into one feature, one label and one meta object.

    Each column value is set as an attribute on exactly one of the three
    objects: columns in ``label_columns`` go to the label object, columns in
    ``meta_columns`` to the meta object, all others to the feature object.

    Args:
        frame: Split to convert; its index is expected to run 0..n-1.
        data_cls, label_cls, meta_cls: Mapped classes to instantiate.
        fk_attr: Name of the attribute on the label/meta objects that refers
            to the feature row.
        id_offset: Number of rows already stored in the feature table.

    Returns:
        Three lists (feature objects, label objects, meta objects) in row order.
    """
    data_objs, label_objs, meta_objs = [], [], []
    for index, row in frame.iterrows():
        data_piece, label_piece, meta_piece = data_cls(), label_cls(), meta_cls()

        # Id the feature row is expected to receive on insert. This assumes the
        # autoincrement ids are contiguous and continue after the rows that
        # already exist (TODO confirm for the target database).
        parent_id = id_offset + index + 1
        setattr(label_piece, fk_attr, parent_id)
        setattr(meta_piece, fk_attr, parent_id)

        for column in frame.columns:
            if column in label_columns:
                target = label_piece
            elif column in meta_columns:
                target = meta_piece
            else:
                target = data_piece
            setattr(target, column, row[column])

        data_objs.append(data_piece)
        label_objs.append(label_piece)
        meta_objs.append(meta_piece)
    return data_objs, label_objs, meta_objs


#Create objects to insert
# The current row counts are used as id offsets so that re-running against
# non-empty tables does not link new label/meta rows to old feature rows.
train_count = session.query(Train_data).count()
train_data_list, train_data_label_list, train_data_meta_list = _build_split_objects(
    train_gearbox, Train_data, Train_data_label, Train_data_meta, "train_data_id", train_count
)

# Count the test table here, not the train table.
test_count = session.query(test_data).count()
test_data_list, test_data_label_list, test_data_meta_list = _build_split_objects(
    test_gearbox, test_data, test_data_label, test_data_meta, "test_data_id", test_count
)

In [None]:
#Commit Objects
# Stage the six object lists in the same order as before (train first, feature
# rows before their label/meta rows), then write everything in one commit.
for pending_objects in (
    train_data_list,
    train_data_label_list,
    train_data_meta_list,
    test_data_list,
    test_data_label_list,
    test_data_meta_list,
):
    session.add_all(pending_objects)
session.commit()

In [None]:
#SQLAlchemy Setup
# Reflect the tables that exist in the database into a fresh MetaData and build
# an automap base on top of it.
engine = get_engine()
metadata = MetaData()
metadata.reflect(bind=engine)
Base = automap_base(metadata=metadata)
Base.prepare(autoload_with=engine)

In [None]:
class Feature(Base):
    """Declarative model for the 'feature' table: id, name and description."""
    __tablename__ = 'feature'
    # extend_existing lets this declaration coexist with a 'feature' table that
    # is already present in the shared metadata.
    __table_args__ = {'extend_existing': True}
    id = Column(Integer, primary_key=True)
    name = Column(String)
    description = Column(String)

class Feature_statistic(Base):
    """Declarative model for the 'feature_statistic' table: summary statistics
    of one feature, linked to 'feature' through ``feature_id``."""
    __tablename__ = 'feature_statistic'
    # extend_existing lets this declaration coexist with a table of the same
    # name that is already present in the shared metadata.
    __table_args__ = {'extend_existing': True}
    id = Column(Integer, primary_key=True)
    # The column must be assigned to a class attribute. A bare Column(...)
    # expression in the class body is evaluated and discarded, so the table
    # would be created without feature_id.
    feature_id = Column(Integer, ForeignKey(Feature.id))
    # Float instead of Integer: the values stored here are means/medians of
    # Float feature columns.
    mean = Column(Float)
    median = Column(Float)

class Label(Base):
    """Declarative model for the 'label' table: id, name and description."""
    __tablename__ = 'label'
    # extend_existing lets this declaration coexist with a 'label' table that
    # is already present in the shared metadata.
    __table_args__ = {'extend_existing': True}
    id = Column(Integer, primary_key=True)
    name = Column(String)
    description = Column(String)

class Label_statistic(Base):
    """Declarative model for the 'label_statistic' table: summary statistics of
    one label, linked to 'label' through ``label_id``."""
    __tablename__ = 'label_statistic'
    # extend_existing lets this declaration coexist with a table of the same
    # name that is already present in the shared metadata.
    __table_args__ = {'extend_existing': True}
    id = Column(Integer, primary_key=True)
    # The column must be assigned to a class attribute. A bare Column(...)
    # expression in the class body is evaluated and discarded, so the table
    # would be created without label_id.
    label_id = Column(Integer, ForeignKey(Label.id))
    # Float instead of Integer so fractional statistics can be stored.
    mean = Column(Float)
    median = Column(Float)

# Create every table known to `metadata` that does not exist in the database yet
metadata.create_all(engine)


In [None]:
# Reflect the database again and rebuild the automap base.
# NOTE(review): presumably repeated so that Base.classes also contains the
# tables created in the previous cell. Confirm.
engine = get_engine()
metadata = MetaData()
metadata.reflect(bind=engine)
Base = automap_base(metadata=metadata)
Base.prepare(autoload_with=engine)

In [None]:
# Load training data
def loadData(tableName):
    """Read a whole database table into a DataFrame, without its "id" column.

    The rows are fetched in chunks of 500, collected, and concatenated once at
    the end. No ORM session is opened, so nothing is left unclosed.

    Args:
        tableName: Name of a table present in the module-level ``metadata``.

    Returns:
        DataFrame with all columns of the table except "id". An empty
        DataFrame is returned when the query yields no chunks.
    """
    # Look the table up in the reflected module-level metadata.
    table = Table(tableName, metadata)
    batch_size = 500
    excluded_columns = ["id"]

    batches = list(pd.read_sql_query(select(table), engine, chunksize=batch_size))
    if not batches:
        return pd.DataFrame()

    df = pd.concat(batches, ignore_index=True)
    return df.drop(columns=excluded_columns)

# Features of the training split
train_X = loadData("train_data")
# Labels of the training split; dropping the foreign key and "Class" leaves the
# remaining label column(s) as the target
train_y = loadData("train_data_label").drop(columns=["train_data_id", "Class"])

In [None]:
def create_object(Base, class_name, **kwargs):
    """Instantiate ``Base.classes[class_name]`` and set the given attributes.

    Args:
        Base: Object whose ``classes`` mapping holds the mapped classes.
        class_name: Key of the class to instantiate.
        **kwargs: Attribute names and values to set on the new instance.

    Returns:
        The new instance.
    """
    instance = Base.classes[class_name]()
    for attr_name in kwargs:
        setattr(instance, attr_name, kwargs[attr_name])
    return instance


# Register every feature column together with its mean and median.
# Each session is closed when its `with` block exits.
with Session(engine) as session:
    for i, feature in enumerate(train_X.columns):
        featureObj = create_object(Base, "feature", id = i + 1,  name = feature, description = "to be added manually")
        feature_statisticObj = create_object(Base, "feature_statistic", id = i + 1, feature_id = i + 1, mean = train_X[feature].mean(), median = train_X[feature].median())
        session.add(featureObj)
        session.add(feature_statisticObj)
        session.commit()

# Register every label column. The statistics are computed from the label's own
# values and linked through label_id, not from whichever feature the loop above
# handled last.
with Session(engine) as session:
    for i, label in enumerate(train_y.columns):
        label_values = train_y[label]
        # Non-numeric labels have no mean/median; store NULL for them.
        is_numeric = pd.api.types.is_numeric_dtype(label_values)
        labelObj = create_object(Base, "label", id = i + 1,  name = label, description = "to be added manually")
        label_statisticObj = create_object(
            Base,
            "label_statistic",
            id = i + 1,
            label_id = i + 1,
            mean = label_values.mean() if is_numeric else None,
            median = label_values.median() if is_numeric else None,
        )
        session.add(labelObj)
        session.add(label_statisticObj)
        session.commit()