In [36]:
import pandas as pd
import math
from pandas import DataFrame
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold
from datetime import datetime, timedelta, timezone
import pyodbc
from sqlalchemy import Column, Date, Integer, String, Numeric, create_engine, Float
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, Float, ForeignKey, LargeBinary
from sqlalchemy.orm import mapper, registry, Session
from sqlalchemy.ext.automap import automap_base
from sqlalchemy import create_engine, func, MetaData, Table
from sqlalchemy import create_engine, func, MetaData, Table, select, Column, Integer, String
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, mean_squared_error, balanced_accuracy_score 
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator
from sklearn.pipeline import Pipeline
from sqlalchemy import inspect
import scipy
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [57]:
#SQLAlchemy Setup
def get_engine():
    SERVER = 'localhost'
    DATABASE = 'metmast'
    engine = create_engine(f'mssql+pyodbc:///?odbc_connect=DRIVER={{ODBC Driver 18 for SQL Server}};SERVER={SERVER};DATABASE={DATABASE};Trusted_Connection=yes;TrustServerCertificate=yes;')
    return engine

engine = get_engine()
metadata = MetaData()
metadata.reflect(bind=engine)
Base = automap_base(metadata=metadata)
Base.prepare(autoload_with=engine)

In [38]:
#Load Trainingsdata
def loadData(tableName):
    # Get the train_data table
    table = Table(tableName, metadata)
    session = Session(bind=engine)    
    batch_size = 500
    query = session.query(table)
    #Load Query into Dataframe
    df = pd.DataFrame()
    for batch in pd.read_sql_query(query.statement, engine, chunksize=batch_size):
        df = pd.concat([df, batch], ignore_index=True)
        excluded_columns=["id"]
        if excluded_columns:
            df = df.drop(columns=excluded_columns)    
    return df

train_X = loadData("train_data")
train_y = loadData("train_data_label").drop(columns=["train_data_id", "Class"]) 
test_X = loadData("test_data")
test_y = loadData("test_data_label").drop(columns=["test_data_id", "Class"]) 

In [83]:
#Lazy declarations
def get_table_by_name(name):
    inspector = inspect(engine)
    return inspector.get_table(name)

def get_feature_id_by_name(Base, name):
    query = session.query(metadata.tables["feature"])
    df = pd.DataFrame()
    for batch in pd.read_sql_query(query.statement, engine, chunksize= 5):
        df = pd.concat([df, batch], ignore_index=True)
    for index, row in df.iterrows():
        if(row["name"]==name):
            return row["id"]
        
def getNextIdForTable(Base, name):
    table = Base.classes[name]
    return session.query(table.id).count() + 1 
  
def create_object(Base, class_name, **kwargs):
    mapped_class = Base.classes[class_name]
    # Create an instance of the mapped class
    obj = mapped_class()
    # Set the object's attributes
    for key, value in kwargs.items():
        setattr(obj, key, value)
    return obj
    
#Create meta informations if not existent
class Train_process(Base):
    __tablename__ = 'train_process'    
    __table_args__ = {'extend_existing': True} # TODO: später mit keep_existing ersetzen
    id = Column(Integer, primary_key=True)  
    placeHolder = Column(String)
       
class Train_process_init_parameter(Base):
    __tablename__ = 'train_process_init_parameter'
    __table_args__ = {'extend_existing': True} 
    id = Column(Integer, primary_key=True)
    Column("train_process_id", Integer, ForeignKey(Train_process.id))
    min_threshold_feature_variance = Column(Float)
    max_threshold_feature_variance = Column(Float)  

class Hyperparameter(Base):
    __tablename__ = 'hyperparameter'
    __table_args__ = {'extend_existing': True} 
    id = Column(Integer, primary_key=True)
    
    threshold_feature_variance = Column(Float)
    max_depth = Column(Integer)
    min_samples_leaf = Column(Integer)
    random_state = Column(Integer)
    max_features = Column(Integer)
    criterion = Column(String)
    
class Train_process_iteration(Base):
    __tablename__ = 'train_process_iteration'
    __table_args__ = {'extend_existing': True}
    id = Column(Integer, primary_key=True)    
    Column("train_process_id", Integer, ForeignKey(Train_process.id))
    Column("train_process_id", Integer, ForeignKey(Hyperparameter.id)) 
    index = Column(Integer)
    
class Train_process_stage(Base):
    __tablename__ = 'train_process_stage'
    __table_args__ = {'extend_existing': True} 
    id = Column(Integer, primary_key=True)
    
class Stage(Base):
    __tablename__ = 'stage'
    __table_args__ = {'extend_existing': True} 
    id = Column(Integer, primary_key=True)    
    name = Column(String(100))
    descritption = Column(String(500))
    
class Train_process_score(Base):
    __tablename__ = 'train_process_score'
    __table_args__ = {'extend_existing': True} 
    id = Column(Integer, primary_key=True)
    
class Train_process_iteration_score(Base):
    __tablename__ = 'train_process_iteration_score'
    __table_args__ = {'extend_existing': True} 
    id = Column(Integer, primary_key=True)
    balanced_accuracy_scoreatures = Column(Integer)
    
class Train_process_iteration_compute_result(Base):
    __tablename__ = 'train_process_iteration_compute_result'
    __table_args__ = {'extend_existing': True} 
    id = Column(Integer, primary_key=True)

class Dropped_feature_variance_filter(Base):
    __tablename__ = 'dropped_feature_variance_filter'
    __table_args__ = {'extend_existing': True} 
    id = Column(Integer, primary_key=True)
    Column("train_process_iteration_compute_result_id", Integer, ForeignKey(Train_process_iteration_compute_result.id)) 
    Column("feature_id", Integer, ForeignKey(metadata.tables["feature"].c.id)) 
    feature_variance = Column(Float)
    
class Train_process_train_data_junction(Base):
    __tablename__ = 'train_process_train_data_junction'    
    __table_args__ = {'extend_existing': True}
    id = Column(Integer, primary_key=True)  
    Column("train_process_id", Integer, ForeignKey(Train_process.id))
    Column("train_data_id", Integer, ForeignKey(metadata.tables["train_data"].c.id))
        
metadata.create_all(engine)

  class Train_process(Base):
  class Train_process_init_parameter(Base):
  class Hyperparameter(Base):
  class Train_process_iteration(Base):
  class Train_process_stage(Base):
  class Stage(Base):
  class Train_process_score(Base):
  class Train_process_iteration_score(Base):
  class Train_process_iteration_compute_result(Base):
  class Dropped_feature_variance_filter(Base):
  class Train_process_train_data_junction(Base):


In [40]:
#Reload
engine = get_engine()
metadata = MetaData()
metadata.reflect(bind=engine)
Base = automap_base(metadata=metadata)
Base.prepare(autoload_with=engine)

In [44]:
session = Session(engine)
#Get table refrences
train_process_table = Table("train_process", metadata)
train_process_init_parameter_table = Table("train_process_init_parameter", metadata)
#Load initial parameters - use latest parameters if none were explicitly given
count_process = session.query(Base.classes.train_process.id).count()
count_paras = session.query(Base.classes.train_process_init_parameter.id).count()
init_parameters = session.query(train_process_init_parameter_table).order_by(Train_process_init_parameter.id.desc()).limit(1)[0]
if(count_process == count_paras):    
    init_parameters_clone = Base.classes.train_process_init_parameter()
    for column in Base.classes.train_process_init_parameter.__table__.columns:
        if(column.name!="id"):
            setattr(init_parameters_clone, column.name, getattr(init_parameters, column.name))
        else:
            setattr(init_parameters_clone, column.name, getattr(init_parameters, column.name ) + 1)
    session.add(init_parameters_clone)
    session.commit()
#Create meta db Entrys
train_process = create_object(Base, "train_process", id = count_process + 1)
#train_process = Base.classes.train_process()
#train_process.id = count_process + 1
train_process_score = create_object(Base, "train_process_score", id = train_process.id)
session.add(train_process)
session.add(train_process_score)
session.commit()

In [45]:
#Filtern von aussageschwachen Features
def get_signals_with_low_variance(df, threshold=0.10):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    cont_data = df.select_dtypes(include=numerics)
    selector = VarianceThreshold(threshold=threshold)
    selector.fit(cont_data)
    inverted_list = ~np.array(selector.get_support())
    return cont_data.columns[inverted_list].tolist()

def do_variance_filter(train_X_, test_X_, threshold):
    low_variant_signals = set()
    low_variant_signals.update(get_signals_with_low_variance(train_X))
    train_X_ = train_X_.drop(columns=list(low_variant_signals))
    test_X_ = test_X_.drop(columns=list(low_variant_signals))
    return train_X_, test_X_, low_variant_signals

In [60]:
def sample_snapshot(train_X_, train_y_):
    used_indexes = None
    value_counts = train_y_["Risk Level"].value_counts()
    for i in train_y_["Risk Level"].unique():
        drop_amount = value_counts[i]-value_counts.min()
        class_df = train_y_[train_y_["Risk Level"] == i]
        drop_indexes = class_df.sample(n=drop_amount).index
        train_y_ = train_y_.drop(drop_indexes)
        train_X_ = train_X_.drop(drop_indexes)
        if(used_indexes is None):
            used_indexes = list(drop_indexes)
        else:
            used_indexes.append(list(drop_indexes))
    train_y_ = train_y_.reset_index(drop=True)
    train_X_ = train_X_.reset_index(drop=True)
    return train_X_,train_y_, used_indexes

def get_hyperopt_args():
    args = {
        "max_depth": hp.choice('max_depth', range(1,100)),
        "min_samples_leaf": hp.choice("min_samples_leaf", range(1,15)),
        "random_state": hp.randint("random_state", 3000),
        "max_features": hp.choice('max_features', range(1,50)),
        "criterion": hp.choice('criterion', ["gini", "entropy"]),
        "variance_threshold_var_fac": hp.randint("variance_threshold", 100),
        #"normalize": hp.choice('normalize', [0, 1])
    }
    return args

def do_train_run(train_X_:DataFrame, train_y_:DataFrame, test_X_:DataFrame, test_y_:DataFrame, args):    
    #Feature selection TODO: mutual info & correlation filter; performance
    #Variance Filter
    threshold = args["variance_threshold_var_fac"]*args["variance_threshold_fac"]+args["variance_threshold_floor"]
    args["hyperparameter"].threshold_feature_variance = threshold
    
    train_X_, test_X_, low_variant_signals = do_variance_filter(train_X_, test_X_, threshold)    
    
    for i, signal in enumerate(low_variant_signals):
       dropped_feature_variance_filter = create_object(Base, "dropped_feature_variance_filter",
                                                       id = getNextIdForTable(Base, "dropped_feature_variance_filter") + i + 1,
                                                       train_process_iteration_compute_result_id = args["train_process_iteration_compute_result"])
    
    args["hyperparameter"].max_depth = args["max_depth"]
    args["hyperparameter"].min_samples_leaf = args["min_samples_leaf"]
    args["hyperparameter"].random_state = args["random_state"]
    args["hyperparameter"].max_features = args["max_features"]
    args["hyperparameter"].criterion = args["criterion"]    
    
    dtr = DecisionTreeClassifier(
        max_depth = args["max_depth"],
        min_samples_leaf = args["min_samples_leaf"],
        random_state = args["random_state"],
        max_features = args["max_features"],
        criterion = args["criterion"],
        #normalize = args["normalize"],
    )
    dtr.fit(train_X_, train_y_)
    eval_predict = dtr.predict(test_X_)    
    #TODO:Evaluate using full train data vs validation data or split test data...; intention:validation integrity, validation data is incorporated in trainings process
    accuracy = balanced_accuracy_score(test_y_, eval_predict) 
    args["train_process_iteration_score"].balanced_accuracy_score = accuracy
    return accuracy

def train_run(args):
    train_X_ = args["train_X_"]
    train_y_ = args["train_y_"]
    test_X_ = args["test_X_"]
    test_y_ = args["test_y_"]
    return 1 - do_train_run(train_X_, train_y_, test_X_, test_y_, args)

def f(args):    
    acc = train_run(args)
    return {'loss': acc, 'status': STATUS_OK}

def do_train_process_iteration(args):
    trials = Trials()    
    #Create SQL Obj
    session = args["session"]
    hyperparameter = create_object(Base, "hyperparameter",
                                   id = getNextIdForTable(Base, "hyperparameter"))
    
    train_process_iteration = create_object(Base, "train_process_iteration",
                                            id = getNextIdForTable(Base, "train_process_iteration"),
                                            train_process_id = args["train_process"].id)
    
    train_process_iteration_score = create_object(Base, "train_process_iteration_score",
                                                id = getNextIdForTable(Base, "train_process_iteration_score"),
                                                train_process_iteration_id = train_process_iteration.id)
    
    train_process_iteration_compute_result = create_object(Base, "train_process_iteration_compute_result",
                                                           id = getNextIdForTable(Base, "train_process_iteration_compute_result"),
                                                           train_process_iteration_id = train_process_iteration.id)    
    session.add(hyperparameter)
    session.add(train_process_iteration)
    session.commit()
    args["train_process_iteration"] = train_process_iteration
    args["hyperparameter"] = hyperparameter
    args["train_process_iteration_score"] = train_process_iteration_score
    args["train_process_iteration_compute_result"] = train_process_iteration_compute_result
    
    scores = fmin(f, args, algo=tpe.suggest, max_evals = 10, trials=trials)    
    print(scores)
    

def do_train_prcess(train_process, paras):
    #Even out labels in train_data TODO: andere Methoden
    train_X_,train_y_, used_indexes = sample_snapshot(train_X, train_y)
    for index in used_indexes:
        train_process_train_data_junction = create_object(Base, "train_process_train_data_junction",
                                                          train_process_id =  train_process.id,
                                                          train_data_id = index)    
        session = Session(engine)
        session.add(train_process_train_data_junction)
    session.commit()
    #Construct parameters
    variance_threshold_floor = paras.min_threshold_feature_variance
    variance_threshold_fac = paras.max_threshold_feature_variance - paras.min_threshold_feature_variance/100
    #Assign parameters  
    args = get_hyperopt_args()
    args["variance_threshold_floor"] = variance_threshold_floor
    args["variance_threshold_fac"] = variance_threshold_fac
    args["train_X_"] = train_X_
    args["train_y_"] = train_y_
    args["test_X_"] = test_X.reset_index(drop=True)
    args["test_y_"] = test_y.reset_index(drop=True)
    args["train_process"] = train_process
    args["session"] = Session(bind = engine)
    for i in range (4):
        args["index"] = i
        do_train_process_iteration(args)

do_train_prcess(train_process, init_parameters)

SyntaxError: invalid syntax (2357110951.py, line 41)

In [None]:
#TODO: multithreaded implementation
def find_missing_elements(full_list, partial_list):
    missing_elements = []
    for element in full_list:
        if element not in partial_list:
            missing_elements.append(element)
    return missing_elements

def get_redundant_pairs(X_train: pd.DataFrame) -> set:
    '''Get diagonal and lower triangular pairs of correlation matrix'''
    pairs_to_drop = set()
    cols = X_train.columns
    for i in range(0, X_train.shape[1]):
        for j in range(0, i+1):
            pairs_to_drop.add((cols[i], cols[j]))
    return pairs_to_drop

def get_top_abs_correlations(X_train: pd.DataFrame, thresholds=[0.5]) -> list:
    au_corr = X_train.corr().abs().unstack()
    labels_to_drop = get_redundant_pairs(X_train)
    au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
    au_corrs = []
    for threshold in thresholds:
        au_corrs.append(au_corr[au_corr > threshold])
    return au_corrs

def get_indexes_to_drop(corr_df: pd.DataFrame, X_train: pd.DataFrame, y_train: pd.DataFrame) -> set:

    indexes_to_drop = set()
    le = LabelEncoder()
    y_train = pd.DataFrame(le.fit_transform(y_train), columns=[target_name])[target_name]

    for row in corr_df.index:
        if (X_train[row[0]].corr(y_train)) > (X_train[row[1]].corr(y_train)):
            indexes_to_drop.add(row[1])
        else:
            indexes_to_drop.add(row[0])
    
    return indexes_to_drop

def corr_filter(X_train: pd.DataFrame, y_train: pd.DataFrame, thresholds=[0.5]) -> list[pd.DataFrame]:
    remaining_dfs = []
    corr_dfs = get_top_abs_correlations(X_train, thresholds)
    for corr_df in corr_dfs:
        indexes_to_drop = get_indexes_to_drop(corr_df, X_train, y_train)
        remaining_df = X_train.drop(labels=indexes_to_drop, axis=1)
        remaining_dfs.append(remaining_df)
    return remaining_dfs

def find_best_threshold_corr(train_X, train_y, test_X, train_Y, target_name, thresholds):
    dfs = corr_filter(train_X, train_y, thresholds)
    best_threshold = None
    highest_accuracy = 0.0
    best_dropped = None
    for i, df in enumerate(dfs):
        filtered_X = df
        clf = DecisionTreeClassifier(random_state=0)
        clf.fit(filtered_X, train_y)        
        dropped = find_missing_elements(test_X.columns, filtered_X.columns)
        filtered_test = test_X.drop(columns=dropped)        
        y_pred = clf.predict(filtered_test)
        accuracy = balanced_accuracy_score(train_Y, y_pred)
        if accuracy > highest_accuracy:
            best_threshold = thresholds[i]
            highest_accuracy = accuracy
            best_dropped = dropped
    return best_dropped


def do_correlation_filter(train_X_, train_y_, test_X_, test_y_,):
    target_name = "Risk Level"
    cols_to_drop = find_best_threshold_corr(train_X, train_y[target_name], test_X, test_y[target_name], target_name, [0.4, 0.5, 0.6, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95])