In [1]:
import joblib
import pandas as pd

import mlflow
from mlflow.tracking import MlflowClient

import ray
from sklearn.metrics import accuracy_score
from sklearn.ensemble import ExtraTreesClassifier
from ray import tune
from ray.tune.sklearn import TuneSearchCV
from sklearn.model_selection import train_test_split

from preprocessing import get_data

In [4]:
ray.init("ray://localhost:10001") ## VM

SIGTERM handler is not set because current thread is not the main thread.


0,1
Python version:,3.10.12
Ray version:,2.8.0


In [20]:
@ray.remote(num_cpus=2)
def train_and_tune_extra_tree_model(sp500_data):
    X = sp500_data.drop(['Target'], axis=1)
    train_x, test_x, train_y, test_y = train_test_split(X, sp500_data['Target'], test_size=0.25, random_state=42)
    
    model = ExtraTreesClassifier(random_state=42)
    # Hyperparameter, die getunt werden sollen
    param_distributions = {
        'n_estimators': tune.randint(100, 2000),
        'max_depth': tune.randint(100, 1000),
        'min_samples_split': tune.choice([1, 5, 10]),
        'min_samples_leaf': tune.choice([1, 2, 8]),
        'max_features': tune.choice(['auto', 'sqrt', 'log2'])
    }

    tuner = TuneSearchCV(
        model,
        param_distributions,
        n_trials=15,  # Anzahl der Durchläufe
        early_stopping=False,  # Frühzeitiges Stoppen für schlecht abschneidende Trials
        max_iters=13,  # Maximale Anzahl von Iterationen pro Trial
        search_optimization="random",  # Optimierungsalgorithmus
        cv=10,  # Kreuzvalidierung
        random_state=42,
    )
    tuner.fit(train_x, train_y)
    best_model = tuner.best_estimator_
    #joblib.dump(best_model, './data/predict_model/best_extra_tree_model.pkl')
    predictions = best_model.predict(test_x)
    accuracy = accuracy_score(test_y, predictions)
    print(f"Best model parameters: {tuner.best_params_}")
    print(f"Test Accuracy: {accuracy}")

    return best_model, accuracy

In [21]:
sp500_data, last_day_df = get_data()

[*********************100%%**********************]  1 of 1 completed


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sp500_data[f"SMA {time_period}"]      = ta.SMA(inputs, timeperiod = time_period)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sp500_data[f"EMA {time_period}"]      = ta.EMA(inputs, timeperiod = time_period)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sp500_data[f"EMA {20}"]               = ta.

In [28]:
def log_to_mlflow(model, accuracy):
    mlflow.set_experiment("sp500_prediction")
    mlflow.set_tracking_uri("http://localhost:5000")
    best_extra_tree = "best_extra_tree_model"

    with mlflow.start_run():
        mlflow.sklearn.log_model(model, "model")
        mlflow.log_metric("accuracy", accuracy)
        run_id = mlflow.active_run().info.run_uuid
        client = MlflowClient()
        try:
            registered_model = client.get_registered_model(best_extra_tree)
        except:
            registered_model = None

        if not registered_model:
            client.create_registered_model(best_extra_tree)
            client.create_model_version(name=best_extra_tree,
                                        source=f"runs:/{run_id}/model",
                                        run_id=run_id)
        else:
            latest_version = client.get_latest_versions(best_extra_tree, stages=["Production"])[0]
            latest_metrics = client.get_run(latest_version.run_id).data.metrics
            if "accuracy" in latest_metrics:
                latest_accuracy = latest_metrics["accuracy"]
                if accuracy > latest_accuracy:
                    version_info = client.create_model_version(name=best_extra_tree,
                                                               source=f"runs:/{run_id}/model",
                                                               run_id=run_id)

                    client.transition_model_version_stage(
                        name=version_info,
                        version=version_info.version,
                        stage="Production"
                    )

                    print("New model registered as best model!")
                else:
                    print("The new model isn't better")

In [23]:
best_model, accuracy = ray.get(train_and_tune_extra_tree_model.remote(sp500_data))

[36m(train_and_tune_extra_tree_model pid=2315596)[0m ╭───────────────────────────────────────────────────────────────────╮
[36m(train_and_tune_extra_tree_model pid=2315596)[0m │ Configuration for experiment     _Trainable_2023-11-16_19-46-04   │
[36m(train_and_tune_extra_tree_model pid=2315596)[0m ├───────────────────────────────────────────────────────────────────┤
[36m(train_and_tune_extra_tree_model pid=2315596)[0m │ Search algorithm                 BasicVariantGenerator            │
[36m(train_and_tune_extra_tree_model pid=2315596)[0m │ Scheduler                        FIFOScheduler                    │
[36m(train_and_tune_extra_tree_model pid=2315596)[0m │ Number of trials                 15                               │
[36m(train_and_tune_extra_tree_model pid=2315596)[0m ╰───────────────────────────────────────────────────────────────────╯
[36m(train_and_tune_extra_tree_model pid=2315596)[0m 
[36m(train_and_tune_extra_tree_model pid=2315596)[0m View detailed 

[36m(train_and_tune_extra_tree_model pid=2315596)[0m [output] This will use the new output engine with verbosity 0. To disable the new output and use the legacy output engine, set the environment variable RAY_AIR_NEW_OUTPUT=0. For more information, please see https://github.com/ray-project/ray/issues/36949


[36m(train_and_tune_extra_tree_model pid=2315596)[0m 


[36m(train_and_tune_extra_tree_model pid=2315596)[0m   results["rank_%s" % key_name] = np.asarray(


[36m(train_and_tune_extra_tree_model pid=2315596)[0m Best model parameters: {'n_estimators': 869, 'max_depth': 443, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt'}
[36m(train_and_tune_extra_tree_model pid=2315596)[0m Test Accuracy: 0.803921568627451


In [29]:
log_to_mlflow(best_model, accuracy)
# mlflow models serve -m "models:/best_extra_tree_model/Production" -h 0.0.0.0 -p 1234

2023/11/16 19:57:22 INFO mlflow.tracking.fluent: Experiment with name 'sp500_prediction' does not exist. Creating a new experiment.
2023/11/16 19:58:35 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: best_extra_tree_model, version 1
