Read in the data

In [1]:
import pandas as pd
import os

def logger(input_string: str) -> None:
    file_path = "output/log.txt"

    if not os.path.exists(file_path):
        with open(file_path, "w") as file:
            file.write(input_string + "\n")
    else:
        with open(file_path, "a") as file:
            file.write(input_string + "\n")
    print("Logger Message: " + input_string)

time_start = pd.Timestamp.now()

logger("\n\n_________________________________________________________________")
logger("New script run: " + str(time_start))

Logger Message: 

_________________________________________________________________
Logger Message: New script run: 2024-03-23 16:51:37.672885


In [2]:
try:
    data = pd.read_csv("data_raw.csv")
except Exception as e:
    logger("Error: " + str(e))

First row is named "data" is the date in YYYYMMMDD so we format to daytime then sort the values

In [3]:
try:
    data["date"] = pd.to_datetime(data[data.columns[0]], format="%Y%m%d")
    data.drop(data.columns[0], axis=1, inplace=True)
    data.sort_values(by='date', inplace=True)
    data.reset_index(drop=True, inplace=True)
except Exception as e:
    logger("Error: " + str(e))

On weekend the exchange is closed so we need to fill the missing values with linear interpolation

In [4]:
try:
    min_date = data["date"].min()
    max_date = data["date"].max()
    all_dates = pd.date_range(start=min_date, end=max_date)

    todo_data = ["date_global_quote","open_price","high_price","low_price","closing_price","volume"]
    data_helper = data[todo_data].copy()
    #get a list with only the dates we have data for
    data_helper["date_global_quote"] = pd.to_datetime(data_helper["date_global_quote"])
    data_helper.drop_duplicates(keep='first', inplace=True)

    #merge the list of all dates with the data we have
    data_with_missing_dates = pd.merge(pd.DataFrame({"date_global_quote": all_dates}), data_helper, on="date_global_quote", how="left")
    data_with_missing_dates.sort_values(by="date_global_quote", inplace=True) 
    data_with_missing_dates.interpolate(method='linear', inplace=True)

    data[todo_data] = data_with_missing_dates[todo_data]
    
except Exception as e:
    logger("Error: " + str(e))

Add days of week in numbers 0-6, add a mean price, a price change

In [5]:
try:
    data['weekday'] = data['date'].dt.dayofweek

    data['mean_price'] = data[['open_price', 'low_price', 'high_price', 'closing_price']].mean(axis=1)

    data["next_day_percentage"] = (data["mean_price"].shift(-1) / data["mean_price"] - 1)*100
    data.fillna({'next_day_percentage': 0}, inplace=True)

    data["price_change"] = data["mean_price"] / data["mean_price"].shift(1)
    data.fillna({'price_change': 1}, inplace=True)
    
except Exception as e:
    logger("Error: " + str(e))

Convert title list to numbers using finbert

Download the needed models

In [6]:
try:
    from transformers import BertTokenizer, BertForSequenceClassification, pipeline
    if not os.path.exists('.venv/Transformer'):
        finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
        finbert.save_pretrained('.venv/Transformer/Finbert_Offline')

        tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
        tokenizer.save_pretrained('.venv/Transformer/Tokenizer_Offline')
        logger("Transfomers were downloaded because no folder was found")
    else: logger(".venv/Transformer were found and reused offline")
except Exception as e:
    logger("Error: " + str(e))

Logger Message: .venv/Transformer were found and reused offline


In [7]:
import ast
try:
    finbert = BertForSequenceClassification.from_pretrained('.venv/Transformer/Finbert_Offline',num_labels=3)
    tokenizer = BertTokenizer.from_pretrained('.venv/Transformer/Tokenizer_Offline')
    
    nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)

    def finbert(input_string):
        # Return 0 immediately if input is "0"
        if input_string == "0":
            return 0

        text_list = ast.literal_eval(input_string)  # Convert string to list
        if not text_list:
            return 0

        scores = []
        for text in text_list:
            # Truncate text to fit tokenization limit
            while len(tokenizer.tokenize(text)) > 500:
                text = text[:-1]

            # Analyze text with finbert
            results = nlp(text)
            for item in results:
                score = item['score']
                label = item['label']
                # Convert label to numerical value and multiply by score
                if label == "Neutral":
                    scores.append(0)
                elif label == "Negative":
                    scores.append(-10 * score)
                elif label == "Positive":
                    scores.append(10 * score)
        # Calculate mean if scores list is not empty
        return sum(scores) / len(scores) if scores else 0

except Exception as e:
    logger("Error: " + str(e))

In [8]:
try:
    data_finbert = pd.DataFrame()
    data_finbert[["com_title_finbert", "ceo_title_finbert"]] = data[["com_title_list", "ceo_title_list"]].apply(lambda col: col.map(finbert))
    logger("Finbert was applied to the data")
except Exception as e:
    logger("Error: " + str(e))

Logger Message: Finbert was applied to the data


Merge Finbert data. Data can be reuses like this

In [9]:
try: 
    data[["com_title_finbert","ceo_title_finbert"]] = data_finbert[["com_title_finbert","ceo_title_finbert"]]
except Exception as e:
    logger("Error: " + str(e))

currency exchange is the only thing live so we need to adjust the date because everything else is from yesterday

In [10]:
data["currency_exchange_rate"] = data["currency_exchange_rate"].shift(1)
data["currency_exchange_rate"].bfill(inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["currency_exchange_rate"].bfill(inplace=True)


Delete all the columns that are a date

In [11]:
data.fillna(0.0, inplace=True)
data_no_dates = data.drop(columns=data.filter(like='date').columns)

Now we check if columns are numbers or text or dont change to much and then delete them

In [12]:
try:
    features = []
    needed_unique = 5

    for column in data_no_dates.columns[1:]:#sorts the columns in number, text, norchanging
        if pd.to_numeric(data_no_dates[column], errors='coerce').notnull().all():# Check if the column is numeric
            data_no_dates[column] = pd.to_numeric(data_no_dates[column])# Convert to number
            if data_no_dates[column].nunique() >= needed_unique:#check if all calues are the same. if so we dont need them    
                features.append(column)

    data_features = data_no_dates[features]
    logger("Features: " + str(features))
except Exception as e:
    logger("Error: " + str(e))

Logger Message: Features: ['ceo_news_amount', 'alpha_news_amount', 'alpha_news_sentiment_mean', 'open_price', 'high_price', 'low_price', 'closing_price', 'volume', 'currency_exchange_rate', 'cpi', 'eps', 'retail_sales', 'market_capitalization', 'pe_ratio', 'peg_ratio', 'eps.1', 'diluted_eps_ttm', 'analyst_target_price', 'trailing_pe', 'forward_pe', 'price_to_sales_ratio_ttm', 'price_to_book_ratio', 'ev_to_revenue', 'ev_to_ebitda', 'week_low_52', 'day_moving_average_50', 'day_moving_average_200', 'weekday', 'mean_price', 'next_day_percentage', 'price_change', 'com_title_finbert', 'ceo_title_finbert']


Standardize the data by subtracting the mean and dividing by the standard deviation (Z-Score Normalization)

In [13]:
try:
    data_normalized = data_features.sub(data_features.mean(axis=0), axis=1).div((data_features.max(axis=0)-data_features.min(axis=0)), axis=1)
    logger("Data normalized with Z-Score")
except Exception as e:
    logger("Error: " + str(e))

Logger Message: Data normalized with Z-Score


Add the target column which is the next days price change in percentage. So it the price will rise 5% the next day, the target will be 5

Plot all features in one plot if you want

In [14]:
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt

plt.figure()
for feature in features:
    plt.plot(data['date'], data_normalized[feature])
plt.title('Features')
plt.xlabel('Date')
plt.ylabel('Value')
every_fifth_date = data['date'][::5]
plt.xticks(every_fifth_date, rotation=90)
plt.savefig('output/Features.png')

Training and testing data split

In [15]:
try:

    from sklearn.model_selection import train_test_split
    target = data['next_day_percentage']
    features = data_normalized.drop(['next_day_percentage'], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)
except Exception as e:
    logger("Error: " + str(e))

Make a list of ML Models that are then tested

In [16]:
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

models = [
    {
        "name": "Linear Regression",
        "model": LinearRegression(),
        "param_grid": {}
    },
    {
        "name": "Decision Tree",
        "model": DecisionTreeRegressor(),
        "param_grid": {}
    },
    {
        "name": "Random Forest",
        "model": RandomForestRegressor(),
        "param_grid": {
            'n_estimators': [100, 200, 300, 1000],
            'max_features': [None, 'sqrt', 'log2'],
            'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'bootstrap': [True, False]
        }
    },
    {
        "name": "Gradient Boosting",
        "model": GradientBoostingRegressor(),
        "param_grid": {
            'n_estimators': [100, 200, 300, 1000],
            'max_features': [None, 'sqrt', 'log2'],
            'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
        }
    },
    {
        "name": "SVR",
        "model": SVR(),
        "param_grid": {
            'C': [0.1, 1, 10, 100, 1000],
            'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
            'kernel': ['linear', 'rbf']
        }
    },
    {
        "name": "KNN",
        "model": KNeighborsRegressor(),
        "param_grid": {
            'n_neighbors': [3, 5, 11, 19],
            'weights': ['uniform', 'distance'],
            'metric': ['euclidean', 'manhattan']
        }
    },
    {
        "name": "Neural Network",
        "model": MLPRegressor(),
        "param_grid": {
            'hidden_layer_sizes': [(100,), (50, 50), (100, 50, 25)],
            'activation': ['relu', 'tanh'],
            'solver': ['adam', 'lbfgs'],
            'alpha': [0.0001, 0.001, 0.01, 0.1],
            'learning_rate': ['constant', 'adaptive']
        }
    }
]

Find the best model

In [17]:
try:
        
    from sklearn.model_selection import RandomizedSearchCV, KFold
    from sklearn.metrics import mean_squared_error

    best_model = None

    for model_info in models:
        name = model_info["name"]
        model = model_info["model"]
        param_grid = model_info["param_grid"]
        
        estimator = model
        kfold = KFold(n_splits=10, random_state=42, shuffle=True)
        random_search = RandomizedSearchCV(estimator=estimator, param_distributions=param_grid, cv=kfold, n_jobs=-1, verbose=2)
        random_search.fit(X_train, y_train)
        model = model.__class__(**random_search.best_params_)

        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        mse = mean_squared_error(y_test, predictions)
        logger("Model:" + name + ", Mean Squared Error:" + str(mse))
        
        if best_model is None or mse < best_model["mse"]:
            best_model = {
                "name": name,
                "model": model,
                "mse": mse
            }
except Exception as e:
    logger("Error: " + str(e))

Fitting 10 folds for each of 1 candidates, totalling 10 fits




[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.0s
Logger Message: Model:Linear Regression, Mean Squared Error:3.757143524925244
Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] END ..........................................



[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.0s
Logger Message: Model:Decision Tree, Mean Squared Error:20.11164290189924
Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] END bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   0.7s
[CV] END bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   0.7s
[CV] END bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   0.7s
[CV] END bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   0.7s
[CV] END bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   0.8s
[CV] END bootstr



[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=constant, solver=adam; total time=   0.4s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=constant, solver=adam; total time=   0.4s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=constant, solver=adam; total time=   0.4s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=constant, solver=adam; total time=   0.4s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=constant, solver=adam; total time=   0.4s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=constant, solver=adam; total time=   0.4s




[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=constant, solver=adam; total time=   0.5s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=constant, solver=adam; total time=   0.6s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=constant, solver=adam; total time=   0.3s




[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=constant, solver=adam; total time=   0.5s




[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100, 50, 25), learning_rate=constant, solver=adam; total time=   0.9s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100, 50, 25), learning_rate=constant, solver=adam; total time=   0.9s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100, 50, 25), learning_rate=constant, solver=adam; total time=   0.8s




[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100, 50, 25), learning_rate=constant, solver=adam; total time=   1.0s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100, 50, 25), learning_rate=constant, solver=adam; total time=   1.0s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100, 50, 25), learning_rate=constant, solver=adam; total time=   0.9s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100, 50, 25), learning_rate=constant, solver=adam; total time=   1.1s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100, 50, 25), learning_rate=constant, solver=adam; total time=   1.0s
[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(100,), learning_rate=constant, solver=lbfgs; total time=   0.6s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100, 50, 25), learning_rate=constant, solver=adam; total time=   0.9s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100, 50, 25), learning_rate=constant, solver=adam; total time=   0.8s
[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(100,), learning_rate=constant, solver=lbfgs; total time=   0.7s
[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(100,), learning_rate=constant, solver=lbfgs; total time=   0.7s
[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(100,), learning_rate=constant, solver=lbfgs; total time=   0.8s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(100,), learning_rate=constant, solver=lbfgs; total time=   0.8s
[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(100,), learning_rate=constant, solver=lbfgs; total time=   0.6s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(100,), learning_rate=constant, solver=lbfgs; total time=   0.7s
[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(100,), learning_rate=constant, solver=lbfgs; total time=   0.6s
[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(100,), learning_rate=constant, solver=lbfgs; total time=   0.7s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(100, 50, 25), learning_rate=constant, solver=adam; total time=   0.9s
[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(100,), learning_rate=constant, solver=lbfgs; total time=   1.0s
[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(100, 50, 25), learning_rate=constant, solver=adam; total time=   1.0s




[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(100, 50, 25), learning_rate=constant, solver=adam; total time=   1.0s
[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(100, 50, 25), learning_rate=constant, solver=adam; total time=   0.7s
[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(100, 50, 25), learning_rate=constant, solver=adam; total time=   0.7s
[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(100, 50, 25), learning_rate=constant, solver=adam; total time=   1.1s
[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(100, 50, 25), learning_rate=constant, solver=adam; total time=   0.7s




[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(100, 50, 25), learning_rate=constant, solver=adam; total time=   0.6s
[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(50, 50), learning_rate=adaptive, solver=adam; total time=   0.4s
[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(100, 50, 25), learning_rate=constant, solver=adam; total time=   0.6s
[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(50, 50), learning_rate=adaptive, solver=adam; total time=   0.4s
[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(50, 50), learning_rate=adaptive, solver=adam; total time=   0.4s




[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(100, 50, 25), learning_rate=constant, solver=adam; total time=   0.7s
[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(50, 50), learning_rate=adaptive, solver=adam; total time=   0.4s
[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(50, 50), learning_rate=adaptive, solver=adam; total time=   0.5s
[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(50, 50), learning_rate=adaptive, solver=adam; total time=   0.4s




[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(50, 50), learning_rate=adaptive, solver=adam; total time=   0.4s
[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(50, 50), learning_rate=adaptive, solver=adam; total time=   0.4s
[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(50, 50), learning_rate=adaptive, solver=adam; total time=   0.4s
[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(50, 50), learning_rate=adaptive, solver=adam; total time=   0.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END activation=tanh, alpha=0.1, hidden_layer_sizes=(50, 50), learning_rate=constant, solver=lbfgs; total time=   0.7s
[CV] END activation=tanh, alpha=0.1, hidden_layer_sizes=(50, 50), learning_rate=constant, solver=lbfgs; total time=   0.7s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END activation=tanh, alpha=0.1, hidden_layer_sizes=(50, 50), learning_rate=constant, solver=lbfgs; total time=   0.7s
[CV] END activation=tanh, alpha=0.1, hidden_layer_sizes=(50, 50), learning_rate=constant, solver=lbfgs; total time=   0.7s
[CV] END activation=tanh, alpha=0.1, hidden_layer_sizes=(50, 50), learning_rate=constant, solver=lbfgs; total time=   1.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END activation=tanh, alpha=0.1, hidden_layer_sizes=(50, 50), learning_rate=constant, solver=lbfgs; total time=   0.8s
[CV] END activation=tanh, alpha=0.1, hidden_layer_sizes=(50, 50), learning_rate=constant, solver=lbfgs; total time=   0.7s
[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=adam; total time=   0.3s
[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=adam; total time=   0.3s
[CV] END activation=tanh, alpha=0.1, hidden_layer_sizes=(50, 50), learning_rate=constant, solver=lbfgs; total time=   1.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=adam; total time=   0.3s
[CV] END activation=tanh, alpha=0.1, hidden_layer_sizes=(50, 50), learning_rate=constant, solver=lbfgs; total time=   0.7s
[CV] END activation=tanh, alpha=0.1, hidden_layer_sizes=(50, 50), learning_rate=constant, solver=lbfgs; total time=   0.7s
[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=adam; total time=   0.3s
[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=adam; total time=   0.3s
[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=adam; total time=   0.3s
[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=adam; total time=   0.3s




[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=adam; total time=   0.2s
[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=adam; total time=   0.4s
[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=adam; total time=   0.3s




[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100, 50, 25), learning_rate=adaptive, solver=adam; total time=   0.7s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100, 50, 25), learning_rate=adaptive, solver=adam; total time=   0.7s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100, 50, 25), learning_rate=adaptive, solver=adam; total time=   0.7s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100, 50, 25), learning_rate=adaptive, solver=adam; total time=   0.7s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100, 50, 25), learning_rate=adaptive, solver=adam; total time=   0.7s




[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100, 50, 25), learning_rate=adaptive, solver=adam; total time=   0.7s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100, 50, 25), learning_rate=adaptive, solver=adam; total time=   0.8s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100, 50, 25), learning_rate=adaptive, solver=adam; total time=   0.9s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=lbfgs; total time=   0.6s
[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=lbfgs; total time=   0.6s
[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=lbfgs; total time=   0.7s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100, 50, 25), learning_rate=adaptive, solver=adam; total time=   0.7s
[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100, 50, 25), learning_rate=adaptive, solver=adam; total time=   0.8s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=lbfgs; total time=   0.7s
[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=lbfgs; total time=   0.7s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=lbfgs; total time=   0.7s
[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=lbfgs; total time=   0.6s
[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=lbfgs; total time=   0.6s
[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=lbfgs; total time=   0.6s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=constant, solver=lbfgs; total time=   0.7s
[CV] END activation=tanh, alpha=0.01, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=lbfgs; total time=   0.8s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=constant, solver=lbfgs; total time=   0.7s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=constant, solver=lbfgs; total time=   0.6s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=constant, solver=lbfgs; total time=   0.6s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=constant, solver=lbfgs; total time=   0.8s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=constant, solver=lbfgs; total time=   0.6s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=constant, solver=lbfgs; total time=   0.6s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=constant, solver=lbfgs; total time=   0.6s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=constant, solver=lbfgs; total time=   0.4s
[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=constant, solver=lbfgs; total time=   0.4s
Logger Message: Model:Neural Network, Mean Squared Error:2.351105350155872




Get the best model

In [18]:
model = best_model["model"]
mse = best_model["mse"]
logger("Best model: " + str(model) + " Mean Squared Error: " + str(mse))

Logger Message: Best model: GradientBoostingRegressor(learning_rate=0.01, max_depth=100,
                          max_features='sqrt', min_samples_leaf=4,
                          min_samples_split=10) Mean Squared Error: 1.3081516680353424


Predict all the values

In [19]:
data["next_day_percentage_predicted"] = model.predict(features)
data["diff_percentage"] = abs((data["next_day_percentage"] - data["next_day_percentage_predicted"]) / data["next_day_percentage"]) * 100
mean_diff = data["diff_percentage"][:-1].mean()

logger("Mean Difference between predicted and real is: " + str(mean_diff))

Logger Message: Mean Difference between predicted and real is: 126.36637329310166


Plot the next days price change in percent vs the predicted values from the best model

In [20]:
plt.figure()

plt.plot(data['date'], data[["next_day_percentage", "next_day_percentage_predicted"]])
plt.title('Next Days Percentage\nMean Diff: ' + str(mean_diff))
plt.xlabel('Date')
plt.ylabel('Value')
every_fifth_date = data['date'][::5]
plt.xticks(every_fifth_date, rotation=90)
plt.savefig('output/Predictions.png')

Now we need to find the best threshold of the data to make the best predictions

In [21]:
import optuna
iterations = 1000
stock_value = 1000
money_value = 0
fee = 1
sell_percentage = 0.5
data["depot_value"] = 0.0

In [22]:
def optuna_loop(trial, data, stock_value, money_value, sell_percentage):
    buy_threshold = trial.suggest_float('buy_threshold', 0, 5)
    sell_threshold = trial.suggest_float('sell_threshold', -5, 0)

    data["depot_value"] =  broker(stock_value, money_value, buy_threshold, sell_threshold, sell_percentage, data)
    last_depot_value = data['depot_value'].iloc[-1]
    return last_depot_value

def broker(stock_value, money_value, buy_threshold, sell_threshold, sell_percentage, data):
    stock_value = stock_value
    money_value = money_value
    depot_value = 0

    for index, row in data.iterrows():
        stock_value *= row["price_change"]
        if row['next_day_percentage_predicted'] >= buy_threshold:
            if money_value > 0:
                money_value -= fee
                stock_value += money_value
                money_value = 0
        elif row['next_day_percentage_predicted'] <= sell_threshold:
                if stock_value > 0:
                    money_value += stock_value * sell_percentage
                    stock_value -= stock_value * sell_percentage
                    money_value -= fee
        depot_value = stock_value + money_value
        data.at[index, 'depot_value'] = depot_value
    return data["depot_value"]


In [23]:
try: 
    study = optuna.create_study(direction='maximize')
    for _ in range(iterations):
        study.optimize(lambda trial: optuna_loop(trial, data, stock_value, money_value, sell_percentage), n_trials=1)
except Exception as e:
    logger("Error: " + str(e))

[I 2024-03-23 16:53:09,602] A new study created in memory with name: no-name-e83dc758-dd6b-44ba-af6c-da6b5fb87f3b
[I 2024-03-23 16:53:09,618] Trial 0 finished with value: 810.403021657894 and parameters: {'buy_threshold': 4.778732039394701, 'sell_threshold': -4.405982415559931}. Best is trial 0 with value: 810.403021657894.
[I 2024-03-23 16:53:09,629] Trial 1 finished with value: 865.6372001446541 and parameters: {'buy_threshold': 0.9095857113928063, 'sell_threshold': -2.72396319756493}. Best is trial 1 with value: 865.6372001446541.
[I 2024-03-23 16:53:09,639] Trial 2 finished with value: 865.6372001446541 and parameters: {'buy_threshold': 1.9517623080978037, 'sell_threshold': -2.52138927789382}. Best is trial 1 with value: 865.6372001446541.
[I 2024-03-23 16:53:09,650] Trial 3 finished with value: 865.6372001446541 and parameters: {'buy_threshold': 0.5200892861142797, 'sell_threshold': -2.3451097323124555}. Best is trial 1 with value: 865.6372001446541.
[I 2024-03-23 16:53:09,661] Tr

In [24]:
best_params = study.best_trial.params
buy_threshold = round(best_params['buy_threshold'],3)
sell_threshold = round(best_params['sell_threshold'],3)
stock_value = 1000
money_value = 0
logger("Buy threshold: " + str(buy_threshold) + "%, Sell threshold: " + str(sell_threshold)+"%")

Logger Message: Buy threshold: 0.019%, Sell threshold: -0.114%


In [25]:
try: 
    data["buy_or_sell"] = data["next_day_percentage_predicted"].apply(lambda x: 1 if x >= buy_threshold else (-1 if x <= sell_threshold else 0))
    data["depot_value"] = broker(stock_value, money_value, buy_threshold, sell_threshold, sell_percentage, data)
except Exception as e:
    logger("Error: " + str(e))

In [26]:
hold_percentage = round(((data['mean_price'].iloc[-1] / data['mean_price'].iloc[0]) - 1) * 100, 2)
spai_percentage = round(((data['depot_value'].iloc[-1] / data['depot_value'].iloc[0]) - 1) * 100, 2)

logger("Hold percentage: " + str(hold_percentage) + "%, SPAI percentage: " + str(spai_percentage)+"%")

Logger Message: Hold percentage: -18.96%, SPAI percentage: 19.83%


bring them to the same level

In [27]:
data['mean_price'] = data['mean_price'] / data['mean_price'].iloc[0]
data['depot_value'] = data['depot_value'] / data['depot_value'].iloc[0]

In [28]:
#get last value of buy_or_sell
today_buy_or_sell = data['buy_or_sell'].iloc[-1]
today = data['date'].iloc[-1]

if today_buy_or_sell == 1:
    print("Buy signal today ", today)
elif today_buy_or_sell == -1:
    print("Sell signal today", today)
elif today_buy_or_sell == 0:
    print("Hold signal today", today)

logger("Signal for today: " + str(today_buy_or_sell) + ", Date: " + str(today))

Sell signal today 2024-03-22 00:00:00
Logger Message: Signal for today: -1, Date: 2024-03-22 00:00:00


In [29]:
title = f"Hold: {hold_percentage}% vs. SPAI: {spai_percentage}% \n Buy threshold: {buy_threshold}, Sell threshold: {sell_threshold})"
plt.figure(figsize=(10, 6))

plt.plot(data['date'], data["mean_price"], label='Holding', marker='o', color='blue')
plt.plot(data['date'], data["depot_value"], label='Using SPAI', marker='x', color='purple')

for index, row in data.iterrows():
    if row['buy_or_sell'] >= 1:
        plt.bar(row['date'], height=0.1, bottom=row["mean_price"] - 0.05, color='green', width=1, alpha=0.5)
    elif row['buy_or_sell'] <= -1:
        plt.bar(row['date'], height=0.1, bottom=row["mean_price"] - 0.05, color='red', width=1, alpha=0.5)
plt.title(title)
plt.xlabel('Date')
plt.ylabel('Value')
plt.legend()
every_fifth_date = data['date'][::5]
plt.xticks(every_fifth_date, rotation=90)
plt.savefig('output/Simulation.png')

In [30]:
#save data to Output folder
data.to_csv('output/data_processed.csv', index=False)

In [31]:
time_end = pd.Timestamp.now()
time_delta = time_end - time_start
logger("Script ended: " + str(time_start))
logger("Script needed: " + str(time_delta))

Logger Message: Script ended: 2024-03-23 16:51:37.672885
Logger Message: Script needed: 0 days 00:01:55.306064
