In [1]:
# Helper cell: report the current working directory and list everything
# found below the input directory.
import os

print(f"Aktuelles Arbeitsverzeichnis: {os.getcwd()}")

input_dir = '/kaggle/input/'

# Walk the input tree and print one four-line record per visited directory:
# its path, its sub-folders, its files and a separator line.
for root, dirs, files in os.walk(input_dir):
    print(
        f"Verzeichnis: {root}",
        f"Unterordner: {dirs}",
        f"Dateien: {files}",
        '---',
        sep='\n',
    )

Aktuelles Arbeitsverzeichnis: /kaggle/working
Verzeichnis: /kaggle/input/
Unterordner: []
Dateien: []
---


In [29]:
import pandas as pd
import numpy as np

# Central settings for this notebook: the Excel workbooks that can be read
# and the dates that are treated as outliers.
config = dict(
    excel_path_complete='/kaggle/input/datasets-training/DS_14_t_3days_complete.xlsx',
    excel_path_fin='/kaggle/input/datasets-training/DS_14_t_3days.xlsx',
    excel_path_sentiment='/kaggle/input/datasets-training/ecb_sentiment_analysis.xlsx',
    excel_path_sentiment_sum='/kaggle/input/datasets-training/ecb_sentiment_analysis_sum.xlsx',
    # Dates whose rows are removed as outliers
    remove_dates=['2024-12-12', '2022-06-09'],
)

# Load the complete workbook, parse the 'Date' column (day-first) and use it
# as the index. The one-hot encoded columns are cast to float64 in the same
# chain so that they all share one dtype.
onehot_cols = ['Index_DAX', 'Index_MDAX', 'Index_SDAX']
df_all = (
    pd.read_excel(config['excel_path_complete'])
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], dayfirst=True))
    .set_index('Date')
    .astype({name: np.float64 for name in onehot_cols})
)

# Remove outliers: drop every row whose date is listed in the config.
remove_dates = pd.to_datetime(config['remove_dates'])
is_outlier = df_all.index.isin(remove_dates)
df_filtered = df_all.loc[~is_outlier]

# FEATURE ENGINEERING
# Column groups used to assemble the model inputs and the targets.
price_columns = ['Close_t-4', 'Close_t-3', 'Close_t-2']
feature_columns_with_old = ['Index_MDAX', 'Index_SDAX', 'Interest Rate_Old', 'Interest Rate_Change']
sentiment_columns = ['FinBERT_Sentences', 'FinBERT_Chunks', 'RoBERTa_Sentences', 'RoBERTa_Chunks']
base_columns = price_columns + feature_columns_with_old
target_columns = ['Close', 'Close_t+1', 'Close_t+2']


def _pct_vs_reference(frame, column, reference='Close_t-1'):
    """Return `column` expressed as a percentage change relative to `reference`.

    Evaluates ``(frame[column] - frame[reference]) / frame[reference] * 100``
    element-wise and returns the resulting Series.
    """
    anchor = frame[reference]
    return (frame[column] - anchor) / anchor * 100


# Targets: a full copy of the filtered frame in which each target column is
# replaced by its percentage change versus 'Close_t-1'.
df_targets = df_filtered.copy()
for target_name in target_columns:
    df_targets[target_name] = _pct_vs_reference(df_filtered, target_name)

# Features: base and sentiment columns only; the price columns are rescaled
# to percentage changes versus 'Close_t-1', all other columns stay as loaded.
df_features = df_filtered[base_columns + sentiment_columns].copy()
for price_name in price_columns:
    df_features[price_name] = _pct_vs_reference(df_filtered, price_name)

# BUILD THE DATASET VARIANTS
def _assemble_dataset(features, targets, feature_names, target_names):
    """Return a new frame holding `feature_names` followed by `target_names`.

    Parameters
    ----------
    features : DataFrame providing the feature columns.
    targets : DataFrame providing the target columns.
    feature_names : list of column names copied from `features`.
    target_names : list of column names appended from `targets`.

    Raises
    ------
    ValueError
        If the two frames do not share an identical index. Column assignment
        aligns on index labels, so a differing index would either fill the
        targets with NaN or fail with a reindexing error when labels repeat.
    """
    if not features.index.equals(targets.index):
        raise ValueError("features and targets must share an identical index")

    dataset = features[feature_names].copy()
    for target_name in target_names:
        dataset[target_name] = targets[target_name]
    return dataset


# Combined dataset: base features plus all four sentiment scores.
df_data = _assemble_dataset(df_features, df_targets, base_columns + sentiment_columns, target_columns)

# 1. Base dataset (no sentiment)
df_base = _assemble_dataset(df_features, df_targets, base_columns, target_columns)

# 2. Base features + FinBERT sentence-level sentiment
df_fin_sen = _assemble_dataset(df_features, df_targets, base_columns + ['FinBERT_Sentences'], target_columns)

# 3. Base features + FinBERT chunk-level sentiment
df_fin_chk = _assemble_dataset(df_features, df_targets, base_columns + ['FinBERT_Chunks'], target_columns)

# 4. Base features + RoBERTa sentence-level sentiment
df_rob_sen = _assemble_dataset(df_features, df_targets, base_columns + ['RoBERTa_Sentences'], target_columns)

# 5. Base features + RoBERTa chunk-level sentiment
df_rob_chk = _assemble_dataset(df_features, df_targets, base_columns + ['RoBERTa_Chunks'], target_columns)

# EXPORT THE DATASETS AS EXCEL FILES
print("Exportiere Datensätze...")

# One entry per dataset, in export order:
# (heading for the shape report, output file name, frame).
# The first heading intentionally starts with a space so that the printed
# report stays exactly as before.
export_plan = [
    (" Dataset Shape", 'dataset.xlsx', df_data),
    ("Base Dataset Shape", 'dataset_base.xlsx', df_base),
    ("FinBERT Sentences Dataset Shape", 'dataset_finbert_sentences.xlsx', df_fin_sen),
    ("FinBERT Chunks Dataset Shape", 'dataset_finbert_chunks.xlsx', df_fin_chk),
    ("RoBERTa Sentences Dataset Shape", 'dataset_roberta_sentences.xlsx', df_rob_sen),
    ("RoBERTa Chunks Dataset Shape", 'dataset_roberta_chunks.xlsx', df_rob_chk),
]

# Write every workbook first; the success message is printed only afterwards.
for heading, file_name, frame in export_plan:
    frame.to_excel(file_name)

print("Alle Datensätze wurden erfolgreich exportiert!")

# Report the (rows, columns) shape of each exported frame.
for heading, file_name, frame in export_plan:
    print(f"{heading}: {frame.shape}")



Exportiere Datensätze...
Alle Datensätze wurden erfolgreich exportiert!
 Dataset Shape: (69, 14)
Base Dataset Shape: (69, 10)
FinBERT Sentences Dataset Shape: (69, 11)
FinBERT Chunks Dataset Shape: (69, 11)
RoBERTa Sentences Dataset Shape: (69, 11)
RoBERTa Chunks Dataset Shape: (69, 11)


In [23]:
# Show the combined dataset (feature columns followed by the target columns)
# as this cell's output.
df_data

Unnamed: 0_level_0,Close_t-4,Close_t-3,Close_t-2,Index_MDAX,Index_SDAX,Interest Rate_Old,Interest Rate_Change,FinBERT_Sentences,FinBERT_Chunks,RoBERTa_Sentences,RoBERTa_Chunks,Close,Close_t+1,Close_t+2
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2022-07-21,-3.141550,-2.425617,0.198991,0.0,0.0,-0.50,0.00,0.260,0.245,0.094,0.231,-0.266075,-0.213071,-0.539528
2022-07-21,-4.265666,-2.903723,-0.677581,1.0,0.0,-0.50,0.00,0.260,0.245,0.094,0.231,-0.150470,0.299553,-0.238834
2022-07-21,-4.213593,-2.455096,-0.800936,0.0,1.0,-0.50,0.00,0.260,0.245,0.094,0.231,0.188718,0.348937,-0.145770
2022-09-08,1.039798,-1.201536,-0.344767,0.0,0.0,0.00,0.50,0.012,-0.006,-0.174,-0.245,-0.090198,1.333543,3.765106
2022-09-08,0.663098,-1.308234,-0.444066,1.0,0.0,0.00,0.50,0.012,-0.006,-0.174,-0.245,0.519197,2.818977,4.762705
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-04-17,-5.308845,-2.175038,0.172745,1.0,0.0,2.50,-0.25,-0.095,-0.410,-0.036,0.064,-0.260367,0.216097,1.630939
2025-04-17,-4.615226,-1.752020,0.411304,0.0,1.0,2.50,-0.25,-0.095,-0.410,-0.036,0.064,-0.209780,-0.233757,1.529228
2025-06-05,-1.149261,-1.424465,-0.761478,0.0,0.0,2.25,-0.25,0.118,-0.095,0.031,-0.043,0.194015,0.115256,-0.420819
2025-06-05,-1.403263,-0.882151,-0.826560,1.0,0.0,2.25,-0.25,0.118,-0.095,0.031,-0.043,0.340866,-0.498488,-0.509284
