In [50]:
# Importeer benodigde bibliotheken
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV



def fin_model_2(df, param_grid=None):
    """
    Verwerkt de dataset en retourneert signalen en gebalanceerde datasets.
    """
    # Reset index en verwerk data
    if getattr(df.index, 'name', None) == 'Date' or isinstance(df.index, pd.DatetimeIndex):
        df = df.reset_index()

    # Vereiste kolommen
    try:
        df2 = df[['Date', 'Close', 'High', 'Low', 'Open', 'Volume']].copy()
    except KeyError as e:
        raise KeyError(f"Een van de vereiste kolommen ontbreekt in de DataFrame: {e}")
    
    df2 = df2[df2.High != df2.Low]
    df2.reset_index(inplace=True, drop=True)

    # Bereken technische indicatoren
    df2['sma_50'] = df2['Close'].rolling(window=50).mean()
    df2['sma_200'] = df2['Close'].rolling(window=200).mean()
    df2['momentum'] = df2['Close'] - df2['Close'].shift(10)
    df2['volatility'] = df2['Close'].rolling(window=20).std()
    df2['roc'] = ((df2['Close'] - df2['Close'].shift(12)) / df2['Close'].shift(12)) * 100

    # Drop NaN-waarden
    df2 = df2.dropna()

    # Signal-logica
    df2['Signal_fin_model_2'] = 0
    for i in range(1, len(df2)):
        if (df2['Close'].iloc[i] > df2['sma_50'].iloc[i] and
            df2['sma_50'].iloc[i] > df2['sma_200'].iloc[i] and
            df2['momentum'].iloc[i] > 0 and
            df2['volatility'].iloc[i] > df2['volatility'].mean()):
            df2.at[i, 'Signal_fin_model_2'] = 1

    # Bereid de data voor modeltraining
    feature_columns = ['Close', 'High', 'Low', 'Open', 'Volume', 'momentum', 'volatility', 'roc']
    X = df2[feature_columns]
    y = df2['Signal_fin_model_2']

    # Schalen en balanceren
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    sm = SMOTE(random_state=42)
    X_balanced, y_balanced = sm.fit_resample(X_scaled, y)

    return df, X_balanced, y_balanced



In [56]:
df = pd.read_csv('..\df_fin_model_1.csv')
df

Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume,Change,Target,MA_5,...,Money Stock (M2),VIX,Technology,Energy,Financials,Healthcare,10Yr_Treasury_Rate,Linear_model,Signal_fin_model_1,Probability_fin_model_1
0,2011-02-15,103.293663,133.009995,133.220001,132.320007,133.020004,119575400,-0.003148,0,132.828000,...,,16.370001,26.840000,74.949997,13.866775,32.419998,3.616,82.841288,,
1,2011-02-16,103.946014,133.850006,134.009995,133.190002,133.460007,130183500,0.006315,1,133.144000,...,,16.719999,26.990000,75.949997,13.939886,32.590000,3.621,82.956981,,
2,2011-02-17,104.256584,134.250000,134.429993,133.339996,133.460007,109810500,0.002988,1,133.529999,...,,16.590000,27.040001,76.699997,13.923639,32.730000,3.574,83.072675,,
3,2011-02-18,104.474052,134.529999,134.690002,134.059998,134.369995,130002400,0.002086,1,133.813998,...,,16.430000,27.000000,77.019997,13.956133,32.810001,3.589,83.188368,,
4,2011-02-22,102.377258,131.830002,134.559998,131.470001,133.119995,233116400,-0.020070,0,133.494000,...,,20.799999,26.340000,76.269997,13.525589,32.310001,3.461,83.304062,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3472,2024-12-03,603.909973,603.909973,604.159973,602.340027,603.390015,26906600,0.000464,1,601.914001,...,21311.2,13.300000,236.800003,94.510002,50.450001,146.820007,4.223,484.529218,0.0,0.106271
3473,2024-12-04,607.659973,607.659973,607.909973,604.950012,605.630005,42787600,0.006210,1,603.315991,...,21311.2,13.450000,241.130005,92.230003,50.259998,146.740005,4.180,484.644911,0.0,0.091987
3474,2024-12-05,606.659973,606.659973,608.479980,606.299988,607.659973,28762200,-0.001646,0,604.881982,...,21311.2,13.540000,239.910004,92.589996,50.410000,145.100006,4.180,484.760605,0.0,0.095634
3475,2024-12-06,607.809998,607.809998,609.070007,607.020020,607.440002,31241500,0.001896,1,605.933984,...,21311.2,12.770000,240.839996,91.019997,50.400002,144.279999,4.151,484.876298,0.0,0.090502


In [57]:
df= fin_model_2(df)
df

Aantal rijen vóór dropna: 3477
Date            0
Close           0
High            0
Low             0
Open            0
Volume          0
sma_50         49
sma_200       199
momentum       10
volatility     19
roc            12
dtype: int64
Aantal rijen na dropna: 3278


(            Date   Adj Close       Close        High         Low        Open  \
 0     2011-02-15  103.293663  133.009995  133.220001  132.320007  133.020004   
 1     2011-02-16  103.946014  133.850006  134.009995  133.190002  133.460007   
 2     2011-02-17  104.256584  134.250000  134.429993  133.339996  133.460007   
 3     2011-02-18  104.474052  134.529999  134.690002  134.059998  134.369995   
 4     2011-02-22  102.377258  131.830002  134.559998  131.470001  133.119995   
 ...          ...         ...         ...         ...         ...         ...   
 3472  2024-12-03  603.909973  603.909973  604.159973  602.340027  603.390015   
 3473  2024-12-04  607.659973  607.659973  607.909973  604.950012  605.630005   
 3474  2024-12-05  606.659973  606.659973  608.479980  606.299988  607.659973   
 3475  2024-12-06  607.809998  607.809998  609.070007  607.020020  607.440002   
 3476  2024-12-09  604.679993  604.679993  607.859985  604.080017  607.690002   
 
          Volume    Change