# Random Forest

In [3]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import make_scorer, fbeta_score

# Load the training data and separate the target column from the predictors.
train_data_loaded = pd.read_csv('../data/train_data_2024-08-01.csv')
y = train_data_loaded['UKATEGORIE']
X = train_data_loaded.drop('UKATEGORIE', axis=1)

# Scorer wrapping F-beta with beta=2 (recall is weighted higher than precision).
fbeta_scorer = make_scorer(fbeta_score, beta=2)

# Shuffled, stratified 5-fold splitter; the fixed seed keeps folds reproducible.
skf = StratifiedKFold(5, shuffle=True, random_state=42)

# Random forest in which class 1 carries nine times the weight of class 0.
rfmodel = RandomForestClassifier(random_state=42, class_weight={0: 1, 1: 9})

# Cross-validated F-beta scores: the mean and the spread across folds are
# printed for the bias-variance analysis.
rf_scores = cross_val_score(
    estimator=rfmodel,
    X=X,
    y=y,
    scoring=fbeta_scorer,
    cv=skf,
)
print(f'Random Forest F-beta Score (mean): {rf_scores.mean()}')
print(f'Random Forest F-beta Score (std): {rf_scores.std()}')



# Fit the random forest on the complete dataset so that its feature
# importances can be inspected.
rfmodel.fit(X, y)

# Importance values exposed by the fitted forest.
feature_importances = rfmodel.feature_importances_

# One row per feature, pairing each column name with its importance.
importances_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importances}
)

# Rank the features from most to least important.
importancesdf = importances_df.sort_values('Importance', ascending=False)

# Keep and display the five highest-ranked features.
top_features = importancesdf.iloc[:5]
print(top_features)

# Compute and print the pairwise correlations among the top features' columns.
Xtop_features = X.loc[:, top_features['Feature'].tolist()]
correlation_matrix = Xtop_features.corr()
print("Korrelationsmatrix der Top Features:")
print(correlation_matrix)

Random Forest F-beta Score (mean): 0.05721014214538493
Random Forest F-beta Score (std): 0.010311509625039694
      Feature  Importance
3     USTUNDE    0.190419
0         BEZ    0.140222
4  UWOCHENTAG    0.127451
2      UMONAT    0.115603
6       UTYP1    0.068989
Korrelationsmatrix der Top Features:
             USTUNDE       BEZ  UWOCHENTAG    UMONAT     UTYP1
USTUNDE     1.000000 -0.019728    0.016865  0.000265  0.032250
BEZ        -0.019728  1.000000    0.004234 -0.001105 -0.006259
UWOCHENTAG  0.016865  0.004234    1.000000  0.011848  0.001495
UMONAT      0.000265 -0.001105    0.011848  1.000000 -0.005130
UTYP1       0.032250 -0.006259    0.001495 -0.005130  1.000000
