In [1]:
from sklearn.datasets import make_classification

from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint
import pandas as pd
from sklearn.metrics import make_scorer, fbeta_score



In [2]:
df = pd.read_csv('GeneralDatensatz18-21ohneGeo-mitLockdown_mitCorona.csv', sep=';')

X=df[['UMONAT','USTUNDE','UWOCHENTAG','UART','USTRZUSTAND','BEZ','UTYP1','ULICHTVERH','IstRad','IstPKW','IstFuss','IstKrad','IstGkfz','IstSonstige', 'LOCKDOWN', 'COVID']]
#für tödliche Unfälle

#für tödliche und schwere vs. leichte Unfälle  -> 1 ist schwer oder tödlich, 0 ist leicht
y = df['UKATEGORIE'].isin([1, 2]).astype(int)

In [3]:
beta = 2
fbeta_scorer= make_scorer(fbeta_score, beta=beta)

In [4]:
param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(10, 50),
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 11),
    'max_features': ['auto', 'sqrt', 'log2'],
    'class_weight': ['balanced', 'balanced_subsample', {0: 1, 1: 10}]
}

stratified_kfold = StratifiedKFold(n_splits=5)
rf = RandomForestClassifier()
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=100, cv=stratified_kfold, scoring=fbeta_scorer)
random_search.fit(X, y)
best_params = random_search.best_params_

In [5]:
best_params

{'class_weight': {0: 1, 1: 10},
 'max_depth': 14,
 'max_features': 'sqrt',
 'min_samples_leaf': 10,
 'min_samples_split': 3,
 'n_estimators': 331}