In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# 1. Datensatz laden
df = pd.read_csv("processed_dataset.csv")

# 2. Features und Zielvariable trennen
X = df.drop(columns=['Type of Answer'])  # Alle Spalten außer der Zielvariable
y = df['Type of Answer']                # Zielvariable

# 3. Trainings- und Testdaten erstellen
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. RandomForestClassifier trainieren
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# 5. Vorhersagen machen
y_pred = rf.predict(X_test)

# 6. Modellbewertung
print("Genauigkeit des Modells:", accuracy_score(y_test, y_pred))
print("\nKlassifikationsbericht:")
print(classification_report(y_test, y_pred))

# Optional: Wichtigkeit der Features anzeigen
importances = pd.Series(rf.feature_importances_, index=X.columns)
print("\nWichtigste Features:")
print(importances.sort_values(ascending=False))


Genauigkeit des Modells: 0.5931937172774869

Klassifikationsbericht:
              precision    recall  f1-score   support

           0       0.61      0.60      0.61      1002
           1       0.57      0.58      0.58       908

    accuracy                           0.59      1910
   macro avg       0.59      0.59      0.59      1910
weighted avg       0.59      0.59      0.59      1910


Wichtigste Features:
Student ID                         0.531538
Question ID                        0.223792
Student Country                    0.085377
Question Level                     0.011054
Topic                              0.005621
                                     ...   
complement of a set                0.000051
venn diagram                       0.000049
partial fractions decomposition    0.000034
sensitivity analysis               0.000022
direct integrals                   0.000022
Length: 225, dtype: float64


In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Hyperparameter-Raster für GridSearch
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# RandomForestClassifier mit GridSearchCV
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Bestes Modell anzeigen
best_rf = grid_search.best_estimator_
print("Beste Parameter:", grid_search.best_params_)

# Vorhersagen mit dem besten Modell
y_pred = best_rf.predict(X_test)

# Modellbewertung
print("Genauigkeit des optimierten Modells:", accuracy_score(y_test, y_pred))
print("\nKlassifikationsbericht:")
print(classification_report(y_test, y_pred))


Fitting 3 folds for each of 216 candidates, totalling 648 fits
Beste Parameter: {'bootstrap': True, 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 300}
Genauigkeit des optimierten Modells: 0.6047120418848168

Klassifikationsbericht:
              precision    recall  f1-score   support

           0       0.60      0.75      0.67      1002
           1       0.62      0.44      0.52       908

    accuracy                           0.60      1910
   macro avg       0.61      0.60      0.59      1910
weighted avg       0.61      0.60      0.59      1910



In [3]:
# RandomForest mit ausgewogenem Klassen-Gewicht
rf_balanced = RandomForestClassifier(
    random_state=42,
    n_estimators=300,
    max_depth=20,
    min_samples_leaf=2,
    min_samples_split=5,
    class_weight='balanced'  # Gewichtung hinzufügen
)

rf_balanced.fit(X_train, y_train)
y_pred_balanced = rf_balanced.predict(X_test)

# Bewertung
print("Genauigkeit mit Klassen-Gewichtung:", accuracy_score(y_test, y_pred_balanced))
print("\nKlassifikationsbericht mit Klassen-Gewichtung:")
print(classification_report(y_test, y_pred_balanced))


Genauigkeit mit Klassen-Gewichtung: 0.6094240837696335

Klassifikationsbericht mit Klassen-Gewichtung:
              precision    recall  f1-score   support

           0       0.62      0.66      0.64      1002
           1       0.60      0.56      0.58       908

    accuracy                           0.61      1910
   macro avg       0.61      0.61      0.61      1910
weighted avg       0.61      0.61      0.61      1910



In [4]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

# SMOTE anwenden
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# RandomForest trainieren
rf_smote = RandomForestClassifier(
    random_state=42,
    n_estimators=300,
    max_depth=20,
    min_samples_leaf=2,
    min_samples_split=5
)
rf_smote.fit(X_train_resampled, y_train_resampled)
y_pred_smote = rf_smote.predict(X_test)

# Bewertung
print("Genauigkeit nach SMOTE:", accuracy_score(y_test, y_pred_smote))
print("\nKlassifikationsbericht nach SMOTE:")
print(classification_report(y_test, y_pred_smote))


Genauigkeit nach SMOTE: 0.6068062827225131

Klassifikationsbericht nach SMOTE:
              precision    recall  f1-score   support

           0       0.63      0.62      0.62      1002
           1       0.59      0.59      0.59       908

    accuracy                           0.61      1910
   macro avg       0.61      0.61      0.61      1910
weighted avg       0.61      0.61      0.61      1910



In [17]:
# Features mit unterschiedlichen Schwellenwerten auswählen und testen
for threshold in [0.001, 0.002, 0.003, 0.005]:
    important_features = importances[importances > threshold].index
    X_train_reduced = X_train[important_features]
    X_test_reduced = X_test[important_features]
    
    rf_reduced = RandomForestClassifier(
        random_state=42,
        n_estimators=300,
        max_depth=20,
        min_samples_leaf=2,
        min_samples_split=5
    )
    rf_reduced.fit(X_train_reduced, y_train)
    y_pred_reduced = rf_reduced.predict(X_test_reduced)
    
    print(f"\nSchwellenwert: {threshold}")
    print("Genauigkeit:", accuracy_score(y_test, y_pred_reduced))
    print(classification_report(y_test, y_pred_reduced))



Schwellenwert: 0.001
Genauigkeit: 0.6057591623036649
              precision    recall  f1-score   support

           0       0.61      0.68      0.64      1002
           1       0.60      0.53      0.56       908

    accuracy                           0.61      1910
   macro avg       0.60      0.60      0.60      1910
weighted avg       0.60      0.61      0.60      1910


Schwellenwert: 0.002
Genauigkeit: 0.6308900523560209
              precision    recall  f1-score   support

           0       0.64      0.68      0.66      1002
           1       0.62      0.58      0.60       908

    accuracy                           0.63      1910
   macro avg       0.63      0.63      0.63      1910
weighted avg       0.63      0.63      0.63      1910


Schwellenwert: 0.003
Genauigkeit: 0.6408376963350786
              precision    recall  f1-score   support

           0       0.65      0.67      0.66      1002
           1       0.62      0.61      0.62       908

    accuracy        

In [16]:
# Features mit geringer Bedeutung entfernen
important_features = importances[importances > 0.003].index
X_train_reduced = X_train[important_features]
X_test_reduced = X_test[important_features]

# Modell mit reduzierten Features trainieren
rf_reduced = RandomForestClassifier(
    random_state=42,
    n_estimators=300,
    max_depth=20,
    min_samples_leaf=2,
    min_samples_split=5
)
rf_reduced.fit(X_train_reduced, y_train)
y_pred_reduced = rf_reduced.predict(X_test_reduced)

print("Genauigkeit mit reduzierten Features:", accuracy_score(y_test, y_pred_reduced))
print("\nKlassifikationsbericht mit reduzierten Features:")
print(classification_report(y_test, y_pred_reduced))


Genauigkeit mit reduzierten Features: 0.6408376963350786

Klassifikationsbericht mit reduzierten Features:
              precision    recall  f1-score   support

           0       0.65      0.67      0.66      1002
           1       0.62      0.61      0.62       908

    accuracy                           0.64      1910
   macro avg       0.64      0.64      0.64      1910
weighted avg       0.64      0.64      0.64      1910

