In [1]:
import pandas as pd

# Den zuvor gespeicherten Datensatz laden
df = pd.read_csv("processed_dataset.csv")

# Features (X) und Zielvariable (y) definieren
# Zielvariable hier als Beispiel: 'Type of Answer'
X = df.drop(columns=['Type of Answer'])  # Features
y = df['Type of Answer']  # Zielvariable

print("Daten erfolgreich geladen.")
print(f"Feature-Matrix (X): {X.shape}")
print(f"Zielvariable (y): {y.shape}")


Daten erfolgreich geladen.
Feature-Matrix (X): (9546, 225)
Zielvariable (y): (9546,)


In [2]:
from sklearn.model_selection import train_test_split

# Daten in Trainings- und Testdaten aufteilen
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Trainingsdaten: {X_train.shape}, Testdaten: {X_test.shape}")


Trainingsdaten: (7636, 225), Testdaten: (1910, 225)


In [3]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

# XGBoost-Klassifizierer initialisieren
xgb_clf = XGBClassifier(random_state=42)

# Modell trainieren
xgb_clf.fit(X_train, y_train)

# Vorhersagen auf Testdaten
y_pred = xgb_clf.predict(X_test)

# Ergebnisse bewerten
print("Klassifikationsbericht:")
print(classification_report(y_test, y_pred))
print(f"Genauigkeit: {accuracy_score(y_test, y_pred):.2f}")


Klassifikationsbericht:
              precision    recall  f1-score   support

           0       0.66      0.68      0.67      1002
           1       0.63      0.61      0.62       908

    accuracy                           0.65      1910
   macro avg       0.64      0.64      0.64      1910
weighted avg       0.64      0.65      0.64      1910

Genauigkeit: 0.65


In [4]:
from sklearn.model_selection import GridSearchCV

# Parameter für die Suche definieren
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
}

# GridSearchCV einrichten
grid_search = GridSearchCV(
    estimator=XGBClassifier(random_state=42),
    param_grid=param_grid,
    cv=3,  # 3-fache Kreuzvalidierung
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

# Modell trainieren
grid_search.fit(X_train, y_train)

# Beste Parameter ausgeben
print(f"Beste Parameter: {grid_search.best_params_}")

# Bestes Modell bewerten
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
print("Klassifikationsbericht (Optimiertes Modell):")
print(classification_report(y_test, y_pred_best))
print(f"Genauigkeit: {accuracy_score(y_test, y_pred_best):.2f}")


Fitting 3 folds for each of 27 candidates, totalling 81 fits
Beste Parameter: {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 100}
Klassifikationsbericht (Optimiertes Modell):
              precision    recall  f1-score   support

           0       0.65      0.68      0.66      1002
           1       0.63      0.60      0.61       908

    accuracy                           0.64      1910
   macro avg       0.64      0.64      0.64      1910
weighted avg       0.64      0.64      0.64      1910

Genauigkeit: 0.64


In [None]:
param_grid_extended = {
    'max_depth': [5, 7, 9],
    'learning_rate': [0.1, 0.2, 0.3],
    'n_estimators': [100, 200, 300],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'subsample': [0.7, 0.8, 1.0],
    'gamma': [0, 1, 5],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [1, 1.5, 2],
}

grid_search_ext = GridSearchCV(
    estimator=XGBClassifier(random_state=42),
    param_grid=param_grid_extended,
    cv=3,  # 3-fache Kreuzvalidierung
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

grid_search_ext.fit(X_train, y_train)

print(f"Beste Parameter (erweiterte Suche): {grid_search_ext.best_params_}")

best_model_ext = grid_search_ext.best_estimator_
y_pred_ext = best_model_ext.predict(X_test)
print("Klassifikationsbericht (Erweitertes Modell):")
print(classification_report(y_test, y_pred_ext))
print(f"Genauigkeit: {accuracy_score(y_test, y_pred_ext):.2f}")


In [6]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier

param_distributions = {
    'max_depth': [5, 7, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'subsample': [0.7, 0.8, 1.0],
    'gamma': [0, 1, 5],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [1, 1.5, 2],
}

random_search = RandomizedSearchCV(
    estimator=XGBClassifier(random_state=42),
    param_distributions=param_distributions,
    n_iter=100,  # Anzahl der zufälligen Kombinationen
    scoring='accuracy',
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_train, y_train)
print(f"Beste Parameter: {random_search.best_params_}")


Fitting 3 folds for each of 100 candidates, totalling 300 fits
Beste Parameter: {'subsample': 0.8, 'reg_lambda': 2, 'reg_alpha': 0, 'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.2, 'gamma': 1, 'colsample_bytree': 0.7}


In [11]:
!pip install optuna

import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

def objective(trial):
    param = {
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 1, 2),
    }

    model = XGBClassifier(**param, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy')
    return scores.mean()

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

print(f"Beste Parameter: {study.best_params}")


Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
Downloading alembic-1.14.0-py3-none-any.whl (233 kB)
Downloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.8 alembic-1.14.0 colorlog-6.9.0 optuna-4.1.0


[I 2024-12-09 02:06:40,870] A new study created in memory with name: no-name-7cd4b00c-3a55-4155-b21b-bcc5743c5728
[I 2024-12-09 02:06:41,250] Trial 0 finished with value: 0.6196955970843745 and parameters: {'max_depth': 9, 'learning_rate': 0.19208896249113902, 'n_estimators': 164, 'colsample_bytree': 0.513829268781407, 'subsample': 0.5983604561999367, 'gamma': 4.888841211719245, 'reg_alpha': 0.6058620041619708, 'reg_lambda': 1.034914093155283}. Best is trial 0 with value: 0.6196955970843745.
[I 2024-12-09 02:06:41,805] Trial 1 finished with value: 0.6259821253570839 and parameters: {'max_depth': 3, 'learning_rate': 0.10265931528356925, 'n_estimators': 273, 'colsample_bytree': 0.8584053220108572, 'subsample': 0.5010387385684734, 'gamma': 3.5030851436208383, 'reg_alpha': 0.4627313006483803, 'reg_lambda': 1.891565627469794}. Best is trial 1 with value: 0.6259821253570839.
[I 2024-12-09 02:06:42,128] Trial 2 finished with value: 0.6402562721497466 and parameters: {'max_depth': 6, 'learning

Beste Parameter: {'max_depth': 6, 'learning_rate': 0.14751782232750854, 'n_estimators': 275, 'colsample_bytree': 0.7766488973382092, 'subsample': 0.9516562498035502, 'gamma': 0.03367330199720531, 'reg_alpha': 0.5167403897206998, 'reg_lambda': 1.4729346808859045}


In [12]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

# Konfiguration aus RandomizedSearchCV
model_random_search = XGBClassifier(
    subsample=0.8,
    reg_lambda=2,
    reg_alpha=0,
    n_estimators=300,
    max_depth=7,
    learning_rate=0.2,
    gamma=1,
    colsample_bytree=0.7,
    random_state=42
)

# Konfiguration aus Optuna
model_optuna = XGBClassifier(
    max_depth=6,
    learning_rate=0.14751782232750854,
    n_estimators=275,
    colsample_bytree=0.7766488973382092,
    subsample=0.9516562498035502,
    gamma=0.03367330199720531,
    reg_alpha=0.5167403897206998,
    reg_lambda=1.4729346808859045,
    random_state=42
)

# Training und Bewertung des RandomizedSearchCV-Modells
model_random_search.fit(X_train, y_train)
y_pred_random_search = model_random_search.predict(X_test)
print("RandomizedSearchCV-Modell:")
print(classification_report(y_test, y_pred_random_search))
print(f"Genauigkeit: {accuracy_score(y_test, y_pred_random_search):.2f}")

# Training und Bewertung des Optuna-Modells
model_optuna.fit(X_train, y_train)
y_pred_optuna = model_optuna.predict(X_test)
print("\nOptuna-Modell:")
print(classification_report(y_test, y_pred_optuna))
print(f"Genauigkeit: {accuracy_score(y_test, y_pred_optuna):.2f}")


RandomizedSearchCV-Modell:
              precision    recall  f1-score   support

           0       0.65      0.66      0.66      1002
           1       0.62      0.61      0.62       908

    accuracy                           0.64      1910
   macro avg       0.64      0.64      0.64      1910
weighted avg       0.64      0.64      0.64      1910

Genauigkeit: 0.64

Optuna-Modell:
              precision    recall  f1-score   support

           0       0.67      0.66      0.67      1002
           1       0.63      0.64      0.63       908

    accuracy                           0.65      1910
   macro avg       0.65      0.65      0.65      1910
weighted avg       0.65      0.65      0.65      1910

Genauigkeit: 0.65


In [14]:
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

# 1. Initiales Modell für Feature-Importance
initial_model = XGBClassifier(
    max_depth=6,
    learning_rate=0.14751782232750854,
    n_estimators=275,
    colsample_bytree=0.7766488973382092,
    subsample=0.9516562498035502,
    gamma=0.03367330199720531,
    reg_alpha=0.5167403897206998,
    reg_lambda=1.4729346808859045,
    random_state=42
)
initial_model.fit(X_train, y_train)

# 2. Feature-Importance anzeigen
importances = initial_model.feature_importances_
print("Feature-Importances:")
for i, col in enumerate(X_train.columns):
    print(f"{col}: {importances[i]}")

# 3. Feature-Selection basierend auf Importance
selector = SelectFromModel(initial_model, threshold="mean", prefit=True)  # Features oberhalb des Mittelwerts
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

print(f"Reduzierte Feature-Matrix nach Selektion: {X_train_selected.shape}")

# 4. SMOTE anwenden
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_selected, y_train)

print(f"Trainingsdaten nach SMOTE: {X_train_resampled.shape}")

# 5. Modell trainieren mit ausgewählten Features und resampleten Daten
final_model = XGBClassifier(
    max_depth=6,
    learning_rate=0.14751782232750854,
    n_estimators=275,
    colsample_bytree=0.7766488973382092,
    subsample=0.9516562498035502,
    gamma=0.03367330199720531,
    reg_alpha=0.5167403897206998,
    reg_lambda=1.4729346808859045,
    random_state=42
)
final_model.fit(X_train_resampled, y_train_resampled)

# 6. Vorhersagen und Bewertung
y_pred_final = final_model.predict(X_test_selected)
print("Klassifikationsbericht (Finales Modell):")
print(classification_report(y_test, y_pred_final))
print(f"Genauigkeit: {accuracy_score(y_test, y_pred_final):.2f}")


Feature-Importances:
Student ID: 0.010249266400933266
Student Country: 0.010869559831917286
Question ID: 0.005923223681747913
Question Level: 0.010677258484065533
Topic: 0.01424817182123661
 and inequalities: 0.015386383980512619
 and total probability rules: 0.0032536815851926804
 equations: 0.006425110157579184
 image and graphics: 0.003429036121815443
 integration by parts: 0.0
 multiplication: 0.004050378687679768
addition: 0.0025374596007168293
adjacency matrix: 0.0037392766680568457
algebraic expressions: 0.0
algebraic form: 0.004006892908364534
analytic geometry: 0.00290168565697968
area : 0.0010339259169995785
area of a planar region : 0.0
assignment problem: 0.0
axioms of probability: 0.003928063903003931
basis: 0.007506234105676413
bayes' theorem: 0.0
bernoulli equation: 0.0
cartesian coordinates: 0.0036970977671444416
cartesian equations of a line: 0.0038038957864046097
cartesian equations of a plane: 0.008650450967252254
cauchy problem: 0.0
chain rule: 0.0
change-of-basis m



Klassifikationsbericht (Finales Modell):
              precision    recall  f1-score   support

           0       0.66      0.62      0.64      1002
           1       0.61      0.65      0.63       908

    accuracy                           0.64      1910
   macro avg       0.64      0.64      0.64      1910
weighted avg       0.64      0.64      0.64      1910

Genauigkeit: 0.64


In [15]:
from sklearn.decomposition import PCA

# PCA anwenden, um die Dimensionen zu reduzieren
pca = PCA(n_components=20)  # Anzahl der Hauptkomponenten anpassen
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Neues Modell trainieren
model_pca = XGBClassifier(random_state=42)
model_pca.fit(X_train_pca, y_train)
y_pred_pca = model_pca.predict(X_test_pca)

print("Klassifikationsbericht (PCA):")
print(classification_report(y_test, y_pred_pca))


Klassifikationsbericht (PCA):
              precision    recall  f1-score   support

           0       0.63      0.63      0.63      1002
           1       0.59      0.59      0.59       908

    accuracy                           0.61      1910
   macro avg       0.61      0.61      0.61      1910
weighted avg       0.61      0.61      0.61      1910

