In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report

# 1. Datensatz laden
file_path = "processed_dataset.csv"  # Pfad zu Ihrem gespeicherten Datensatz
df = pd.read_csv(file_path)

# 2. Features und Zielvariable definieren
# Annahme: Die Zielvariable heißt "target". Passen Sie dies an Ihre Zielspalte an.
X = df.drop(columns=['Type of Answer'])  # Unabhängige Variablen
y = df['Type of Answer']                # Zielvariable

# 3. Trainings- und Testdaten aufteilen
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. LightGBM-Classifier trainieren
lgbm_model = LGBMClassifier(random_state=42)
lgbm_model.fit(X_train, y_train)

# 5. Vorhersagen treffen
y_pred = lgbm_model.predict(X_test)

# 6. Modellbewertung
accuracy = accuracy_score(y_test, y_pred)
print(f"Genauigkeit: {accuracy:.2f}")

# Detaillierter Bericht
print("\nKlassifikationsbericht:")
print(classification_report(y_test, y_pred))


[LightGBM] [Info] Number of positive: 3562, number of negative: 4074
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000780 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 824
[LightGBM] [Info] Number of data points in the train set: 7636, number of used features: 161
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466475 -> initscore=-0.134303
[LightGBM] [Info] Start training from score -0.134303
Genauigkeit: 0.65

Klassifikationsbericht:
              precision    recall  f1-score   support

           0       0.66      0.67      0.66      1002
           1       0.63      0.62      0.62       908

    accuracy                           0.65      1910
   macro avg       0.64      0.64      0.64      1910
weighted avg       0.64      0.65      0.64      1910



In [2]:
# Entfernen von Whitespaces in den Feature-Namen
X.columns = [col.replace(' ', '_') for col in X.columns]


In [3]:
from lightgbm import LGBMClassifier

# Modell mit ausgeglichenem Klassengewicht
lgbm_model = LGBMClassifier(class_weight='balanced', random_state=42)
lgbm_model.fit(X_train, y_train)

# Bewertung
y_pred = lgbm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Genauigkeit mit Klassengewicht: {accuracy:.2f}")


[LightGBM] [Info] Number of positive: 3562, number of negative: 4074
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000759 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 824
[LightGBM] [Info] Number of data points in the train set: 7636, number of used features: 161
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Genauigkeit mit Klassengewicht: 0.65


In [4]:
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier

# Parameter für GridSearch
params = {
    'num_leaves': [31, 50, 100],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 500],
    'min_child_samples': [20, 50, 100]
}

# GridSearchCV anwenden
grid = GridSearchCV(LGBMClassifier(random_state=42), params, cv=3, scoring='accuracy')
grid.fit(X_train, y_train)

print(f"Beste Parameter: {grid.best_params_}")

# Modell mit besten Parametern
best_model = grid.best_estimator_

# Bewertung
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Genauigkeit nach Hyperparameter-Tuning: {accuracy:.2f}")


[LightGBM] [Info] Number of positive: 2374, number of negative: 2716
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000681 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 783
[LightGBM] [Info] Number of data points in the train set: 5090, number of used features: 142
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466405 -> initscore=-0.134584
[LightGBM] [Info] Start training from score -0.134584
[LightGBM] [Info] Number of positive: 2375, number of negative: 2716
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000673 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 786
[LightGBM] [Info] Number of data points in the train set: 5091, number of used features: 146
[LightGBM] [Info] [binar

In [5]:
# Feature-Importance
import pandas as pd

feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': lgbm_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

print(feature_importance.head(50))


                                Feature  Importance
0                            Student ID        1281
2                           Question ID         667
1                       Student Country         165
4                                 Topic          87
3                        Question Level          66
5                      and inequalities          27
220                       vector spaces          25
107                              limits          21
119              linear transformations          20
116                  linear programming          20
97               integration techniques          18
101          invertible linear operator          17
204                            subspace          17
213                  trigonometric form          16
124                    logarithmic rule          16
56                            dimension          15
44                    consistent system          15
109                  linear application          15
143     oper

In [16]:
# Schwellenwert für Importance setzen
threshold = 8

# Features mit Importance >= threshold auswählen
important_features = feature_importance[feature_importance['Importance'] >= threshold]['Feature']

# Reduzierter Datensatz mit wichtigen Features
X_train_reduced = X_train[important_features]
X_test_reduced = X_test[important_features]

# Modell trainieren und bewerten
lgbm_model = LGBMClassifier(random_state=42)
lgbm_model.fit(X_train_reduced, y_train)

# Vorhersagen und Bewertung
y_pred = lgbm_model.predict(X_test_reduced)
accuracy = accuracy_score(y_test, y_pred)
print(f"Genauigkeit mit reduzierten Features: {accuracy:.2f}")


[LightGBM] [Info] Number of positive: 3562, number of negative: 4074
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001626 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 591
[LightGBM] [Info] Number of data points in the train set: 7636, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.466475 -> initscore=-0.134303
[LightGBM] [Info] Start training from score -0.134303
Genauigkeit mit reduzierten Features: 0.65


In [7]:
# Korrelation der wichtigen Features mit der Zielvariable
correlation_with_target = df[['Student ID', 'Question ID', 'Type of Answer']].corr()
print(correlation_with_target)


                Student ID  Question ID  Type of Answer
Student ID        1.000000     0.033439       -0.030064
Question ID       0.033439     1.000000       -0.013683
Type of Answer   -0.030064    -0.013683        1.000000


In [8]:
from sklearn.model_selection import cross_val_score

# Cross-Validation
scores = cross_val_score(LGBMClassifier(random_state=42), X, y, cv=5, scoring='accuracy')
print(f"Cross-Validation-Genauigkeiten: {scores}")
print(f"Durchschnittliche Genauigkeit: {scores.mean():.2f}")


[LightGBM] [Info] Number of positive: 3576, number of negative: 4060
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000991 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 785
[LightGBM] [Info] Number of data points in the train set: 7636, number of used features: 148
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.468308 -> initscore=-0.126938
[LightGBM] [Info] Start training from score -0.126938
[LightGBM] [Info] Number of positive: 3576, number of negative: 4061
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000674 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 800
[LightGBM] [Info] Number of data points in the train set: 7637, number of used features: 149
[LightGBM] [Info] [binar