In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Laden des verarbeiteten Datensatzes
df = pd.read_csv('processed_dataset.csv')
print("Daten erfolgreich geladen:")
print(df.head())

Daten erfolgreich geladen:
   Student ID  Student Country  Question ID  Type of Answer  Question Level  \
0         647                0           77               0               1   
1          41                3           77               1               1   
2         340                3           77               1               1   
3         641                1           77               0               1   
4         669                3           77               1               1   

   Topic   and inequalities   and total probability rules   equations  \
0     13                  0                             0           0   
1     13                  0                             0           0   
2     13                  0                             0           0   
3     13                  0                             0           0   
4     13                  0                             0           0   

    image and graphics  ...  trigonometric substitution  \


In [2]:
# Zielvariable und Features trennen
# Beispiel: Annahme, dass die Zielvariable in der letzten Spalte ist
y = df['Type of Answer']  # Passen Sie den Namen an, falls erforderlich
X = df.drop(columns=['Type of Answer'])  # Entfernen der Zielvariable aus den Features

print("Features und Zielvariable definiert:")
print(f"Features: {X.columns.tolist()}")
print(f"Zielvariable: {y.name}")


Features und Zielvariable definiert:
Features: ['Student ID', 'Student Country', 'Question ID', 'Question Level', 'Topic', ' and inequalities', ' and total probability rules', ' equations', ' image and graphics', ' integration by parts', ' multiplication', 'addition', 'adjacency matrix', 'algebraic expressions', 'algebraic form', 'analytic geometry', 'area ', 'area of a planar region ', 'assignment problem', 'axioms of probability', 'basis', "bayes' theorem", 'bernoulli equation', 'cartesian coordinates', 'cartesian equations of a line', 'cartesian equations of a plane', 'cauchy problem', 'chain rule', 'change-of-basis matrix', 'changing order of integration', 'characteristic polynomial', 'chi square distribution', 'chromatic number', 'classification of geometric solids', 'classification of geometrical figures', 'collinearity', 'commuting matrices', 'complement of a set', 'complex numbers', 'complex plane ', 'composition of linear applications', 'conditional probability', 'confidence i

In [3]:
# Aufteilen der Daten in Training und Test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Trainingsdaten: {X_train.shape}, Testdaten: {X_test.shape}")


Trainingsdaten: (7636, 225), Testdaten: (1910, 225)


In [4]:
# Logistische Regression initialisieren und trainieren
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

print("Modell erfolgreich trainiert.")


Modell erfolgreich trainiert.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
# Vorhersagen auf Testdaten
y_pred = model.predict(X_test)

# Genauigkeit und Bericht
print("Modellbewertung:")
print(f"Genauigkeit: {accuracy_score(y_test, y_pred):.2f}")
print("\nKlassifikationsbericht:")
print(classification_report(y_test, y_pred))


Modellbewertung:
Genauigkeit: 0.57

Klassifikationsbericht:
              precision    recall  f1-score   support

           0       0.58      0.68      0.62      1002
           1       0.56      0.45      0.50       908

    accuracy                           0.57      1910
   macro avg       0.57      0.56      0.56      1910
weighted avg       0.57      0.57      0.56      1910



In [7]:
from imblearn.over_sampling import SMOTE

# SMOTE anwenden
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

print(f"Trainingsdaten vor SMOTE: {X_train.shape}, nach SMOTE: {X_resampled.shape}")

# Modell erneut trainieren
model.fit(X_resampled, y_resampled)
y_pred_resampled = model.predict(X_test)

# Neue Bewertung
from sklearn.metrics import classification_report
print("Nach Anwendung von SMOTE:")
print(classification_report(y_test, y_pred_resampled))


Trainingsdaten vor SMOTE: (7636, 225), nach SMOTE: (8148, 225)
Nach Anwendung von SMOTE:
              precision    recall  f1-score   support

           0       0.57      0.55      0.56      1002
           1       0.52      0.54      0.53       908

    accuracy                           0.54      1910
   macro avg       0.54      0.54      0.54      1910
weighted avg       0.54      0.54      0.54      1910



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
# Modell mit Gewichtung trainieren
model_weighted = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
model_weighted.fit(X_train, y_train)

# Bewertung
y_pred_weighted = model_weighted.predict(X_test)
print("Modell mit gewichteten Klassen:")
print(classification_report(y_test, y_pred_weighted))


Modell mit gewichteten Klassen:
              precision    recall  f1-score   support

           0       0.58      0.55      0.56      1002
           1       0.53      0.56      0.54       908

    accuracy                           0.55      1910
   macro avg       0.55      0.55      0.55      1910
weighted avg       0.55      0.55      0.55      1910



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Modell erneut trainieren
model.fit(X_train_scaled, y_train)
y_pred_scaled = model.predict(X_test_scaled)

print("Nach Skalierung:")
print(classification_report(y_test, y_pred_scaled))


Nach Skalierung:
              precision    recall  f1-score   support

           0       0.57      0.68      0.62      1002
           1       0.56      0.44      0.49       908

    accuracy                           0.57      1910
   macro avg       0.57      0.56      0.56      1910
weighted avg       0.57      0.57      0.56      1910

