In [1]:
import pandas as pd

data = pd.read_csv("embedded_dataset_electra.csv")
low = data[data["label"] == 1]
med = data[data["label"] == 2]
high = data[data["label"] == 3]

print(len(low))
print(len(med))
print(len(high))

452
1131
1728


In [2]:
import pandas as pd
from imblearn.over_sampling import SMOTE


# Drop non-numeric columns (like 'Student' and 'Teacher')
data=data.dropna(subset=['label'] + data.columns[:768].tolist())
X = data

# Convert column names to strings
X.columns = X.columns.astype(str)

# Target column
y = data['label']

# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Create DataFrame from resampled data
X_resampled_df = pd.DataFrame(X_resampled, columns=X.columns)
y_resampled_df = pd.DataFrame(y_resampled, columns=['Label'])

# Combine into one DataFrame
balanced_df = pd.concat([X_resampled_df, y_resampled_df], axis=1)

# Save to Excel
balanced_df.to_excel("final_student_embeddings_upsampled.xlsx", index=False)
print("Upsampled dataset saved successfully.")

Upsampled dataset saved successfully.


In [3]:
import pandas as pd

data = pd.read_excel("final_student_embeddings_upsampled.xlsx")
low = data[data["label"] == 1]
med = data[data["label"] == 2]
high = data[data["label"] == 3]

print(len(low))
print(len(med))
print(len(high))

1728
1728
1728


In [4]:
pip install lime

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[?25l     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/275.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[91m‚ï∏[0m[90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m204.8/275.7 kB[0m [31m5.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m275.7/275.7 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=41c759c8f69f95109b000f3825352ca699cd3dbb54dbacbc4bee33654f0d27d3
  Stored in directo

In [7]:
from sklearn.neighbors import NeighborhoodComponentsAnalysis
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
import pandas as pd
import numpy as np

# Optional: XGBoost
try:
    from xgboost import XGBClassifier
    xgb_available = True
except ImportError:
    xgb_available = False

# Load updated dataset
df = pd.read_excel("final_student_embeddings_upsampled.xlsx")

# Update this column name if needed
target_column = "label"

# Features and Labels
X = df.iloc[:, :768].values
y = df[target_column]
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# NCA
nca = NeighborhoodComponentsAnalysis(n_components=2, random_state=42)
X_nca = nca.fit_transform(X, y_encoded)

# Analyze top contributing original features to NCA components
nca_components = nca.components_  # Shape: (2, 768)
top_k = 10  # Top N contributing features to show

for i in range(2):
    component = nca_components[i]
    top_indices = np.argsort(np.abs(component))[::-1][:top_k]
    print(f"\nüîç Top {top_k} original features contributing to NCA Component {i + 1}:")
    for idx in top_indices:
        print(f"Feature {idx}: Weight = {component[idx]:.4f}")

# Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_nca)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42)

# Define parameter grids for hyperparameter tuning
param_grids = {
    "Logistic Regression": {
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['lbfgs', 'liblinear']
    },
    "SVM": {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf']
    },
    "Random Forest": {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 5]
    },
    "Naive Bayes": {},
    "Decision Tree": {
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 5]
    },
    "AdaBoost": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1.0]
    },
    "MLP": {
        'hidden_layer_sizes': [(50,), (100,), (100, 50)],
        'activation': ['relu', 'tanh'],
        'learning_rate': ['constant', 'adaptive']
    },
    "KNN": {
        'n_neighbors': list(range(1, 21)),
        'weights': ['uniform', 'distance']
    }
}

if xgb_available:
    param_grids["XGBoost"] = {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.3]
    }

# Base models
base_models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "MLP": MLPClassifier(max_iter=500, random_state=42),
    "KNN": KNeighborsClassifier()
}

if xgb_available:
    base_models["XGBoost"] = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

# Hyperparameter tuning
best_models = {}
for name, model in base_models.items():
    if param_grids[name]:
        print(f"Tuning {name}...")
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grids[name],
            cv=5,
            scoring='accuracy',
            n_jobs=-1
        )
        grid_search.fit(X_train, y_train)
        best_models[name] = grid_search.best_estimator_
        print(f"Best parameters for {name}: {grid_search.best_params_}")
    else:
        model.fit(X_train, y_train)
        best_models[name] = model
        print(f"No tuning required for {name}")

# Stacking classifier
stacking_estimators = [(name.lower().replace(" ", "_"), model) for name, model in best_models.items()]
stacking_clf = StackingClassifier(
    estimators=stacking_estimators,
    final_estimator=LogisticRegression(max_iter=1000, random_state=42),
    cv=5
)

# Add stacking
models = best_models.copy()
models["Stacking (All Models)"] = stacking_clf

# Evaluate
for name, model in models.items():
    print(f" {name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    train_acc = accuracy_score(y_train, model.predict(X_train))
    test_acc = accuracy_score(y_test, y_pred)

    print(f"Training Accuracy: {train_acc:.4f}")
    print(f"Testing Accuracy:  {test_acc:.4f}")
    print("Classification Report:")
    target_names = [str(label) for label in label_encoder.classes_]
    print(classification_report(y_test, y_pred, target_names=target_names))
    print("-" * 60)


from lime.lime_tabular import LimeTabularExplainer
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt  # <-- Add this import

# Prepare the test set in original feature space
_, X_test_original, _, y_test_original = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)
instance_original = X_test_original[0]

# Set up LIME explainer for original features
explainer = LimeTabularExplainer(
    training_data=X,
    feature_names=[f"emb_{i}" for i in range(X.shape[1])],
    class_names=[str(label) for label in label_encoder.classes_],
    mode='classification',
    random_state=42,
    discretize_continuous=False
)

print("\nLIME Explanations for Test Instance (First Sample, original features):")
print(f"Instance values: {instance_original}")
print("Predicted class for each model:")

for name, model in models.items():
    # Skip stacking for LIME unless you want advanced handling
    if name == "Stacking (All Models)":
        print(f"{name}: Skipped LIME explanation (complex pipeline)")
        continue

    # Build pipeline: StandardScaler -> NCA -> Model (for LIME only)
    lime_pipeline = make_pipeline(
        StandardScaler(),
        NeighborhoodComponentsAnalysis(n_components=2, random_state=42),
        model
    )
    # Fit pipeline on ALL original data (NCA is supervised)
    lime_pipeline.fit(X, y_encoded)

    pred = lime_pipeline.predict([instance_original])[0]
    print(f"{name}: {label_encoder.inverse_transform([pred])[0]}")

    print(f"\nExplaining predictions for {name}...")
    try:
        explanation = explainer.explain_instance(
            data_row=instance_original,
            predict_fn=lime_pipeline.predict_proba,
            num_features=10,  # Show top 10 original features
            num_samples=5000
        )
        # Save as HTML
        explanation.save_to_file(f"lime_explanation_{name.replace(' ', '_')}_origfeatures.html")
        # Save as image
        fig = explanation.as_pyplot_figure()
        plt.tight_layout()
        fig.savefig(f"lime_explanation_{name.replace(' ', '_')}_origfeatures.png")
        plt.close(fig)
    except Exception as e:
        print(f"Failed to generate explanation for {name}: {str(e)}")
    print("-" * 60)





üîç Top 10 original features contributing to NCA Component 1:
Feature 241: Weight = -10.0830
Feature 746: Weight = 8.0443
Feature 331: Weight = 7.7800
Feature 270: Weight = 7.5209
Feature 682: Weight = -7.3500
Feature 530: Weight = -5.0457
Feature 183: Weight = -4.7885
Feature 461: Weight = 4.6059
Feature 469: Weight = 4.5972
Feature 101: Weight = 4.3144

üîç Top 10 original features contributing to NCA Component 2:
Feature 270: Weight = -14.6004
Feature 720: Weight = 13.5697
Feature 309: Weight = 6.8455
Feature 341: Weight = 5.6401
Feature 408: Weight = -5.2257
Feature 340: Weight = -4.8985
Feature 688: Weight = 4.5241
Feature 331: Weight = -4.3955
Feature 631: Weight = 4.1551
Feature 143: Weight = -4.1353
Tuning Logistic Regression...
Best parameters for Logistic Regression: {'C': 0.01, 'solver': 'lbfgs'}
Tuning SVM...
Best parameters for SVM: {'C': 10, 'kernel': 'rbf'}
Tuning Random Forest...
Best parameters for Random Forest: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_

Parameters: { "use_label_encoder" } are not used.



Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
 Logistic Regression
Training Accuracy: 0.7982
Testing Accuracy:  0.7994
Classification Report:
              precision    recall  f1-score   support

           1       0.81      0.98      0.89       331
           2       0.78      0.76      0.77       344
           3       0.80      0.68      0.73       362

    accuracy                           0.80      1037
   macro avg       0.80      0.80      0.80      1037
weighted avg       0.80      0.80      0.79      1037

------------------------------------------------------------
 SVM
Training Accuracy: 0.8286
Testing Accuracy:  0.8293
Classification Report:
              precision    recall  f1-score   support

           1       0.88      0.97      0.93       331
           2       0.81      0.76      0.79       344
           3       0.79      0.77      0.78       362

    accuracy                           0.83      1037
   macro avg       0.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Training Accuracy: 0.8367
Testing Accuracy:  0.8293
Classification Report:
              precision    recall  f1-score   support

           1       0.89      0.96      0.93       331
           2       0.81      0.77      0.79       344
           3       0.79      0.76      0.77       362

    accuracy                           0.83      1037
   macro avg       0.83      0.83      0.83      1037
weighted avg       0.83      0.83      0.83      1037

------------------------------------------------------------

LIME Explanations for Test Instance (First Sample, original features):
Instance values: [ 2.12890029e-01 -1.34479682e-01 -1.32764606e-01 -2.02796557e-01
  3.39168891e-01  9.58515781e-01  7.26972669e-01 -2.66439228e-01
 -7.94369113e-02  1.91280595e-01  5.00898907e-01  2.97939238e-01
  6.35831203e-01 -9.05370749e-02  1.79786143e-01 -1.03639507e-01
 -8.07236202e-02  5.79284901e-01 -2.47611227e-01 -2.53925574e-01
 -2.52160855e-01 -4.44903599e-01 -7.00799172e-01 -3.93962820e-01
  7.

Parameters: { "use_label_encoder" } are not used.



XGBoost: 2

Explaining predictions for XGBoost...
------------------------------------------------------------
Stacking (All Models): Skipped LIME explanation (complex pipeline)
