 Use cross-validation techniques (RandomizedSearchCV()) technique to tune the hyperparameters for your models. 

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

# Load Data
file_path = "student_embeddings.xlsx"
df = pd.read_excel(file_path, sheet_name='Sheet1')

# Clean Data
df_cleaned = df.drop(columns=['Student'])  # Drop 'Student' column
target_col = df_cleaned.columns[-1]  # Last column as target

# Handle NaN values in target column
df_cleaned[target_col] = pd.to_numeric(df_cleaned[target_col], errors='coerce')
df_cleaned = df_cleaned.dropna(subset=[target_col])

# Convert target column to integer
df_cleaned[target_col] = df_cleaned[target_col].astype(int)

# Define Features (X) and Target (y)
X = df_cleaned.drop(columns=[target_col])
y = df_cleaned[target_col]

# Normalize target variable (to start from 0)
y -= y.min()

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale Features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define Models & Hyperparameters
models = {
    "SVM": (SVC(), {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}),
    "Decision Tree": (DecisionTreeClassifier(), {'max_depth': [3, 5, 10]}),
    "Random Forest": (RandomForestClassifier(), {'n_estimators': [50, 100, 200]}),
    "AdaBoost": (AdaBoostClassifier(), {'n_estimators': [50, 100, 200]}),
    "XGBoost": (XGBClassifier(eval_metric='mlogloss'), {'n_estimators': [50, 100, 200]}),
    "Naive Bayes": (GaussianNB(), {}),  # No hyperparameters to tune
    "MLP": (MLPClassifier(max_iter=500), {'hidden_layer_sizes': [(50,), (100,)], 'alpha': [0.0001, 0.001]})
}

# Hyperparameter Tuning using RandomizedSearchCV
best_models = {}
for name, (model, param_grid) in models.items():
    if not param_grid:
        best_models[name] = model
        print(f"Skipping hyperparameter tuning for {name} (no parameters to tune).")
        continue

    n_iter = min(5, len(list(param_grid.values())[0]))  # Limit search iterations

    clf = RandomizedSearchCV(model, param_grid, cv=3, n_iter=n_iter, scoring='accuracy', random_state=42, n_jobs=-1)
    clf.fit(X_train_scaled, y_train)
    
    best_models[name] = clf.best_estimator_
    print(f"Best parameters for {name}: {clf.best_params_}")


Best parameters for SVM: {'kernel': 'linear', 'C': 0.1}
Best parameters for Decision Tree: {'max_depth': 10}
Best parameters for Random Forest: {'n_estimators': 50}




Best parameters for AdaBoost: {'n_estimators': 50}
Best parameters for XGBoost: {'n_estimators': 50}
Skipping hyperparameter tuning for Naive Bayes (no parameters to tune).
Best parameters for MLP: {'hidden_layer_sizes': (100,), 'alpha': 0.001}


For projects dealing with classification problem, employ various other classifiers such as Support Vector Machines, Decision Tree, RandomForest, CatBoost, AdaBoost, XGBoost, Naïve-Bayes & MLP. Tabulate your results for your problem using different performance metrics. Your tabulated results should compare between train and test results and make appropriate observations.

In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# Load dataset
file_path = "student_embeddings.xlsx"
df = pd.read_excel(file_path, sheet_name='Sheet1')

# Data Preprocessing
df_cleaned = df.drop(columns=['Student'], errors='ignore')  # Drop non-numeric column if exists

target_column = df_cleaned.columns[-1]  # Identify last column as target

# Convert target column to numeric and drop invalid rows
df_cleaned[target_column] = pd.to_numeric(df_cleaned[target_column], errors='coerce')
df_cleaned = df_cleaned.dropna(subset=[target_column])

# Define Features & Target
X = df_cleaned.drop(columns=[target_column])
y = df_cleaned[target_column].astype(int)

# Normalize target variable
y -= y.min()

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize Features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define Models
models = {
    "SVM": SVC(kernel='rbf', C=1),
    "Decision Tree": DecisionTreeClassifier(max_depth=5),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "AdaBoost": AdaBoostClassifier(n_estimators=50),
    "XGBoost": XGBClassifier(eval_metric='mlogloss', n_estimators=100),
    "Naive Bayes": GaussianNB(),
    "MLP": MLPClassifier(hidden_layer_sizes=(100,), alpha=0.001, max_iter=500)
}

# Train Models & Evaluate
results = []
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

    train_acc = accuracy_score(y_train, y_train_pred)
    test_acc = accuracy_score(y_test, y_test_pred)
    precision = precision_score(y_test, y_test_pred, average='weighted', zero_division=1)
    recall = recall_score(y_test, y_test_pred, average='weighted', zero_division=1)
    f1 = f1_score(y_test, y_test_pred, average='weighted', zero_division=1)

    results.append([name, train_acc, test_acc, precision, recall, f1])
    print(f"\n{name} Classification Report (Test Data):")
    print(classification_report(y_test, y_test_pred, zero_division=1))

# Create Performance Summary DataFrame
results_df = pd.DataFrame(results, columns=["Model", "Train Accuracy", "Test Accuracy", "Precision", "Recall", "F1 Score"])
print("\nModel Performance Comparison:")
print(results_df)



SVM Classification Report (Test Data):
              precision    recall  f1-score   support

           0       1.00      0.00      0.00         1
           1       0.50      1.00      0.67         3
           2       1.00      0.00      0.00         2

    accuracy                           0.50         6
   macro avg       0.83      0.33      0.22         6
weighted avg       0.75      0.50      0.33         6


Decision Tree Classification Report (Test Data):
              precision    recall  f1-score   support

           0       0.33      1.00      0.50         1
           1       1.00      0.67      0.80         3
           2       1.00      0.50      0.67         2

    accuracy                           0.67         6
   macro avg       0.78      0.72      0.66         6
weighted avg       0.89      0.67      0.71         6


Random Forest Classification Report (Test Data):
              precision    recall  f1-score   support

           0       1.00      0.00      0.00




AdaBoost Classification Report (Test Data):
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.33      0.33      0.33         3
           2       0.00      0.00      0.00         2

    accuracy                           0.17         6
   macro avg       0.11      0.11      0.11         6
weighted avg       0.17      0.17      0.17         6


XGBoost Classification Report (Test Data):
              precision    recall  f1-score   support

           0       1.00      0.00      0.00         1
           1       0.60      1.00      0.75         3
           2       1.00      0.50      0.67         2

    accuracy                           0.67         6
   macro avg       0.87      0.50      0.47         6
weighted avg       0.80      0.67      0.60         6


Naive Bayes Classification Report (Test Data):
              precision    recall  f1-score   support

           0       1.00      0.00      0.00   