In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib
import warnings
# Silence all warnings from this point on (no category or module filter is passed).
# NOTE(review): this also hides deprecation/convergence warnings raised by the
# libraries imported above — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [11]:
# Read the raw dataset into a DataFrame.
# The path is relative, so the CSV is resolved against the kernel's working directory.
dataset_path = 'performance_prediction_dataset.csv'
df = pd.read_csv(dataset_path)

In [12]:
# Separate the prediction target from the predictors.
# 'Label' is the target column; every other column of df is kept as a feature.
# (The original note lists the features as Cognitive, Interpersonal, Verbal and
# Analytical — that cannot be verified from this cell; confirm against the CSV.)
y = df['Label']
X = df.drop('Label', axis='columns')

In [13]:
# Split the data into training and testing sets (73% train, 27% test, per test_size=0.27).
# random_state=42 pins the shuffle so the same split is produced on every run.
# NOTE(review): no stratify= argument is passed, so class proportions are not
# guaranteed to match between the two sets — consider stratify=y if classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.27, random_state=42)


In [14]:

def _fit_and_report(model, label, X_fit, y_fit, X_eval, y_eval):
    """Fit one classifier, evaluate it on held-out data, and print its metrics.

    Parameters
    ----------
    model : estimator
        Estimator exposing ``fit`` and ``predict``; it is fitted in place.
    label : str
        Display name used in the printed accuracy line and report header.
    X_fit, y_fit
        Features and labels the model is fitted on.
    X_eval, y_eval
        Features and labels the fitted model is evaluated on.

    Returns
    -------
    tuple
        ``(predictions, accuracy)``: the result of ``model.predict(X_eval)`` and the
        ``accuracy_score`` of those predictions against ``y_eval``.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    accuracy = accuracy_score(y_eval, predictions)
    print(f"{label} Accuracy: {accuracy * 100:.2f}%")
    print(f"\n{label} Classification Report:")
    print(classification_report(y_eval, predictions))
    return predictions, accuracy


# 1. Feature ranking using Random Forest
# NOTE(review): random_state=41 here differs from the 42 used for the split and the
# other models — confirm this is intentional before changing it (it affects results).
rf = RandomForestClassifier(n_estimators=100, random_state=41)
rf.fit(X_train, y_train)

# Impurity-based importances, one value per column of X_train.
importances = rf.feature_importances_

# Column names ordered from most to least important.
# NOTE: this is a permutation of *all* columns — no feature is dropped, so the
# "_selected" frames below contain the same data as X_train / X_test with the
# columns reordered. Slice top_features (e.g. top_features[:k]) to actually select.
sorted_indices = np.argsort(importances)[::-1]
top_features = X.columns[sorted_indices]

# Apply the same column order to both splits.
X_train_selected = X_train[top_features]
X_test_selected = X_test[top_features]

# The forest itself is scored on the test split in its original column order,
# which is the order it was fitted on.
accuracy_rf = rf.score(X_test, y_test)

# Sanity check; holds by construction because both frames are indexed by top_features.
assert list(X_train_selected.columns) == list(X_test_selected.columns), "Feature columns do not match!"

# 2. Train and evaluate SVM (linear kernel) on the reordered features
svm = SVC(kernel='linear', random_state=42)
y_pred_svm, accuracy_svm = _fit_and_report(
    svm, "SVM", X_train_selected, y_train, X_test_selected, y_test
)

# 3. Train and evaluate KNN (k=5) on the reordered features
knn = KNeighborsClassifier(n_neighbors=5)
y_pred_knn, accuracy_knn = _fit_and_report(
    knn, "KNN", X_train_selected, y_train, X_test_selected, y_test
)

# 4. Train and evaluate a Decision Tree; max_depth=5 caps tree depth to limit overfitting
dt = DecisionTreeClassifier(random_state=42, max_depth=5)
y_pred_dt, accuracy_dt = _fit_and_report(
    dt, "Decision Tree", X_train_selected, y_train, X_test_selected, y_test
)

# Save all fitted models next to the notebook (paths are relative to the working directory)
for fitted_model, model_path in (
    (rf, 'random_forest_model.pkl'),
    (svm, 'svm_model.pkl'),
    (knn, 'knn_model.pkl'),
    (dt, 'decision_tree_model.pkl'),
):
    joblib.dump(fitted_model, model_path)
print("\nModels saved successfully as 'random_forest_model.pkl', 'svm_model.pkl', 'knn_model.pkl', and 'decision_tree_model.pkl'")

# Print accuracy comparison (test-set accuracy for every model, Random Forest first)
print("\nAccuracy Comparison:")
for model_label, model_accuracy in (
    ("Random Forest", accuracy_rf),
    ("SVM", accuracy_svm),
    ("KNN", accuracy_knn),
    ("Decision Tree", accuracy_dt),
):
    print(f"{model_label} Accuracy: {model_accuracy * 100:.2f}%")

SVM Accuracy: 99.63%

SVM Classification Report:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94         8
           1       1.00      1.00      1.00       130
           2       1.00      1.00      1.00        74
           3       1.00      0.67      0.80         3
           4       1.00      1.00      1.00        35
           5       1.00      1.00      1.00        20

    accuracy                           1.00       270
   macro avg       0.98      0.94      0.96       270
weighted avg       1.00      1.00      1.00       270

KNN Accuracy: 96.67%

KNN Classification Report:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94         8
           1       0.96      0.98      0.97       130
           2       0.99      0.95      0.97        74
           3       1.00      0.67      0.80         3
           4       0.95      1.00      0.97        35
           5       1.00      0.90 