In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score, accuracy_score

# -------------------------------
# Data Loading and Preparation
# -------------------------------
file_path = r'C:\Users\emexi\Desktop\ΔΙΚΑ ΜΟΥ\Aviation_Project\Preprocessed_file.csv'
data = pd.read_csv(file_path, encoding='latin1')

# Survey questions (target columns) grouped by the category they belong to.
# The dict preserves insertion order, so iterating it yields SA_1 ... SA_34R.
categories = {
    "Understanding of job roles": [
        "SA_1_understanding_job_roles",
        "SA_2_understanding_job_roles",
        "SA_3_understanding_job_roles",
        "SA_4_understanding_job_roles",
        "SA_5_understanding_job_roles",
    ],
    "Teamwork in emergencies": [
        "SA_6_teamwork_in_emergencies",
        "SA_7_teamwork_in_emergencies",
        "SA_8_teamwork_in_emergencies",
        "SA_9_teamwork_in_emergencies",
        "SA_10_teamwork_in_emergencies",
    ],
    "Overall importance of effective teamwork and communication": [
        "SA_11_teamwork_and_communication",
        "SA_12_teamwork_and_communication",
        "SA_13_teamwork_and_communication",
        "SA_14_teamwork_and_communication",
        "SA_15_teamwork_and_communication",
        "SA_16R_teamwork_and_communication",
        "SA_17_teamwork_and_communication",
        "SA_18_teamwork_and_communication",
        "SA_19_teamwork_and_communication",
        "SA_20_teamwork_and_communication",
        "SA_21_teamwork_and_communication",
        "SA_22R_teamwork_and_communication",
        "SA_23_teamwork_and_communication",
        "SA_24_teamwork_and_communication",
        "SA_25_teamwork_and_communication",
    ],
    "Importance of clear communication within the maintenance team": [
        "SA_26_clear_communication_within_team",
        "SA_27_clear_communication_within_team",
        "SA_28_clear_communication_within_team",
        "SA_29_clear_communication_within_team",
        "SA_30_clear_communication_within_team",
        "SA_31_clear_communication_within_team",
        "SA_32_clear_communication_within_team",
        "SA_33_clear_communication_within_team",
        "SA_34R_clear_communication_within_team",
    ],
}

# Flat list of every target column, in category order (one source of truth:
# the grouping above).
target_columns = [
    question
    for group_questions in categories.values()
    for question in group_questions
]

# Targets are the survey answers; everything else in the file is a predictor.
targets = data[target_columns]
input_features = data.drop(columns=target_columns)

# Standardize the predictors (fitted on the full data set).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(input_features)

# --------------------------------------------
# Function: Feature Importance using LinearSVC (SVM)
# --------------------------------------------
def get_important_features_svm(input_features, targets, num_features):
    """Return the names of the top-ranked features by LinearSVC coefficient size.

    One LinearSVC is fitted per target column on an 80 % split of the
    standardized inputs. A feature's importance for a target is the mean
    absolute coefficient across that model's classes; the final score is the
    average of those importances over all targets.

    Args:
        input_features: DataFrame of (unscaled) predictor columns. Its column
            names are the values that get returned.
        targets: DataFrame with one classification target per column, row
            aligned with ``input_features``.
        num_features: Number of top-ranked feature names to return.

    Returns:
        list: Up to ``num_features`` column names, most important first.
    """
    # Standardize the frame that was actually passed in. Previously the
    # function ignored this argument's data and trained on the module-level
    # X_scaled, so it only worked for the one global data set.
    X = StandardScaler().fit_transform(input_features)

    feature_importances = {}
    for target in targets.columns:
        # Train on a fixed 80 % split; the held-out 20 % is not used here.
        X_train_tmp, _, y_train_tmp, _ = train_test_split(
            X, targets[target], test_size=0.2, random_state=42
        )
        lsvc = LinearSVC(random_state=42, dual=False, max_iter=10000)
        lsvc.fit(X_train_tmp, y_train_tmp)
        # coef_ holds one row of weights per class (a single row for a binary
        # target); collapse the rows into one magnitude per feature.
        feature_importances[target] = np.mean(np.abs(lsvc.coef_), axis=0)

    # Rows are features, columns are targets: average across targets.
    mean_importance = pd.DataFrame(feature_importances).mean(axis=1)
    ranking = pd.DataFrame({
        'Feature': input_features.columns,
        'Importance': mean_importance
    }).sort_values(by='Importance', ascending=False)

    return ranking.head(num_features)['Feature'].tolist()

# -------------------------------------------------------
# Function: Evaluate Model using 5-Fold Cross Validation with SVM
# -------------------------------------------------------
def evaluate_features_cv(num_features, cv=5):
    """Cross-validate a linear-kernel multi-output SVC on the top features.

    Works on the module-level ``input_features``, ``targets`` and ``X_scaled``.
    The ``num_features`` best features are chosen with
    ``get_important_features_svm`` and the model is scored with shuffled
    K-fold cross validation.

    Args:
        num_features: How many top-ranked features to keep.
        cv: Number of folds.

    Returns:
        tuple: ``(mean weighted F1, mean accuracy, selected feature names)``.
        Scores are averaged over target columns within each fold, then over
        folds.
    """
    # NOTE(review): the ranking is computed once from the whole data set, so
    # rows that land in a test fold have also influenced feature selection.
    selected_features = get_important_features_svm(input_features, targets, num_features)
    scaled_frame = pd.DataFrame(X_scaled, columns=input_features.columns)
    X_selected = scaled_frame[selected_features]

    target_names = list(targets.columns)
    splitter = KFold(n_splits=cv, shuffle=True, random_state=42)
    f1_per_fold = []
    accuracy_per_fold = []

    for train_rows, test_rows in splitter.split(X_selected):
        # One independent linear SVC per target column.
        model = MultiOutputClassifier(SVC(kernel="linear", random_state=42))
        model.fit(X_selected.iloc[train_rows], targets.iloc[train_rows])

        y_true = targets.iloc[test_rows]
        y_hat = model.predict(X_selected.iloc[test_rows])

        # Column position in the prediction matrix matches targets.columns.
        column_f1 = [
            f1_score(y_true[name], y_hat[:, pos], average='weighted', zero_division=0)
            for pos, name in enumerate(target_names)
        ]
        column_accuracy = [
            accuracy_score(y_true[name], y_hat[:, pos])
            for pos, name in enumerate(target_names)
        ]
        f1_per_fold.append(np.mean(column_f1))
        accuracy_per_fold.append(np.mean(column_accuracy))

    return np.mean(f1_per_fold), np.mean(accuracy_per_fold), selected_features

# ---------------------------------------------------------
# Loop Over a Range of Feature Counts to Find the Best Value
# ---------------------------------------------------------
def _category_scores(y_true, y_pred, target_names, question_groups):
    """Average per-question accuracy and weighted F1 inside each category.

    Args:
        y_true: DataFrame of true labels, one column per target.
        y_pred: 2-D array of predictions whose columns follow ``target_names``.
        target_names: Target column names in prediction-column order.
        question_groups: Mapping of category name -> list of target names.

    Returns:
        dict: category -> ``(mean accuracy, mean weighted F1)``, or ``None``
        for a category none of whose questions are in ``target_names``.
    """
    position = {name: pos for pos, name in enumerate(target_names)}
    scores = {}
    for category, questions in question_groups.items():
        present = [q for q in questions if q in position]
        if not present:
            scores[category] = None
            continue
        accuracies = [
            accuracy_score(y_true[q], y_pred[:, position[q]]) for q in present
        ]
        f1_values = [
            f1_score(y_true[q], y_pred[:, position[q]], average='weighted', zero_division=0)
            for q in present
        ]
        scores[category] = (np.mean(accuracies), np.mean(f1_values))
    return scores


best_num = None
best_f1 = -1
best_accuracy = -1
best_features = None
results = {}

# Try feature counts from 10 up to the total number of input features, in steps of 10
total_features = input_features.shape[1]
if total_features == 0:
    raise ValueError("No input features left after dropping the target columns.")
feature_counts = list(range(10, total_features + 1, 10))
if not feature_counts:
    # Fewer than 10 predictors: the range above is empty, which would leave
    # best_features as None and crash below. Evaluate the full set instead.
    feature_counts = [total_features]

for num in feature_counts:
    print(f"Evaluating model with {num} features using 5-fold CV...")
    current_f1, current_acc, current_features = evaluate_features_cv(num, cv=5)
    results[num] = {"f1": current_f1, "accuracy": current_acc}
    print(f"  Average weighted F1 Score: {current_f1:.4f}")
    print(f"  Average Accuracy: {current_acc:.4f}")

    # Select best feature count based on F1 score
    if current_f1 > best_f1:
        best_f1 = current_f1
        best_accuracy = current_acc
        best_num = num
        best_features = current_features

print("\nOptimization complete.")
print(f"Best number of features: {best_num}")
print(f"Best CV Average weighted F1 Score: {best_f1:.4f}")
print(f"Best CV Average Accuracy: {best_accuracy:.4f}")
print("Selected Features:", best_features)

# ---------------------------------------------------------
# Final Model Training with the Optimal Number of Features
# ---------------------------------------------------------
print("\nTraining final model using the best number of features...")

X_selected_final = pd.DataFrame(X_scaled, columns=input_features.columns)[best_features]
target_names = list(targets.columns)

# Held-out test evaluation using a fixed 80/20 split
X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(
    X_selected_final, targets, test_size=0.2, random_state=42
)

# NOTE(review): the feature count above was tuned with kernel="linear" in
# evaluate_features_cv, while the final model uses "rbf" - confirm intended.
final_svm_params = {"kernel": "rbf", "random_state": 42}
svm_final = SVC(**final_svm_params)
multi_target_final = MultiOutputClassifier(svm_final)
multi_target_final.fit(X_train_final, y_train_final)
y_pred_final = multi_target_final.predict(X_test_final)

print("\nFinal Evaluation on Held-out Test Set (Single Split):")
holdout_scores = _category_scores(y_test_final, y_pred_final, target_names, categories)
for category, score in holdout_scores.items():
    if score is None:
        print(f"Category: {category} - No Data Available")
        continue
    print(f"Category: {category}")
    print(f"  Average Accuracy: {score[0]:.4f}")
    print(f"  Average F1 Score: {score[1]:.4f}")

# ---------------------------------------------------------
# Final Evaluation using 5-Fold Cross Validation (Category-wise)
# ---------------------------------------------------------
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_results = {cat: {"acc": [], "f1": []} for cat in categories}

for train_index, test_index in kf.split(X_selected_final):
    X_train_cv, X_test_cv = X_selected_final.iloc[train_index], X_selected_final.iloc[test_index]
    y_train_cv, y_test_cv = targets.iloc[train_index], targets.iloc[test_index]

    # Fit a fresh model per fold. Refitting multi_target_final here would
    # overwrite the final model trained on the 80 % split with one trained
    # on whatever the last fold happened to be.
    fold_model = MultiOutputClassifier(SVC(**final_svm_params))
    fold_model.fit(X_train_cv, y_train_cv)
    y_pred_cv = fold_model.predict(X_test_cv)

    fold_scores = _category_scores(y_test_cv, y_pred_cv, target_names, categories)
    for cat, score in fold_scores.items():
        if score is None:
            continue
        cv_results[cat]["acc"].append(score[0])
        cv_results[cat]["f1"].append(score[1])

print("\nFinal Evaluation using 5-Fold CV (Category-wise):")
for cat, per_fold in cv_results.items():
    if not (per_fold["acc"] and per_fold["f1"]):
        # Same wording as the hold-out report instead of averaging an empty list.
        print(f"Category: {cat} - No Data Available")
        continue
    final_cat_acc = np.mean(per_fold["acc"])
    final_cat_f1 = np.mean(per_fold["f1"])
    print(f"Category: {cat}")
    print(f"  Average Accuracy: {final_cat_acc:.4f}")
    print(f"  Average F1 Score: {final_cat_f1:.4f}")


Evaluating model with 10 features using 5-fold CV...
  Average weighted F1 Score: 0.4288
  Average Accuracy: 0.5044
Evaluating model with 20 features using 5-fold CV...
  Average weighted F1 Score: 0.4401
  Average Accuracy: 0.4784
Evaluating model with 30 features using 5-fold CV...
  Average weighted F1 Score: 0.4295
  Average Accuracy: 0.4421
Evaluating model with 40 features using 5-fold CV...
  Average weighted F1 Score: 0.4291
  Average Accuracy: 0.4279
Evaluating model with 50 features using 5-fold CV...
  Average weighted F1 Score: 0.4176
  Average Accuracy: 0.4106

Optimization complete.
Best number of features: 20
Best CV Average weighted F1 Score: 0.4401
Best CV Average Accuracy: 0.4784
Selected Features: ['Type_aviation_maintenance_license_Civil', 'Type_aviation_maintenance_license_Military', 'Total_Years_of_experience_encoded', 'Type_aviation_maintenance_license_Civil and Military', 'PS_Extraversion_31R', 'Off_aircraft', 'PS_Extraversion_16', 'PS_Openness_25', 'PS_Conscien