# In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings('ignore')

# Import our preparation module
from hr_analytics_preparation import prepare_data

# Read the raw HR records and pass them through the project's preparation
# helper, which hands back the train/test features and labels together with a
# preprocessor object (presumably the transformer used on the features --
# verify against hr_analytics_preparation.prepare_data).
df = pd.read_csv('HR_data.csv')
X_train, X_test, y_train, y_test, preprocessor = prepare_data(df)

# Registry of candidate classifiers, keyed by display name.
# Insertion order is the order in which the models are later evaluated.
# Every estimator that accepts a seed is pinned to 42 for reproducibility.
algorithms = {}
algorithms['Logistic Regression'] = LogisticRegression(
    random_state=42, max_iter=1000
)
algorithms['SGD Classifier'] = SGDClassifier(
    random_state=42, max_iter=1000, tol=1e-3
)
algorithms['Random Forest'] = RandomForestClassifier(random_state=42)
algorithms['Support Vector Machine'] = SVC(random_state=42)
algorithms['Gradient Boosting'] = GradientBoostingClassifier(random_state=42)
algorithms['K-Nearest Neighbors'] = KNeighborsClassifier()

# Evaluate each algorithm using cross-validation.
# One shuffled, stratified 5-fold splitter with a fixed seed is shared by all
# candidates so that their cross-validation scores are comparable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Maps algorithm display name -> dict of metric name -> value.
results = {}

# Report the real number of candidates rather than a hard-coded count, so the
# banner stays correct when `algorithms` is edited.
print(f"Investigating {len(algorithms)} Algorithms...")
print("="*50)

for name, model in algorithms.items():
    print(f"\nEvaluating {name}...")

    # Cross-validated F1 computed on the training data only.
    # NOTE(review): scoring='f1' and the default precision/recall/F1 settings
    # below assume a binary target -- confirm against prepare_data's output.
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='f1')
    cv_f1_mean = cv_scores.mean()
    cv_f1_std = cv_scores.std()

    # Refit on the full training set and score the held-out test set to get
    # the additional metrics stored alongside the cross-validation summary.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    results[name] = {
        'CV F1 Mean': cv_f1_mean,
        'CV F1 Std': cv_f1_std,
        'Test Accuracy': accuracy_score(y_test, y_pred),
        'Test Precision': precision_score(y_test, y_pred),
        'Test Recall': recall_score(y_test, y_pred),
        'Test F1': f1_score(y_test, y_pred)
    }

    # The spread is printed as two standard deviations around the mean.
    print(f"CV F1: {cv_f1_mean:.4f} (+/- {cv_f1_std * 2:.4f})")
    print(f"Test F1: {results[name]['Test F1']:.4f}")

# Build the comparison table: one row per algorithm, one column per metric,
# ordered from the highest to the lowest mean cross-validated F1.
results_df = (
    pd.DataFrame(results)
    .transpose()
    .sort_values(by='CV F1 Mean', ascending=False)
)

# Ranking banner followed by the headline metrics only.
print(
    "\n" + "="*50,
    "ALGORITHM RANKING (by CV F1 Score):",
    "="*50,
    sep="\n",
)
print(results_df.loc[:, ['CV F1 Mean', 'CV F1 Std', 'Test F1', 'Test Accuracy']])

# The two best-ranked algorithm names (the first two index labels after sorting).
top_2 = list(results_df.index[:2])
print(f"\nTop 2 algorithms selected for detailed training: {top_2}")

# Persist the full metrics table, including the columns not printed above.
results_df.to_csv('algorithm_investigation_results.csv')
print("\nResults saved to 'algorithm_investigation_results.csv'")

# Cell output when run: ModuleNotFoundError: No module named 'hr_analytics_preparation'