# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from scipy.stats import zscore
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import xgboost as xgb
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier

# Load the dataset
data = pd.read_csv("your_dataset.csv")  # Replace with your actual dataset file

# Remove outlier rows using Z-score.
# All numeric feature columns are screened (not just int64 ones), and the
# target column is deliberately left out: if "Attrition" were a numeric 0/1
# label with a positive rate below roughly 10%, every positive row would have
# |z| > 3 and the whole minority class would be discarded as "outliers".
numeric_features = data.drop(columns="Attrition").select_dtypes(include="number")
z_scores = zscore(numeric_features)
abs_z_scores = abs(z_scores)
# A row is dropped when any of its numeric features lies more than three
# standard deviations from that column's mean.
outlier_indices = (abs_z_scores > 3).any(axis=1)
data_no_outliers = data[~outlier_indices]

# Preprocessing: separate the features from the target column
X = data_no_outliers.drop("Attrition", axis=1)
y = data_no_outliers["Attrition"]

# Encode a non-numeric target (e.g. string class names) as integers so that
# estimators and metrics further down all work with the same numeric labels.
# The index is preserved so y stays aligned with X.
if not pd.api.types.is_numeric_dtype(y):
    y = pd.Series(LabelEncoder().fit_transform(y), index=y.index, name=y.name)

# Encode categorical variables: each object-typed column is replaced by
# integer codes from its own LabelEncoder.
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

# Split the data into training and testing sets (80% / 20%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features; the scaler is fitted on the training split
# only and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate classifiers, each described by a
# (display name, unfitted estimator, hyperparameter grid) triple.
# An empty grid marks a model that has no hyperparameters listed for tuning.
models = [
    (
        "Logistic Regression",
        LogisticRegression(max_iter=1000),
        dict(),
    ),
    (
        "Random Forest",
        RandomForestClassifier(),
        dict(n_estimators=[100, 200, 300], max_depth=[None, 10, 20]),
    ),
    (
        "Gradient Boosting",
        GradientBoostingClassifier(),
        dict(n_estimators=[100, 200, 300], max_depth=[3, 4, 5]),
    ),
    (
        "SVM",
        SVC(),
        dict(C=[0.1, 1, 10], kernel=["linear", "rbf"]),
    ),
    (
        "K-Nearest Neighbors",
        KNeighborsClassifier(),
        dict(n_neighbors=[3, 5, 7]),
    ),
    (
        "Decision Tree",
        DecisionTreeClassifier(),
        dict(max_depth=[None, 5, 10]),
    ),
    (
        "Naive Bayes",
        GaussianNB(),
        dict(),
    ),
    (
        "XGBoost",
        xgb.XGBClassifier(),
        dict(n_estimators=[100, 200, 300], max_depth=[3, 4, 5]),
    ),
]

# Factory for the feed-forward neural network classifier
def create_nn_model():
    """Build and compile a small fully connected binary classifier.

    The network has two ReLU hidden layers (64 and 32 units) followed by a
    single sigmoid output unit. Its input width is taken from the number of
    columns in the module-level ``X_train_scaled`` array.

    Returns:
        A compiled Keras ``Sequential`` model using binary cross-entropy
        loss, the Adam optimizer, and accuracy as the tracked metric.
    """
    n_features = X_train_scaled.shape[1]
    network = Sequential(
        [
            Dense(64, activation="relu", input_dim=n_features),
            Dense(32, activation="relu"),
            Dense(1, activation="sigmoid"),
        ]
    )
    network.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return network

# Add the neural network to the list of models
models.append(("Neural Network", KerasClassifier(build_fn=create_nn_model, epochs=10, batch_size=16), {}))


def _predict_labels(estimator, features):
    """Return flat class predictions from ``estimator`` for ``features``.

    Predictions are flattened to one dimension. Only floating-point output
    (i.e. probability-like scores) is thresholded at 0.5; predictions that
    are already class labels are returned untouched, so integer labels other
    than 0/1 and string labels are not corrupted by the threshold.
    """
    flat = estimator.predict(features).ravel()
    if pd.api.types.is_float_dtype(flat):
        return (flat > 0.5).astype(int)  # Convert probabilities to binary predictions
    return flat


def _report_test_performance(name, estimator, stage):
    """Print test-set accuracy and a classification report for ``estimator``.

    Args:
        name: Display name of the model, used in the printed messages.
        estimator: A fitted estimator exposing ``predict``.
        stage: Short label such as "before tuning" or "after tuning".

    Returns:
        A ``(y_pred, accuracy)`` tuple for the module-level test split.
    """
    y_pred = _predict_labels(estimator, X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Test accuracy for {name} ({stage}): {accuracy:.4f}")
    print(classification_report(y_test, y_pred))
    return y_pred, accuracy


# Maps each model name to its best fitted estimator: the grid-search winner
# when a hyperparameter grid exists, otherwise the baseline fit.
best_models = {}
for name, model, param_grid in models:
    # Train the model without grid search first and evaluate on the test set
    model.fit(X_train_scaled, y_train)
    y_pred, accuracy = _report_test_performance(name, model, "before tuning")

    if not param_grid:
        # Nothing to tune: keep the baseline so every algorithm has an entry.
        best_models[name] = model
        continue

    # Perform GridSearchCV for hyperparameter tuning (5-fold, accuracy)
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train_scaled, y_train)

    best_model = grid_search.best_estimator_
    best_models[name] = best_model

    # best_score_ is the mean cross-validated score of the best parameter
    # combination, not an accuracy measured on the full training set.
    print(f"Best parameters for {name}: {grid_search.best_params_}")
    print(f"Best cross-validation accuracy for {name}: {grid_search.best_score_:.4f}")

    # Evaluate the tuned model on the test set
    y_pred, accuracy = _report_test_performance(name, best_model, "after tuning")
