In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from tensorflow.keras.models import Sequential, save_model, load_model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.utils import get_custom_objects
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, f1_score
import joblib
import scipy.stats as st
import threading
import time

# Register custom metric with Keras
def f1_metric(y_true, y_pred):
    y_pred = K.round(y_pred)
    tp = K.sum(K.cast(y_true * y_pred, 'float'), axis=0)
    tn = K.sum(K.cast((1 - y_true) * (1 - y_pred), 'float'), axis=0)
    fp = K.sum(K.cast((1 - y_true) * y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true * (1 - y_pred), 'float'), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2 * p * r / (p + r + K.epsilon())
    return K.mean(f1)

get_custom_objects().update({"f1_metric": f1_metric})

# Global Variables
dropout_value = 0.5
epochs_value = 10000
patience_value = 100
random_state_value = 50
max_search_time = 300  # Maximum time for each RandomizedSearchCV in seconds (5 minutes)

# Custom timeout handler using threading
class TimeoutException(Exception):
    pass

def run_with_timeout(func, args=(), kwargs={}, timeout=300):
    result = [None]
    exception = [None]
    
    def wrapper():
        try:
            result[0] = func(*args, **kwargs)
        except Exception as e:
            exception[0] = e
    
    thread = threading.Thread(target=wrapper)
    thread.start()
    thread.join(timeout)
    
    if thread.is_alive():
        thread.join()
        raise TimeoutException("Timeout reached")
    
    if exception[0]:
        raise exception[0]
    
    return result[0]

# Load the training data
print("Loading training data...")
train_file_path = 'train.csv'  # replace with your actual file path
train_data = pd.read_csv(train_file_path)
print("Training data loaded.")

# Separate features and target variable
X_train = train_data.drop(columns=['Target'])
y_train = train_data['Target']

# Handle missing values
X_train = X_train.fillna(X_train.mean())

# Normalize the features
print("Normalizing features...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
print("Features normalized.")

# Build the neural network model
print("Building neural network model...")
def build_nn_model(input_dim):
    model = Sequential()
    model.add(Dense(128, input_dim=input_dim, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(dropout_value))
    model.add(Dense(64, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(dropout_value))
    model.add(Dense(32, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(dropout_value))
    model.add(Dense(16, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=[f1_metric])
    return model

# Initialize the neural network model
nn_model = build_nn_model(X_train_scaled.shape[1])

# Train the neural network model
print("Training neural network model...")
early_stopping = EarlyStopping(monitor='val_loss', patience=patience_value, restore_best_weights=True, min_delta=0.001)
model_checkpoint = ModelCheckpoint('best_nn_model.h5', monitor='val_loss', save_best_only=True, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001, verbose=1)

history = nn_model.fit(X_train_scaled, y_train, validation_split=0.2, epochs=epochs_value, batch_size=32, callbacks=[early_stopping, model_checkpoint, reduce_lr])

# Load the best neural network model
print("Loading best neural network model...")
nn_model.load_weights('best_nn_model.h5')

# Create a wrapper for the neural network model to use it in the StackingClassifier
class NeuralNetworkClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, model):
        self.model = model
        self.classes_ = [0, 1]  # Add the classes_ attribute

    def fit(self, X, y, epochs=100, batch_size=32, verbose=0):
        self.model.fit(X, y, epochs=epochs, batch_size=batch_size, verbose=verbose)
        return self

    def predict(self, X):
        return (self.model.predict(X) > 0.5).astype(int).reshape(-1)

    def predict_proba(self, X):
        probas = self.model.predict(X)
        return np.hstack(((1 - probas), probas))

# Initialize the classifiers with hyperparameter tuning (excluding SVC and GBC)
print("Initializing and tuning classifiers...")
nn_wrapper = NeuralNetworkClassifier(nn_model)

param_dist = {
    'rf': {
        'n_estimators': st.randint(50, 200),
        'max_features': ['auto', 'sqrt', 'log2'],
        'max_depth': st.randint(4, 20),
        'criterion': ['gini', 'entropy']
    },
    'knn': {
        'n_neighbors': st.randint(1, 30),
        'weights': ['uniform', 'distance']
    },
    'adb': {
        'n_estimators': st.randint(50, 200),
        'learning_rate': st.uniform(0.01, 1.5)
    },
    'etc': {
        'n_estimators': st.randint(50, 200),
        'max_features': ['auto', 'sqrt', 'log2'],
        'max_depth': st.randint(4, 20),
        'criterion': ['gini', 'entropy']
    },
    'xgb': {
        'n_estimators': st.randint(50, 200),
        'learning_rate': st.uniform(0.01, 1.5),
        'max_depth': st.randint(3, 20)
    },
    'cat': {
        'depth': st.randint(4, 10),
        'learning_rate': st.uniform(0.01, 1.5),
        'iterations': st.randint(50, 200)
    },
    'dt': {
        'max_features': ['auto', 'sqrt', 'log2'],
        'max_depth': st.randint(4, 20),
        'criterion': ['gini', 'entropy']
    }
}

# Define base models
base_models = {
    'rf': RandomForestClassifier(random_state=random_state_value),
    'knn': KNeighborsClassifier(),
    'svc': SVC(probability=True, random_state=random_state_value),  # SVC without hyperparameter tuning
    'adb': AdaBoostClassifier(random_state=random_state_value),
    'gbc': GradientBoostingClassifier(random_state=random_state_value),  # GBC without hyperparameter tuning
    'etc': ExtraTreesClassifier(random_state=random_state_value),
    'xgb': XGBClassifier(random_state=random_state_value),
    'cat': CatBoostClassifier(verbose=0, random_state=random_state_value),
    'dt': DecisionTreeClassifier(random_state=random_state_value)
}

# Perform RandomizedSearchCV (excluding SVC and GBC) with timeout
best_estimators = []
for name, model in base_models.items():
    if name not in ['svc', 'gbc']:  # Skip hyperparameter tuning for SVC and GBC
        print(f"Hyperparameter tuning for {name}...")
        rand_search = RandomizedSearchCV(model, param_distributions=param_dist[name], n_iter=10, scoring='f1', cv=5, random_state=random_state_value)
        try:
            best_estimator = run_with_timeout(rand_search.fit, args=(X_train_scaled, y_train), timeout=max_search_time)
            best_estimators.append((name, best_estimator.best_estimator_))
        except TimeoutException:
            print(f"Timeout reached for {name}. Using default parameters.")
            best_estimators.append((name, model))
    else:
        best_estimators.append((name, model))

# Add the neural network wrapper
best_estimators.append(('nn', nn_wrapper))

# Create the stacking classifier
stacking_clf = StackingClassifier(
    estimators=best_estimators,
    final_estimator=LogisticRegression(),
    cv=5
)

# Train the StackingClassifier on the entire training data
print("Training StackingClassifier on the entire training data...")
stacking_clf.fit(X_train_scaled, y_train)

# Save the StackingClassifier model
print("Saving StackingClassifier model...")
stacking_model_path = "stacking_classifier_model.pkl"
joblib.dump(stacking_clf, stacking_model_path)

# Load the test data
print("Loading test data...")
test_file_path = 'test.csv'  # replace with your actual file path
test_data = pd.read_csv(test_file_path)
print("Test data loaded.")

# Handle missing values in the test data
X_test_final = test_data.fillna(test_data.mean())

# Normalize the test data
print("Normalizing test data...")
X_test_final_scaled = scaler.transform(X_test_final)
print("Test data normalized.")

# Make predictions with the StackingClassifier model
print("Making predictions with the StackingClassifier model...")
y_pred_final = stacking_clf.predict(X_test_final_scaled)

# Save predictions to a CSV file
print("Saving predictions...")
output = pd.DataFrame({'ID': test_data['ID'], 'Target': y_pred_final})
output.to_csv('predictions.csv', index=False)
print("Predictions saved to predictions.csv.")


Loading training data...
Training data loaded.
Normalizing features...
Features normalized.
Building neural network model...
Training neural network model...
Epoch 1/10000
Epoch 1: val_loss improved from inf to 0.69440, saving model to best_nn_model.h5
Epoch 2/10000
Epoch 2: val_loss did not improve from 0.69440


  saving_api.save_model(


Epoch 3/10000
Epoch 3: val_loss did not improve from 0.69440
Epoch 4/10000
Epoch 4: val_loss improved from 0.69440 to 0.69294, saving model to best_nn_model.h5
Epoch 5/10000
Epoch 5: val_loss improved from 0.69294 to 0.69036, saving model to best_nn_model.h5
Epoch 6/10000
Epoch 6: val_loss improved from 0.69036 to 0.68733, saving model to best_nn_model.h5
Epoch 7/10000
Epoch 7: val_loss improved from 0.68733 to 0.68479, saving model to best_nn_model.h5
Epoch 8/10000
Epoch 8: val_loss improved from 0.68479 to 0.68413, saving model to best_nn_model.h5
Epoch 9/10000
Epoch 9: val_loss did not improve from 0.68413
Epoch 10/10000
Epoch 10: val_loss improved from 0.68413 to 0.68193, saving model to best_nn_model.h5
Epoch 11/10000
Epoch 11: val_loss improved from 0.68193 to 0.68050, saving model to best_nn_model.h5
Epoch 12/10000
Epoch 12: val_loss improved from 0.68050 to 0.68045, saving model to best_nn_model.h5
Epoch 13/10000
Epoch 13: val_loss improved from 0.68045 to 0.67820, saving model

Epoch 28: val_loss improved from 0.66789 to 0.66705, saving model to best_nn_model.h5
Epoch 29/10000
Epoch 29: val_loss improved from 0.66705 to 0.66480, saving model to best_nn_model.h5
Epoch 30/10000
Epoch 30: val_loss did not improve from 0.66480
Epoch 31/10000
Epoch 31: val_loss improved from 0.66480 to 0.66464, saving model to best_nn_model.h5
Epoch 32/10000
Epoch 32: val_loss did not improve from 0.66464
Epoch 33/10000
Epoch 33: val_loss improved from 0.66464 to 0.66456, saving model to best_nn_model.h5
Epoch 34/10000
Epoch 34: val_loss improved from 0.66456 to 0.66058, saving model to best_nn_model.h5
Epoch 35/10000
Epoch 35: val_loss improved from 0.66058 to 0.65954, saving model to best_nn_model.h5
Epoch 36/10000
Epoch 36: val_loss improved from 0.65954 to 0.65890, saving model to best_nn_model.h5
Epoch 37/10000
Epoch 37: val_loss did not improve from 0.65890
Epoch 38/10000
Epoch 38: val_loss improved from 0.65890 to 0.65708, saving model to best_nn_model.h5
Epoch 39/10000
Epo

Epoch 54: val_loss did not improve from 0.65313
Epoch 55/10000
Epoch 55: val_loss did not improve from 0.65313
Epoch 56/10000
Epoch 56: val_loss did not improve from 0.65313
Epoch 57/10000
Epoch 57: val_loss did not improve from 0.65313
Epoch 58/10000
Epoch 58: val_loss did not improve from 0.65313
Epoch 59/10000
Epoch 59: val_loss did not improve from 0.65313
Epoch 60/10000
Epoch 60: val_loss did not improve from 0.65313
Epoch 61/10000
Epoch 61: val_loss did not improve from 0.65313
Epoch 62/10000
Epoch 62: val_loss did not improve from 0.65313
Epoch 63/10000
Epoch 63: val_loss did not improve from 0.65313
Epoch 64/10000
Epoch 64: val_loss did not improve from 0.65313
Epoch 65/10000
Epoch 65: val_loss did not improve from 0.65313
Epoch 66/10000
Epoch 66: val_loss did not improve from 0.65313
Epoch 67/10000
Epoch 67: val_loss did not improve from 0.65313
Epoch 68/10000
Epoch 68: val_loss did not improve from 0.65313
Epoch 69/10000
Epoch 69: val_loss did not improve from 0.65313
Epoch 7

Epoch 82/10000
Epoch 82: val_loss did not improve from 0.65313
Epoch 83/10000
Epoch 83: val_loss did not improve from 0.65313
Epoch 84/10000
Epoch 84: val_loss did not improve from 0.65313
Epoch 85/10000
Epoch 85: val_loss did not improve from 0.65313
Epoch 86/10000
Epoch 86: val_loss did not improve from 0.65313
Epoch 87/10000
Epoch 87: val_loss did not improve from 0.65313
Epoch 88/10000
Epoch 88: val_loss did not improve from 0.65313
Epoch 89/10000
Epoch 89: val_loss did not improve from 0.65313
Epoch 90/10000
Epoch 90: val_loss did not improve from 0.65313
Epoch 91/10000
Epoch 91: val_loss did not improve from 0.65313
Epoch 92/10000
Epoch 92: val_loss did not improve from 0.65313
Epoch 93/10000
Epoch 93: val_loss did not improve from 0.65313
Epoch 94/10000
Epoch 94: val_loss did not improve from 0.65313
Epoch 95/10000
Epoch 95: val_loss did not improve from 0.65313
Epoch 96/10000
Epoch 96: val_loss did not improve from 0.65313
Epoch 97/10000
Epoch 97: val_loss did not improve from 

Epoch 110/10000
Epoch 110: val_loss did not improve from 0.65313
Epoch 111/10000
Epoch 111: val_loss did not improve from 0.65313
Epoch 112/10000
Epoch 112: val_loss did not improve from 0.65313
Epoch 113/10000
Epoch 113: val_loss did not improve from 0.65313
Epoch 114/10000
Epoch 114: val_loss did not improve from 0.65313
Epoch 115/10000
Epoch 115: val_loss did not improve from 0.65313
Epoch 116/10000
Epoch 116: val_loss did not improve from 0.65313
Epoch 117/10000
Epoch 117: val_loss did not improve from 0.65313
Epoch 118/10000
Epoch 118: val_loss did not improve from 0.65313
Epoch 119/10000
Epoch 119: val_loss did not improve from 0.65313
Epoch 120/10000
Epoch 120: val_loss did not improve from 0.65313
Epoch 121/10000
Epoch 121: val_loss did not improve from 0.65313
Epoch 122/10000
Epoch 122: val_loss did not improve from 0.65313
Epoch 123/10000
Epoch 123: val_loss did not improve from 0.65313
Epoch 124/10000
Epoch 124: val_loss did not improve from 0.65313
Epoch 125/10000
Epoch 125

Epoch 138/10000
Epoch 138: val_loss did not improve from 0.65313
Epoch 139/10000
Epoch 139: val_loss did not improve from 0.65313
Epoch 140/10000
Epoch 140: val_loss did not improve from 0.65313
Loading best neural network model...
Initializing and tuning classifiers...
Hyperparameter tuning for rf...


15 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\bapti\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\bapti\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\bapti\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\bapti\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameter

Hyperparameter tuning for knn...
Hyperparameter tuning for adb...
Hyperparameter tuning for etc...


15 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\bapti\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\bapti\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\bapti\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\bapti\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameter

Hyperparameter tuning for xgb...
Hyperparameter tuning for cat...


learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease l

Hyperparameter tuning for dt...


15 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\bapti\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\bapti\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\bapti\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\bapti\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameter

Training StackingClassifier on the entire training data...
Saving StackingClassifier model...
Loading test data...
Test data loaded.
Normalizing test data...
Test data normalized.
Making predictions with the StackingClassifier model...
Saving predictions...
Predictions saved to predictions.csv.
