In [None]:
%pip install ipykernel

In [None]:
!nvidia-smi

In [None]:
%pip install pandas
%pip install matplotlib seaborn scikit-learn
%pip install xgboost

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import xgboost as xgb


# Read the training set and discard every row that contains at least one NaN.
df = pd.read_csv('train.csv').dropna()

In [None]:
# Outlier inspection: draw one boxplot per column, calling plt.show() after each
# so every column gets its own output.
boxplot_columns = [
    'Age',
    'Income',
    'LoanAmount',
    'CreditScore',
    'MonthsEmployed',
    'NumCreditLines',
    'InterestRate',
    'LoanTerm',
    'DTIRatio',
    'Default',
]
for column_name in boxplot_columns:
    sns.boxplot(x=df[column_name])
    plt.show()

## Outlier Detection

In this section, we use boxplots to detect outliers in various numeric columns of the dataset. Boxplots are a useful visualization tool for identifying outliers, as they display the distribution of data and highlight any values that fall significantly outside the expected range. The columns analyzed for outliers include:

- Age
- Income
- LoanAmount
- CreditScore
- MonthsEmployed
- NumCreditLines
- InterestRate
- LoanTerm
- DTIRatio
- Default

By examining these boxplots, we can identify and potentially address any outliers that may affect the performance of our machine learning models.


## Data Preprocessing and Feature Engineering

In this section, we perform data preprocessing and feature engineering to prepare the dataset for machine learning models. The steps include:

1. **Selecting Numeric Columns**: We filter the dataframe to include only columns with numeric data types (`float64` and `int64`).

2. **Dropping Unnecessary Columns**: We drop the `LoanID` column as it is an identifier and not useful for modeling.

3. **Separating Features and Target**: We separate the features (predictor variables) and the target variable (`Default`).

4. **Standardizing the Features**: We scale the features to have a mean of 0 and a standard deviation of 1 using `StandardScaler`.

5. **Combining Scaled Features and Target**: We combine the scaled features with the target variable to form a new dataframe.

6. **Ensuring Binary Target**: We check that the `Default` column contains only binary values (0 or 1).

7. **Summary Statistics**: We display the summary statistics of the scaled dataframe to understand the distributions and statistics of all columns.

These preprocessing steps are crucial for ensuring that the data is clean, consistent, and suitable for training machine learning models.


In [None]:
# --- Preprocessing -----------------------------------------------------------
# Keep only the numeric columns (float64 / int64) of the training frame.
df_numeric = df.select_dtypes(include=['float64', 'int64'])

# LoanID is an identifier, not a predictor; drop it if it survived the dtype filter.
df_numeric = df_numeric.drop(columns=['LoanID'], errors='ignore')

# Separate the predictors from the target.
features = df_numeric.drop(columns=['Default'])
target = df_numeric['Default']

# The target must be binary; fail early rather than train on unexpected labels.
unexpected_labels = set(target.unique()) - {0, 1}
if unexpected_labels:
    raise ValueError(f"'Default' must contain only 0/1, found: {sorted(unexpected_labels)}")

# Standardize the predictors (mean 0, std 1). `scaler` and `features` are reused by the
# submission cells to transform test.csv with the same statistics.
# NOTE(review): the scaler is fitted on all training rows before the train/test split, so the
# held-out split is not fully independent of the scaling statistics.
scaler = StandardScaler()
features_scaled = pd.DataFrame(
    scaler.fit_transform(features),
    columns=features.columns,
    index=features.index,  # keep the original index: dropna() leaves gaps and concat aligns on index
)

# Recombine the scaled predictors with the (unscaled) target.
df_scaled = pd.concat([features_scaled, target], axis=1)

# --- Correlation matrix and heatmap -----------------------------------------
# Pairwise linear correlations between all scaled features and the target.
corr = df_scaled.corr()

# Heatmap of the correlation matrix: coefficients annotated with 6 decimals,
# 'coolwarm' palette, thin lines between cells.
plt.figure(figsize=(12, 10))
sns.heatmap(corr, annot=True, fmt=".6f", cmap='coolwarm', linewidths=0.5)
plt.show()

# Raw correlation values for inspection in the cell output.
print(corr)

# --- Optional feature removal -----------------------------------------------
# Columns listed here are removed from df_scaled (names that are absent are ignored).
# The submission cells must feed the models the same columns that remain here.
# NOTE(review): an earlier comment said no columns were dropped because results got worse,
# but the code does drop 'MonthsEmployed'. Set the list to [] if that is the intent.
columns_to_drop = ['MonthsEmployed']
df_scaled = df_scaled.drop(columns=[col for col in columns_to_drop if col in df_scaled.columns])

# Summary statistics of the final scaled frame (count, mean, std, min, quartiles, max).
print(df_scaled.describe())



## Correlation Analysis

In this section, we perform correlation analysis to understand the relationships between different numeric features in the dataset. The steps include:

1. **Selecting Numeric Columns**: We filter the dataframe to include only columns with numeric data types (`float64` and `int64`).

2. **Calculating Correlation Matrix**: We compute the correlation matrix for the scaled dataframe to identify linear relationships between pairs of features.

3. **Plotting Correlation Heatmap**: We visualize the correlation matrix using a heatmap, which helps in easily identifying strong positive or negative correlations between features.

4. **Dropping Highly Correlated Features**: If necessary, we drop features that are highly correlated with others to avoid multicollinearity, which can negatively impact the performance of certain machine learning models.

5. **Summary Statistics**: We display the summary statistics of the scaled dataframe to understand the distributions and statistics of all columns after scaling.

By analyzing the correlations, we can make informed decisions about feature selection and engineering, which are crucial for building robust machine learning models.


In [None]:
# Hold out 1% of the scaled data as a test set (fixed seed so the split is reproducible),
# then sanity-check the training portion before any model is fitted.
feature_frame = df_scaled.drop('Default', axis=1)
target_series = df_scaled['Default'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    target_series,
    test_size=0.01,
    random_state=87,
)

# Total number of missing cells across all training features.
print("NaN values in X_train:", X_train.isnull().sum().sum())

# Number of missing labels in the training target.
print("NaN values in y_train:", y_train.isnull().sum())

# Per-column dtypes of the training features, to confirm every column is numeric.
print("Data types in X_train:", X_train.dtypes)

# Dtype of the training target (cast to int above).
print("Data type in y_train:", y_train.dtype)




## Train-Test Split, Data Validation, and Grid Search

In this section, we perform the following steps to prepare our data for machine learning:

1. **Train-Test Split**: We split the scaled dataset into training and test sets. We use 99% of the data for training and 1% for testing to ensure we have enough data for model training while keeping a small portion for validation.

2. **Check for NaN Values**: We check for any missing values (NaNs) in both the training features (`X_train`) and the target variable (`y_train`). Handling NaNs is crucial as they can cause issues during model training.

3. **Check Data Types**: We verify that all columns in the training features (`X_train`) are numeric, as most machine learning models require numeric inputs. We also ensure that the target variable (`y_train`) is of the correct data type (integer) for classification.

4. **Grid Search for Hyperparameter Tuning**: We use GridSearchCV to perform hyperparameter tuning for our Random Forest model. Grid search tests multiple combinations of hyperparameters to find the best configuration for our model. We define a parameter grid and use 3-fold cross-validation to evaluate each combination.

5. **Evaluate Best Model**: After finding the best hyperparameters, we evaluate the best model on the test set to check its performance. We calculate the accuracy of the model's predictions on the test data.



In [None]:
#gridsearch
# Random Forest hyperparameter search.
# NOTE: the search is fitted on X_train / y_train produced by the train/test split cell;
# X and y below hold the full feature matrix and integer target and are not used for fitting here.
X = df_scaled.drop('Default', axis=1)
y = df_scaled['Default'].astype(int)

# Candidate values tried exhaustively by the search (3 * 3 * 3 * 3 combinations).
param_grid = dict(
    n_estimators=[50, 100, 200],       # number of trees in the forest
    max_depth=[2, 5, 10],              # maximum depth of each tree
    min_samples_split=[2, 5, 10],      # minimum samples required to split a node
    min_samples_leaf=[1, 2, 4],        # minimum samples required at a leaf
)

# class_weight='balanced' reweights classes inversely to their frequency.
rf = RandomForestClassifier(class_weight='balanced', random_state=42)

# 3-fold cross-validation over the grid, using all CPU cores, with progress output.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Report the winning configuration and its cross-validation score.
print("Best Parameters:", grid_search.best_params_)
print("Best Estimator:", grid_search.best_estimator_)
print("Best Cross-validation Score:", grid_search.best_score_)

# Full per-configuration results, followed by just the mean CV score of each configuration.
print("Grid Search Results:", grid_search.cv_results_)
print("Mean Test Scores:", grid_search.cv_results_['mean_test_score'])

# Score the refitted best model on the held-out split.
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

In [None]:
#grid search submission
# Load the competition test set and drop rows with missing values.
# NOTE(review): rows dropped here receive no prediction in the submission file — confirm this is acceptable.
df_test = pd.read_csv('test.csv')
df_test = df_test.dropna()

# Keep the numeric columns, restricted to and ordered like the columns the scaler was fitted on.
df_test_numeric = df_test.select_dtypes(include=[np.number])
df_test_numeric = df_test_numeric[features.columns]

# Standardize with the scaler fitted on the training data.
df_test_scaled = pd.DataFrame(scaler.transform(df_test_numeric), columns=features.columns)

# Feed the model exactly the columns it was trained on. Selecting X_train.columns (instead of
# dropping a hard-coded list) keeps this cell in sync with whatever was dropped from df_scaled;
# the previous hard-coded list also removed 'NumCreditLines', which training had kept.
df_test_scaled = df_test_scaled[X_train.columns]

# Check for NaN values in the processed test data.
print("NaN values in df_test_scaled:", df_test_scaled.isnull().sum().sum())

# Predict with the tuned Random Forest and attach the predictions to the test rows.
y_pred = best_rf.predict(df_test_scaled)
df_test['Default'] = y_pred

# The submission file contains only the identifier and the predicted label.
df_submission = df_test[['LoanID', 'Default']]
df_submission.to_csv('sub.csv', index=False)

print(df_submission)

## XGBoost with Grid Search

In this section, we utilize XGBoost, a powerful gradient boosting algorithm, along with Grid Search for hyperparameter tuning. The steps include:

1. **Data Preparation**: We prepare the data by separating the features and target variable, and splitting the dataset into training and test sets.

2. **Defining Parameter Grid**: We define a parameter grid for XGBoost, specifying different values for hyperparameters such as the number of trees (`n_estimators`), learning rate (`learning_rate`), maximum depth of trees (`max_depth`), subsampling rate (`subsample`), and column sampling rate (`colsample_bytree`).

3. **Initializing XGBoost Classifier**: We initialize the XGBoost classifier with GPU support to speed up the training process.

4. **Grid Search with Cross-Validation**: We set up `GridSearchCV` to perform an exhaustive search over the specified parameter grid, using 3-fold cross-validation to evaluate each combination of hyperparameters.

5. **Training the Model**: We fit the model using `GridSearchCV` to find the best hyperparameters and train the XGBoost model.

6. **Evaluating the Best Model**: After finding the best hyperparameters, we evaluate the best model on the test set to check its performance. We calculate the accuracy of the model's predictions on the test data.

By using Grid Search with XGBoost, we aim to optimize the model's performance by finding the best combination of hyperparameters, ensuring robust and accurate predictions.

In [None]:
#xg+grid
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Full feature matrix and integer target.
# NOTE: the search below is fitted on X_train / y_train from the train/test split cell, not on X / y.
X = df_scaled.drop('Default', axis=1)
y = df_scaled['Default'].astype(int)

# Hyperparameter grid searched exhaustively (3 values per parameter).
param_grid_xgb = {
    'n_estimators': [50, 100, 150],        # number of boosting rounds
    'learning_rate': [0.05, 0.1, 0.2],     # shrinkage applied to each tree
    'max_depth': [3, 5, 7],                # maximum tree depth
    'subsample': [0.7, 0.8, 1.0],          # fraction of rows sampled per tree
    'colsample_bytree': [0.7, 0.8, 1.0]    # fraction of columns sampled per tree
}

# XGBoost classifier trained on the GPU.
# tree_method='gpu_hist' is deprecated since XGBoost 2.0; the supported spelling is
# tree_method='hist' together with device='cuda'.
xgb_clf = xgb.XGBClassifier(tree_method='hist', device='cuda', random_state=19)

# 3-fold cross-validated grid search.
# NOTE(review): n_jobs=-1 launches one worker per CPU core, all sharing the same GPU — lower
# n_jobs if GPU memory errors or contention appear.
grid_search_gb = GridSearchCV(estimator=xgb_clf, param_grid=param_grid_xgb, cv=3, n_jobs=-1, verbose=2)

# Fit the model with GridSearchCV
grid_search_gb.fit(X_train, y_train)

# Print the best parameters and score found by GridSearchCV
print("Best Parameters for XGBoost:", grid_search_gb.best_params_)
print("Best Estimator for XGBoost:", grid_search_gb.best_estimator_)
print("Best Cross-validation Score for XGBoost:", grid_search_gb.best_score_)

# best_gb is the model used by the XGBoost submission cell.
best_gb = grid_search_gb.best_estimator_

# Evaluate the best model on the held-out split.
y_pred_xgb = best_gb.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print("Test Accuracy with XGBoost:", accuracy_xgb)

In [None]:
#xg+grid submission
# Load the competition test set and drop rows with missing values.
# NOTE(review): rows dropped here receive no prediction in the submission file — confirm this is acceptable.
df_test = pd.read_csv('test.csv')
df_test = df_test.dropna()

# Keep the numeric columns, restricted to and ordered like the columns the scaler was fitted on.
df_test_numeric = df_test.select_dtypes(include=[np.number])
df_test_numeric = df_test_numeric[features.columns]

# Standardize with the scaler fitted on the training data.
df_test_scaled = pd.DataFrame(scaler.transform(df_test_numeric), columns=features.columns)

# Feed the model exactly the columns it was trained on. Selecting X_train.columns (instead of
# dropping a hard-coded list) keeps this cell in sync with whatever was dropped from df_scaled;
# the previous hard-coded list also removed 'NumCreditLines', which training had kept.
df_test_scaled = df_test_scaled[X_train.columns]

# Check for NaN values in the processed test data.
print("NaN values in df_test_scaled:", df_test_scaled.isnull().sum().sum())

# Predict with the current best_gb model and attach the predictions to the test rows.
y_pred_gb = best_gb.predict(df_test_scaled)
df_test['Default'] = y_pred_gb

# The submission file contains only the identifier and the predicted label.
df_submission_gb = df_test[['LoanID', 'Default']]
df_submission_gb.to_csv('sub_gb.csv', index=False)

print(df_submission_gb)

## XGBoost Model Training and Evaluation

In this section, we focus on training and evaluating an XGBoost model. The steps include:

1. **Data Preparation**: We prepare the data by separating the features and target variable, and splitting the dataset into training and test sets.

2. **Defining XGBoost Parameters**: We define a set of hyperparameters for the XGBoost model, including the number of trees (`n_estimators`), learning rate (`learning_rate`), maximum depth of trees (`max_depth`), subsampling rate (`subsample`), column sampling rate (`colsample_bytree`), and regularization parameters (`gamma`, `min_child_weight`, `reg_alpha`, `reg_lambda`).

3. **Training the XGBoost Model**: We initialize the XGBoost classifier with the defined parameters and train it on the training data.

4. **Evaluating the Model**: We evaluate the trained XGBoost model on the test set by predicting the target variable and calculating the accuracy of the predictions.

By using XGBoost, we aim to leverage its powerful gradient boosting algorithm to achieve robust and accurate predictions.

In [None]:
#cell 16
# XGBoost with a fixed, hand-tuned configuration (no grid search).
# Re-split the scaled data: 1% held out, fixed seed. The target is cast to int so the labels
# have the same dtype as in the other training cells.
X_train, X_test, y_train, y_test = train_test_split(
    df_scaled.drop('Default', axis=1),
    df_scaled['Default'].astype(int),
    test_size=0.01,
    random_state=87,
)

# Many shallow-learning-rate rounds with deep trees, regularized via gamma,
# min_child_weight and the L1/L2 penalties.
xgb_params = {
    'n_estimators': 1000,        # number of boosting rounds
    'learning_rate': 0.01,       # small step size per round
    'max_depth': 12,             # deep trees
    'subsample': 0.85,           # fraction of rows sampled per tree
    'colsample_bytree': 0.85,    # fraction of columns sampled per tree
    'gamma': 5,                  # minimum loss reduction required to split
    'min_child_weight': 7,       # minimum sum of instance weight in a child
    'reg_alpha': 0.5,            # L1 regularization
    'reg_lambda': 1,             # L2 regularization
    # GPU training. 'gpu_hist' is deprecated since XGBoost 2.0; use hist + device='cuda'.
    'tree_method': 'hist',
    'device': 'cuda',
    'random_state': 87           # reproducibility
}

# Initialize and train the XGBoost classifier
xgb_clf = xgb.XGBClassifier(**xgb_params)
xgb_clf.fit(X_train, y_train)

# best_gb is the model used by the XGBoost submission cell; this overwrites any earlier best_gb.
best_gb = xgb_clf

# Evaluate on the held-out split.
y_pred_xgb = best_gb.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print("Test Accuracy with XGBoost:", accuracy_xgb)

In [None]:
#cell 17
#xg submission
# Load the competition test set and drop rows with missing values.
# NOTE(review): rows dropped here receive no prediction in the submission file — confirm this is acceptable.
df_test = pd.read_csv('test.csv')
df_test = df_test.dropna()

# Keep the numeric columns, restricted to and ordered like the columns the scaler was fitted on.
# Without this alignment any extra or reordered numeric column in test.csv would be passed
# to scaler.transform and mislabeled by the DataFrame constructor below.
df_test_numeric = df_test.select_dtypes(include=[np.number])
df_test_numeric = df_test_numeric[features.columns]

# Standardize with the scaler fitted on the training data.
df_test_scaled = pd.DataFrame(scaler.transform(df_test_numeric), columns=features.columns)

# Feed the model exactly the columns it was trained on: any column removed from df_scaled
# before training (e.g. via columns_to_drop) is removed here too.
df_test_scaled = df_test_scaled[X_train.columns]

# Check for NaN values in the processed test data.
print("NaN values in df_test_scaled:", df_test_scaled.isnull().sum().sum())

# Predict with the current best_gb model and attach the predictions to the test rows.
y_pred_gb = best_gb.predict(df_test_scaled)
df_test['Default'] = y_pred_gb

# The submission file contains only the identifier and the predicted label.
df_submission_gb = df_test[['LoanID', 'Default']]
df_submission_gb.to_csv('sub_gb2.csv', index=False)

print(df_submission_gb)

## Stacking Ensemble Model

In this section, we implement a stacking ensemble model to improve the prediction accuracy by combining multiple base models. The steps include:

1. **Data Preparation**: We prepare the data by separating the features and target variable, and splitting the dataset into training and test sets.

2. **Defining Base Models**: We define two base models for the stacking ensemble:
    - Random Forest Classifier
    - Gradient Boosting Classifier

3. **Stacking Classifier**: We create a stacking classifier that combines the predictions from the base models using a meta-model (Logistic Regression).

4. **Training the Stacking Model**: We fit the stacking model on the training data.

5. **Evaluating the Stacking Model**: We evaluate the stacking model on the test set by predicting the target variable and calculating the accuracy of the predictions.

By using a stacking ensemble, we aim to leverage the strengths of multiple models to achieve better performance and more robust predictions.

In [None]:
#stacking
# Features and integer target from the scaled frame.
X = df_scaled.drop('Default', axis=1)
y = df_scaled['Default'].astype(int)

# 80/20 train/test split with a fixed seed. This rebinds X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Level-0 learners whose predictions feed the meta-model.
random_forest = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
gradient_boosting = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
)
base_estimators = [
    ('random_forest', random_forest),
    ('gradient_boosting', gradient_boosting),
]

# Stacking ensemble: a logistic-regression meta-model is trained on 3-fold
# cross-validated predictions of the base learners; all CPU cores are used.
stacking_model = StackingClassifier(
    estimators=base_estimators,
    final_estimator=LogisticRegression(),
    cv=3,
    n_jobs=-1,
)
stacking_model.fit(X_train, y_train)

# Accuracy of the ensemble on the held-out 20%.
y_pred_stack = stacking_model.predict(X_test)
stack_accuracy = accuracy_score(y_test, y_pred_stack)
print("Test Accuracy with Stacking Ensemble:", stack_accuracy)

In [None]:
# Stacking submission.
# Load the competition test set and drop rows with missing values.
# NOTE(review): rows dropped here receive no prediction in the submission file — confirm this is acceptable.
df_test = pd.read_csv('test.csv')
df_test = df_test.dropna()

# Keep the numeric columns, restricted to and ordered like the columns the scaler was fitted on.
df_test_numeric = df_test.select_dtypes(include=[np.number])
df_test_numeric = df_test_numeric[features.columns]

# Standardize with the scaler fitted on the training data.
df_test_scaled = pd.DataFrame(scaler.transform(df_test_numeric), columns=features.columns)

# Feed the model exactly the columns it was trained on. Selecting X_train.columns (instead of
# dropping a hard-coded list) keeps this cell in sync with whatever was dropped from df_scaled;
# the previous hard-coded list also removed 'NumCreditLines', which training had kept.
df_test_scaled = df_test_scaled[X_train.columns]

# Check for NaN values in the processed test data.
print("NaN values in df_test_scaled:", df_test_scaled.isnull().sum().sum())

# Predict 'Default' with the stacking ensemble and attach the predictions to the test rows.
y_pred_stack = stacking_model.predict(df_test_scaled)
df_test['Default'] = y_pred_stack

# The submission file contains only the identifier and the predicted label.
df_submission_stack = df_test[['LoanID', 'Default']]
df_submission_stack.to_csv('sub_stack.csv', index=False)