In [2]:
import numpy as np
import pandas as pd
import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

/kaggle/input/playground-series-s4e11/sample_submission.csv
/kaggle/input/playground-series-s4e11/train.csv
/kaggle/input/playground-series-s4e11/test.csv


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
import time
import warnings

warnings.filterwarnings("ignore")

class XGBoostCPUPipeline:
    """CPU XGBoost workflow: grid search, final fit, validation report, submission.

    ``run`` reads the train/test CSV files, integer-encodes object columns,
    holds out 20% of the training rows for validation, grid-searches a fixed
    hyperparameter grid with ``xgb.train`` (scored by validation AUC), refits
    an ``XGBClassifier`` with the winning parameters and optionally writes a
    submission CSV.
    """

    # Number of rounds without validation improvement before training stops.
    EARLY_STOPPING_ROUNDS = 50

    def __init__(self, random_state=42):
        """Store the seed shared by the split, the grid search and the final model.

        Args:
            random_state (int, optional): Seed for reproducibility. Defaults to 42.
        """
        self.random_state = random_state

    @staticmethod
    def _encode_categoricals(X, X_test):
        """Integer-encode the object columns of ``X`` and reuse that mapping on ``X_test``.

        The categories are learned from the training frame only, and the test
        frame is cast to the very same categorical dtype, so a given string
        always maps to the same integer code in both frames. Values that are
        missing, or that appear only in the test frame, are encoded as -1.

        Args:
            X (pd.DataFrame): Training features.
            X_test (pd.DataFrame): Test features with the same columns.

        Returns:
            tuple: Encoded copies ``(X, X_test)``; the inputs are not modified.
        """
        X = X.copy()
        X_test = X_test.copy()
        for col in X.select_dtypes(include=['object']).columns:
            train_values = X[col].astype('category')
            X[col] = train_values.cat.codes
            # Cast with the training dtype so the codes line up across frames.
            X_test[col] = X_test[col].astype(train_values.dtype).cat.codes
        return X, X_test

    def optimize_model(self, X_train, y_train, X_val, y_val):
        """Grid-search XGBoost hyperparameters and score each set by validation AUC.

        Args:
            X_train (pd.DataFrame): Training feature data.
            y_train (pd.Series): Training target data.
            X_val (pd.DataFrame): Validation feature data.
            y_val (pd.Series): Validation target data.

        Returns:
            tuple: ``(best_params, results)`` where ``best_params`` is the
            parameter dict with the highest validation AUC and ``results`` is
            a list of ``{'params': ..., 'score': ...}`` dicts, one per
            combination, in evaluation order.
        """
        # Local import keeps this class self-contained within its notebook cell.
        from itertools import product

        print("\nOptimizing XGBoost on CPU...")
        start_time = time.time()

        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)

        param_grid = {
            'max_depth': [3, 6, 9],
            'learning_rate': [0.01, 0.1, 0.3],
            'n_estimators': [100, 200, 500],
            'min_child_weight': [1, 3, 5],
            'subsample': [0.8, 0.9, 1.0]
        }
        fixed_params = {
            'tree_method': 'hist',
            'objective': 'binary:logistic',
            'eval_metric': 'auc'
        }

        # product() varies the last grid key fastest, i.e. the same order as
        # nesting one loop per key in dictionary order.
        grid_names = list(param_grid)
        combinations = list(product(*param_grid.values()))
        total_combinations = len(combinations)
        print(f"Total parameter combinations to try: {total_combinations}")

        # Start below any attainable score so the first combination always wins.
        best_score = float('-inf')
        best_params = None
        results = []

        for current_combination, values in enumerate(combinations, start=1):
            combination_start_time = time.time()

            params = {**dict(zip(grid_names, values)), **fixed_params}

            # 'n_estimators' is a scikit-learn wrapper name; xgb.train takes the
            # round count through num_boost_round instead. The seed is added so
            # that row subsampling follows the pipeline's random_state.
            booster_params = {
                name: value for name, value in params.items()
                if name != 'n_estimators'
            }
            booster_params['seed'] = self.random_state

            model = xgb.train(
                booster_params,
                dtrain,
                num_boost_round=params['n_estimators'],
                evals=[(dval, 'eval')],
                early_stopping_rounds=self.EARLY_STOPPING_ROUNDS,
                verbose_eval=False
            )

            score = model.best_score
            results.append({
                'params': params,
                'score': score
            })

            combination_time = time.time() - combination_start_time
            print(f"\nCombination {current_combination}/{total_combinations}")
            print(f"Parameters: {params}")
            print(f"Score: {score:.4f}")
            print(f"Time taken: {combination_time:.2f} seconds")

            if score > best_score:
                best_score = score
                best_params = params
                print("New best score found!")

        total_time = time.time() - start_time
        print(f"\nTotal optimization time: {total_time:.2f} seconds")
        return best_params, results

    def run(self, train_path, test_path, submission_path=None, target_column='Depression'):
        """Execute the complete pipeline from CSV files to (optional) submission.

        Args:
            train_path (str): Path to the training CSV file.
            test_path (str): Path to the test CSV file.
            submission_path (str, optional): Where to save the submission CSV.
                Nothing is written when this is None. Defaults to None.
            target_column (str, optional): Name of the target column, also used
                as the prediction column of the submission. Defaults to 'Depression'.

        Returns:
            dict: ``'model'`` (the fitted classifier), ``'best_params'`` and
            ``'optimization_results'``.
        """
        print("Loading and preparing data...")
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)

        X = train_df.drop([target_column, 'id', 'Name'], axis=1)
        y = train_df[target_column]
        X_test = test_df.drop(['id', 'Name'], axis=1)

        # Encode train and test with one shared mapping per column.
        X, X_test = self._encode_categoricals(X, X_test)

        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=self.random_state
        )

        best_params, optimization_results = self.optimize_model(X_train, y_train, X_val, y_val)

        print("\nTraining final model with best parameters...")
        # Early stopping is configured on the estimator: passing it to fit() is
        # deprecated since XGBoost 1.6 and rejected by newer releases.
        final_model = xgb.XGBClassifier(
            **best_params,
            early_stopping_rounds=self.EARLY_STOPPING_ROUNDS,
            random_state=self.random_state
        )

        final_model.fit(
            X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            verbose=False
        )

        val_predictions = final_model.predict(X_val)
        validation_accuracy = accuracy_score(y_val, val_predictions)
        full_report = classification_report(y_val, val_predictions)

        print("\nValidation Results:")
        print(full_report)

        print("\n" + "="*50)
        print("BEST MODEL SUMMARY")
        print("="*50)
        print("\nBest Parameters:")
        for param, value in best_params.items():
            # Only the tuned values are of interest, not the fixed settings.
            if param not in ['tree_method', 'objective', 'eval_metric']:
                print(f"{param}: {value}")

        print(f"\nBest Model Performance:")
        print(f"Accuracy: {validation_accuracy:.4f}")

        feature_importance = pd.DataFrame({
            'feature': X.columns,
            'importance': final_model.feature_importances_
        })
        feature_importance = feature_importance.sort_values('importance', ascending=False)

        print("\nTop 10 Most Important Features:")
        for _, row in feature_importance.head(10).iterrows():
            print(f"{row['feature']}: {row['importance']:.4f}")

        print("\n" + "="*50)

        if submission_path:
            test_predictions = final_model.predict(X_test)
            submission_df = pd.DataFrame({
                'id': test_df['id'],
                target_column: test_predictions
            })
            submission_df.to_csv(submission_path, index=False)
            print(f"\nSubmission file saved to: {submission_path}")

        return {
            'model': final_model,
            'best_params': best_params,
            'optimization_results': optimization_results
        }

if __name__ == "__main__":
    # Collect the run arguments first, then build the CPU pipeline and execute it.
    run_arguments = dict(
        train_path='/kaggle/input/playground-series-s4e11/train.csv',
        test_path='/kaggle/input/playground-series-s4e11/test.csv',
        submission_path='submission.csv',
    )
    pipeline = XGBoostCPUPipeline(random_state=42)
    results = pipeline.run(**run_arguments)


Loading and preparing data...

Optimizing XGBoost on CPU...
Total parameter combinations to try: 243

Combination 1/243
Parameters: {'max_depth': 3, 'learning_rate': 0.01, 'n_estimators': 100, 'min_child_weight': 1, 'subsample': 0.8, 'tree_method': 'hist', 'objective': 'binary:logistic', 'eval_metric': 'auc'}
Score: 0.9599
Time taken: 0.96 seconds
New best score found!

Combination 2/243
Parameters: {'max_depth': 3, 'learning_rate': 0.01, 'n_estimators': 100, 'min_child_weight': 1, 'subsample': 0.9, 'tree_method': 'hist', 'objective': 'binary:logistic', 'eval_metric': 'auc'}
Score: 0.9599
Time taken: 0.85 seconds
New best score found!

Combination 3/243
Parameters: {'max_depth': 3, 'learning_rate': 0.01, 'n_estimators': 100, 'min_child_weight': 1, 'subsample': 1.0, 'tree_method': 'hist', 'objective': 'binary:logistic', 'eval_metric': 'auc'}
Score: 0.9598
Time taken: 0.83 seconds

Combination 4/243
Parameters: {'max_depth': 3, 'learning_rate': 0.01, 'n_estimators': 100, 'min_child_weigh

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
import time
import warnings

warnings.filterwarnings("ignore")  # Suppress all warnings for cleaner output

class XGBoostGPUPipeline:
    """GPU-accelerated XGBoost pipeline for depression prediction.

    ``run`` reads the train/test CSV files, integer-encodes object columns,
    holds out 20% of the training rows for validation, grid-searches a fixed
    hyperparameter grid with ``xgb.train`` (scored by validation AUC), refits
    an ``XGBClassifier`` with the winning parameters and optionally writes a
    submission CSV.
    """

    # Number of rounds without validation improvement before training stops.
    EARLY_STOPPING_ROUNDS = 50

    def __init__(self, random_state=42, gpu_device=0):
        """
        Initialize the XGBoostGPUPipeline with random state and GPU device.

        Args:
            random_state (int, optional): Seed for reproducibility. Defaults to 42.
            gpu_device (int, optional): GPU device ID to use. Defaults to 0.
        """
        self.random_state = random_state  # Seed for the split, the search and the final model
        self.gpu_device = gpu_device      # Passed to XGBoost as 'gpu_id'

    @staticmethod
    def _encode_categoricals(X, X_test):
        """
        Integer-encode the object columns of ``X`` and reuse that mapping on ``X_test``.

        The categories are learned from the training frame only, and the test
        frame is cast to the very same categorical dtype, so a given string
        always maps to the same integer code in both frames. Values that are
        missing, or that appear only in the test frame, are encoded as -1.

        Args:
            X (pd.DataFrame): Training features.
            X_test (pd.DataFrame): Test features with the same columns.

        Returns:
            tuple: Encoded copies ``(X, X_test)``; the inputs are not modified.
        """
        X = X.copy()
        X_test = X_test.copy()
        for col in X.select_dtypes(include=['object']).columns:
            train_values = X[col].astype('category')
            X[col] = train_values.cat.codes
            # Cast with the training dtype so the codes line up across frames
            X_test[col] = X_test[col].astype(train_values.dtype).cat.codes
        return X, X_test

    def optimize_model(self, X_train, y_train, X_val, y_val):
        """
        Optimize XGBoost parameters using GPU acceleration.

        Args:
            X_train (pd.DataFrame): Training feature data.
            y_train (pd.Series): Training target data.
            X_val (pd.DataFrame): Validation feature data.
            y_val (pd.Series): Validation target data.

        Returns:
            tuple: ``(best_params, results)`` where ``best_params`` is the
            parameter dict with the highest validation AUC and ``results`` is
            a list of ``{'params': ..., 'score': ...}`` dicts, one per
            combination, in evaluation order.
        """
        # Local import keeps this class self-contained within its notebook cell
        from itertools import product

        print("\nOptimizing XGBoost on GPU...")
        start_time = time.time()

        # Build the DMatrix objects once; they are shared by every combination
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)

        # Grid of hyperparameters to search over
        param_grid = {
            'max_depth': [3, 6, 9],
            'learning_rate': [0.01, 0.1, 0.3],
            'n_estimators': [100, 200, 500],
            'min_child_weight': [1, 3, 5],
            'subsample': [0.8, 0.9, 1.0]
        }
        # Settings shared by every combination.
        # NOTE(review): 'gpu_hist' / 'gpu_id' are reported as deprecated by
        # recent XGBoost releases in favour of a 'device' parameter - confirm
        # against the installed version before upgrading.
        fixed_params = {
            'tree_method': 'gpu_hist',
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'gpu_id': self.gpu_device
        }

        # product() varies the last grid key fastest, i.e. the same order as
        # nesting one loop per key in dictionary order
        grid_names = list(param_grid)
        combinations = list(product(*param_grid.values()))
        total_combinations = len(combinations)
        print(f"Total parameter combinations to try: {total_combinations}")

        # Start below any attainable score so the first combination always wins
        best_score = float('-inf')
        best_params = None
        results = []

        for current_combination, values in enumerate(combinations, start=1):
            combination_start_time = time.time()

            params = {**dict(zip(grid_names, values)), **fixed_params}

            # 'n_estimators' is a scikit-learn wrapper name; xgb.train takes the
            # round count through num_boost_round instead. The seed is added so
            # that row subsampling follows the pipeline's random_state.
            booster_params = {
                name: value for name, value in params.items()
                if name != 'n_estimators'
            }
            booster_params['seed'] = self.random_state

            model = xgb.train(
                booster_params,
                dtrain,
                num_boost_round=params['n_estimators'],
                evals=[(dval, 'eval')],
                early_stopping_rounds=self.EARLY_STOPPING_ROUNDS,
                verbose_eval=False
            )

            # Best validation AUC reached during this training run
            score = model.best_score
            results.append({
                'params': params,
                'score': score
            })

            combination_time = time.time() - combination_start_time

            print(f"\nCombination {current_combination}/{total_combinations}")
            print(f"Parameters: {params}")
            print(f"Score: {score:.4f}")
            print(f"Time taken: {combination_time:.2f} seconds")

            if score > best_score:
                best_score = score
                best_params = params
                print("New best score found!")

        total_time = time.time() - start_time
        print(f"\nTotal optimization time: {total_time:.2f} seconds")

        return best_params, results

    def run(self, train_path, test_path, submission_path=None, target_column='Depression'):
        """
        Execute the complete GPU-accelerated XGBoost pipeline.

        Args:
            train_path (str): Path to the training CSV file.
            test_path (str): Path to the test CSV file.
            submission_path (str, optional): Path to save the submission CSV file.
                Nothing is written when this is None. Defaults to None.
            target_column (str, optional): Name of the target column, also used
                as the prediction column of the submission. Defaults to 'Depression'.

        Returns:
            dict: Dictionary containing the trained model, best parameters, and optimization results.
        """
        print("Loading and preparing data...")
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)

        # Separate features from the target and drop identifier columns
        X = train_df.drop([target_column, 'id', 'Name'], axis=1)
        y = train_df[target_column]
        X_test = test_df.drop(['id', 'Name'], axis=1)

        # Encode train and test with one shared mapping per column
        X, X_test = self._encode_categoricals(X, X_test)

        # Hold out 20% of the training rows for validation
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=self.random_state
        )

        best_params, optimization_results = self.optimize_model(X_train, y_train, X_val, y_val)

        print("\nTraining final model with best parameters...")
        # Early stopping is configured on the estimator: passing it to fit() is
        # deprecated since XGBoost 1.6 and rejected by newer releases
        final_model = xgb.XGBClassifier(
            **best_params,
            early_stopping_rounds=self.EARLY_STOPPING_ROUNDS,
            random_state=self.random_state
        )

        final_model.fit(
            X_train,
            y_train,
            eval_set=[(X_val, y_val)],  # Validation set drives early stopping
            verbose=False
        )

        val_predictions = final_model.predict(X_val)

        validation_accuracy = accuracy_score(y_val, val_predictions)
        full_report = classification_report(y_val, val_predictions)

        print("\nValidation Results:")
        print(full_report)

        print("\n" + "="*50)
        print("BEST MODEL SUMMARY")
        print("="*50)
        print("\nBest Parameters:")
        for param, value in best_params.items():
            # Only the tuned values are of interest, not the fixed settings
            if param not in ['tree_method', 'objective', 'eval_metric', 'gpu_id']:
                print(f"{param}: {value}")

        # Best validation AUC seen during the grid search; it belongs to
        # best_params because that dict is the one with the highest score
        best_auc = max(entry['score'] for entry in optimization_results)

        print(f"\nBest Model Performance:")
        print(f"Accuracy: {validation_accuracy:.4f}")
        print(f"AUC Score: {best_auc:.4f}")

        feature_importance = pd.DataFrame({
            'feature': X.columns,
            'importance': final_model.feature_importances_
        })
        feature_importance = feature_importance.sort_values('importance', ascending=False)

        print("\nTop 10 Most Important Features:")
        for _, row in feature_importance.head(10).iterrows():
            print(f"{row['feature']}: {row['importance']:.4f}")

        print("\n" + "="*50)

        # Write the submission file only when a path was provided
        if submission_path:
            test_predictions = final_model.predict(X_test)
            submission_df = pd.DataFrame({
                'id': test_df['id'],
                target_column: test_predictions
            })
            submission_df.to_csv(submission_path, index=False)
            print(f"\nSubmission file saved to: {submission_path}")

        return {
            'model': final_model,
            'best_params': best_params,
            'optimization_results': optimization_results
        }

# Example usage
if __name__ == "__main__":
    # Collect the run arguments first: training data, test data and submission target
    run_arguments = dict(
        train_path='/kaggle/input/playground-series-s4e11/train.csv',
        test_path='/kaggle/input/playground-series-s4e11/test.csv',
        submission_path='submission.csv',
    )

    # Build the GPU pipeline on device 0 with a fixed seed, then execute it
    pipeline = XGBoostGPUPipeline(random_state=42, gpu_device=0)
    results = pipeline.run(**run_arguments)

Loading and preparing data...

Optimizing XGBoost on GPU...
Total parameter combinations to try: 243

Combination 1/243
Parameters: {'max_depth': 3, 'learning_rate': 0.01, 'n_estimators': 100, 'min_child_weight': 1, 'subsample': 0.8, 'tree_method': 'gpu_hist', 'objective': 'binary:logistic', 'eval_metric': 'auc', 'gpu_id': 0}
Score: 0.9600
Time taken: 0.65 seconds
New best score found!

Combination 2/243
Parameters: {'max_depth': 3, 'learning_rate': 0.01, 'n_estimators': 100, 'min_child_weight': 1, 'subsample': 0.9, 'tree_method': 'gpu_hist', 'objective': 'binary:logistic', 'eval_metric': 'auc', 'gpu_id': 0}
Score: 0.9599
Time taken: 0.23 seconds

Combination 3/243
Parameters: {'max_depth': 3, 'learning_rate': 0.01, 'n_estimators': 100, 'min_child_weight': 1, 'subsample': 1.0, 'tree_method': 'gpu_hist', 'objective': 'binary:logistic', 'eval_metric': 'auc', 'gpu_id': 0}
Score: 0.9598
Time taken: 0.22 seconds

Combination 4/243
Parameters: {'max_depth': 3, 'learning_rate': 0.01, 'n_estim