In [4]:
# Import required libraries
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_auc_score

print("1. Required libraries imported.")

# Step 2: read the training table from disk.
print("2. Loading data...")
data = pd.read_csv('train.csv')
print("Data loaded.")

# Step 3: separate the label column from the predictors. The 'id' and
# 'target' columns are both excluded from the predictor frame.
print("3. Extracting features and target variables...")
y = data['target']
X = data.drop(columns=['id', 'target'])
print("Features and target variables extracted.")

# Step 4: column names handed to the preprocessing step:
# num_0 .. num_37 and cat_0 .. cat_19.
print("4. Defining numeric and categorical feature names...")
numeric_features = ['num_{}'.format(i) for i in range(38)]
categorical_features = ['cat_{}'.format(i) for i in range(20)]
print("Feature names defined.")

# Step 5: hold out 20% of the rows for evaluation; the fixed seed keeps the
# split reproducible between runs.
print("5. Splitting the data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("Data split into training and testing sets.")

# Step 6: build the preprocessing applied to each group of columns.
print("6. Defining preprocessing steps...")

# Numeric columns: fill missing values with the column mean, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode. Categories not seen during fit are ignored at transform time
# instead of raising an error.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each transformer to its own list of column names.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])
print("Preprocessing steps defined.")

# Step 7: chain the preprocessing with the AdaBoost classifier so that both
# are fitted and applied together as a single estimator.
print("7. Defining the AdaBoosting model pipeline...")
adaboost_model = Pipeline([
    ('preprocessor', preprocessor),
    (
        'classifier',
        AdaBoostClassifier(n_estimators=50, algorithm='SAMME', random_state=42),
    ),
])
print("Model pipeline defined.")

# Train the model and record how long the fit takes.
# time.perf_counter() is a monotonic, high-resolution clock, so the measured
# interval is not distorted by system clock adjustments that happen while the
# fit is running. start_time / end_time are only meaningful as a difference.
print("8. Training the model...")
start_time = time.perf_counter()
adaboost_model.fit(X_train, y_train)
end_time = time.perf_counter()
training_time = end_time - start_time
print("Model training completed.")

# Make predictions: keep the probability column at index 1 for each test row.
# NOTE(review): assumes a binary target whose positive class is the second
# entry of the fitted classes_ -- confirm against the data.
print("9. Making predictions...")
y_pred_proba = adaboost_model.predict_proba(X_test)[:, 1]
print("Predictions made.")

# Evaluate the model: ROC AUC on the held-out rows, plus the fit duration.
print("10. Evaluating the model...")
auc_score = roc_auc_score(y_test, y_pred_proba)
print(f"AdaBoosting AUC: {auc_score:.4f}")
print(f"Training Time (seconds): {training_time:.4f}")



1. Required libraries imported.
2. Loading data...
Data loaded.
3. Extracting features and target variables...
Features and target variables extracted.
4. Defining numeric and categorical feature names...
Feature names defined.
5. Splitting the data into training and testing sets...
Data split into training and testing sets.
6. Defining preprocessing steps...
Preprocessing steps defined.
7. Defining the AdaBoosting model pipeline...
Model pipeline defined.
8. Training the model...
Model training completed.
9. Making predictions...
Predictions made.
10. Evaluating the model...
AdaBoosting AUC: 0.8814
Training Time (seconds): 197.2767
