In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Load the data using pandas
file_path = 'cluster_new_raw_test.csv'
df = pd.read_csv(file_path)

# Drop the identifier column (assignment instead of inplace=True keeps the step chainable)
df = df.drop(columns=['id'])

# Define target column and features
target_column = 'Cluster'
X = df.drop(columns=[target_column])
y = df[target_column]

# Split BEFORE scaling: fitting the scaler on the full dataset would leak
# test-set means/variances into the training features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Fit the scaler on the training rows only, then apply the same transform to the test rows.
# Note: `X` itself is left unscaled; only X_train / X_test are standardized (NumPy arrays).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Co-training setup: split the columns into two views at the midpoint.
# With an odd number of features the second view gets the extra column.
n_view_1_features = X_train.shape[1] // 2

X_train_1 = X_train[:, :n_view_1_features]  # First half of features
X_train_2 = X_train[:, n_view_1_features:]  # Second half of features

# Split the test set with the same column boundary
X_test_1 = X_test[:, :n_view_1_features]  # First half of features
X_test_2 = X_test[:, n_view_1_features:]  # Second half of features

# Hyperparameter search space handed to the grid search.
# The candidate counts multiply out to 3*3*3*3*2*2*2*2*2*2 = 5184 combinations.
param_grid = dict(
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[50, 100, 150],
    max_depth=[3, 5, 7],
    min_child_weight=[1, 3, 5],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    gamma=[0, 0.1],
    reg_alpha=[0, 0.01],
    reg_lambda=[1, 0.1],
    scale_pos_weight=[1, 10],
)

# Both co-training learners start from the same configuration, so it is
# declared once and unpacked into two independent classifier instances.
# NOTE(review): `use_label_encoder`, `tree_method='gpu_hist'` and `predictor`
# are deprecated or removed in newer XGBoost releases — confirm against the
# installed version.
shared_xgb_kwargs = dict(
    objective='binary:logistic',  # binary classification objective
    tree_method='gpu_hist',       # GPU histogram tree builder
    use_label_encoder=False,
    verbosity=1,
    predictor='gpu_predictor',
)

model_1 = XGBClassifier(**shared_xgb_kwargs)
model_2 = XGBClassifier(**shared_xgb_kwargs)

# Co-training with per-view hyperparameter tuning and batched (incremental) boosting
def _tune_on_view(model, X_view, y, param_grid, label):
    """Grid-search `model` on one feature view and return the best estimator found."""
    search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=1)
    search.fit(X_view, y)
    print(f"Best Hyperparameters for {label}: {search.best_params_}")
    return search.best_estimator_


def co_train(X_train_1, X_train_2, y_train, model_1, model_2, param_grid, n_iterations=10, batch_size=500):
    """Tune and iteratively retrain two classifiers, one per feature view.

    Parameters
    ----------
    X_train_1, X_train_2 : array-like
        The two feature views of the same training rows (row i of each view
        must describe the same sample).
    y_train : array-like
        Labels aligned row-wise with both views.
    model_1, model_2 : estimator
        Base estimators for view 1 and view 2; each is tuned with a 3-fold
        accuracy grid search over `param_grid`.
    param_grid : dict
        Search space passed unchanged to both grid searches.
    n_iterations : int, default 10
        Number of retrain/filter rounds.
    batch_size : int, default 500
        Rows per training batch. Must be >= 1.

    Returns
    -------
    tuple
        (best_model_1, best_model_2, X_train_1, X_train_2, y_train) where the
        three data arrays are the labeled set remaining after the last round.

    Raises
    ------
    ValueError
        If `batch_size` is smaller than 1.

    Notes
    -----
    * Each model is trained on, and predicts from, its OWN view; a model
      fitted on view-1 columns cannot meaningfully score view-2 columns.
    * Within an iteration the first batch trains from scratch and every later
      batch continues boosting from the previous booster (``xgb_model=``).
      A plain ``fit`` per batch would discard everything but the last batch.
      Each batch therefore adds another ``n_estimators`` boosting rounds.
    * NOTE(review): "confident" here means "prediction equals the known
      label", i.e. this filters an already-labeled set rather than
      pseudo-labeling unlabeled data — confirm this is the intended scheme.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Work on plain arrays so slicing and boolean masks are purely positional,
    # regardless of any pandas index carried by the inputs.
    X_train_1 = np.asarray(X_train_1)
    X_train_2 = np.asarray(X_train_2)
    y_train = np.asarray(y_train)

    best_model_1 = _tune_on_view(model_1, X_train_1, y_train, param_grid, "Model 1")
    best_model_2 = _tune_on_view(model_2, X_train_2, y_train, param_grid, "Model 2")

    # Co-training loop with batch processing
    for iteration in range(n_iterations):
        print(f"Iteration {iteration + 1}/{n_iterations}")

        # The labeled set is resized every round, so batch boundaries must be
        # recomputed from its current length rather than the initial one.
        n_samples = len(y_train)
        if n_samples == 0:
            print("No labeled samples left; stopping co-training early.")
            break

        # None => the first batch of this round trains from scratch.
        booster_1 = None
        booster_2 = None
        for start_idx in tqdm(range(0, n_samples, batch_size), desc="Training Batches", unit="batch"):
            batch = slice(start_idx, start_idx + batch_size)  # slicing clamps at n_samples

            # Continue boosting from the previous batch instead of refitting from zero.
            best_model_1.fit(X_train_1[batch], y_train[batch], xgb_model=booster_1)
            best_model_2.fit(X_train_2[batch], y_train[batch], xgb_model=booster_2)
            booster_1 = best_model_1.get_booster()
            booster_2 = best_model_2.get_booster()

        # Each model predicts from the view it was trained on.
        pred_1 = best_model_1.predict(X_train_1)
        pred_2 = best_model_2.predict(X_train_2)

        # Keep every sample that at least one model labels correctly. Using the
        # union (instead of stacking both selections) keeps each row once, so
        # the set cannot double in size from one round to the next.
        keep = (pred_1 == y_train) | (pred_2 == y_train)

        X_train_1 = X_train_1[keep]
        X_train_2 = X_train_2[keep]
        y_train = y_train[keep]

    return best_model_1, best_model_2, X_train_1, X_train_2, y_train

# Train the models with co-training, batch processing, and hyperparameter tuning
model_1, model_2, X_train_1, X_train_2, y_train = co_train(X_train_1, X_train_2, y_train, model_1, model_2, param_grid)

# Evaluate each model on the test view that matches the columns it was trained on
y_pred_1 = model_1.predict(X_test_1)  # Model 1 was fitted on the first feature view
y_pred_2 = model_2.predict(X_test_2)  # Model 2 was fitted on the second feature view

# Two voters cannot form a majority, and emitting 1 whenever they merely agree
# mislabels every sample on which both predict 0. Use soft voting instead:
# average the class probabilities and pick the most probable class.
proba_1 = model_1.predict_proba(X_test_1)
proba_2 = model_2.predict_proba(X_test_2)
mean_proba = (proba_1 + proba_2) / 2
# assumes both models expose the same `classes_` ordering (they are fitted on the same labels)
final_pred = model_1.classes_[np.argmax(mean_proba, axis=1)]

# Evaluate the final accuracy
accuracy = accuracy_score(y_test, final_pred)
print(f"Final Model Accuracy after Co-training, Batch Processing, and Hyperparameter Tuning: {accuracy:.4f}")


Fitting 3 folds for each of 5184 candidates, totalling 15552 fits


KeyboardInterrupt: 