In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.over_sampling import RandomOverSampler
from xgboost import XGBClassifier

# Load the raw dataset (path looks like a Google Colab location — adjust when running elsewhere).
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
data = pd.read_csv('/content/data.csv', low_memory=False)

# Function to drop rows whose '*_mut' category is rarer than a frequency threshold
def remove_infrequent_categories(data, threshold=0.05):
    """Return a copy of ``data`` without rows holding rare '*_mut' categories.

    Only object-dtype columns whose name ends in ``_mut`` are inspected. The
    columns are processed one after another, and each column's frequencies are
    measured on the rows that survived the previous columns. A value is kept
    when its share of the current rows is strictly greater than ``threshold``.
    Missing values never appear in ``value_counts`` and are therefore removed.

    Args:
        data: Input DataFrame; it is not modified.
        threshold: Minimum relative frequency (exclusive) a value must have.

    Returns:
        The filtered DataFrame.
    """
    result = data.copy()
    mutation_columns = [
        name
        for name in result.select_dtypes(include='object').columns
        if name.endswith('_mut')
    ]

    for name in mutation_columns:
        frequencies = result[name].value_counts() / len(result)
        frequent_values = frequencies.index[frequencies > threshold]
        result = result.loc[result[name].isin(frequent_values)]
    return result

def drop_single_class_columns(df):
    """Return ``df`` without the columns that hold exactly one distinct value.

    ``DataFrame.nunique`` ignores missing values, so a column containing one
    value plus NaNs is dropped, while an all-NaN column (zero distinct values)
    is kept. The input frame is not modified.
    """
    constant_columns = [name for name, count in df.nunique().items() if count == 1]
    return df.drop(columns=constant_columns)

# Function to one-hot encode specified categorical columns
def one_hot_encode_columns(df, columns, encoder, isTrain):
    """One-hot encode ``columns`` of ``df`` and append the indicator columns.

    Args:
        df: DataFrame containing every name listed in ``columns``.
        columns: List of column names to encode.
        encoder: Fitted encoder exposing ``transform`` and
            ``get_feature_names_out``. Ignored and replaced when ``isTrain``
            is true.
        isTrain: When true, a new ``OneHotEncoder`` is fitted on ``df[columns]``;
            otherwise the supplied ``encoder`` is applied as is.

    Returns:
        Tuple ``(frame, encoder)``: ``frame`` is ``df`` with ``columns`` removed
        and the indicator columns added; ``encoder`` is the encoder that was used.

    Raises:
        ValueError: If ``isTrain`` is false and ``encoder`` is None.
    """
    if isTrain:
        encoder = OneHotEncoder(sparse_output=False)
        encoded_data = encoder.fit_transform(df[columns])
    else:
        if encoder is None:
            raise ValueError("a fitted encoder is required when isTrain is False")
        encoded_data = encoder.transform(df[columns])

    # Reuse df's index: join() aligns on index labels, so a default RangeIndex
    # here would misalign rows (and introduce NaNs) for any non-default index.
    encoded_df = pd.DataFrame(
        encoded_data,
        columns=encoder.get_feature_names_out(columns),
        index=df.index,
    )
    return df.drop(columns=columns).join(encoded_df), encoder

# Main preprocessing function
def data_preprocess(df, encoder=None, isTrain=True, label_encoder=None):
    """Clean, impute and encode the dataset and extract the targets.

    Steps, in order: drop single-valued columns, impute numeric columns
    (mean for the four named columns, median for the rest), impute categorical
    columns with their mode, one-hot encode the named categorical columns,
    map 'cellularity' and 'her2_status_measured_by_snp6' to integers (rows
    whose her2 value is 'UNDEF' are dropped), and label-encode 'cancer_type'.

    Args:
        df: Raw DataFrame; must contain 'cancer_type' and the hard-coded
            numerical and categorical columns.
        encoder: Fitted ``OneHotEncoder`` from a previous training call.
            Required when ``isTrain`` is false; ignored otherwise.
        isTrain: When true, new encoders are fitted on ``df``.
        label_encoder: Fitted ``LabelEncoder`` from a previous training call.
            Required when ``isTrain`` is false; ignored otherwise.

    Returns:
        Tuple ``(features, y_encoded, y_binary, label_encoder, encoder)`` where
        ``y_encoded`` holds the integer class of 'cancer_type' and ``y_binary``
        is 1 where that class is 0 (the first class of the label encoder).

    Raises:
        ValueError: If ``isTrain`` is false and a fitted encoder is missing.
    """
    if not isTrain and (encoder is None or label_encoder is None):
        raise ValueError(
            "isTrain=False requires the fitted encoder and label_encoder "
            "returned by the training call"
        )

    df = drop_single_class_columns(df)

    # Fill missing values for the main numerical columns with the column mean
    numerical_columns = ['neoplasm_histologic_grade', 'mutation_count', 'tumor_size', 'tumor_stage']
    df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].mean())

    # Fill missing values of every other numerical column with the column median
    other_numerical_columns = df.select_dtypes(include=[np.number]).columns.difference(numerical_columns)
    df[other_numerical_columns] = df[other_numerical_columns].fillna(df[other_numerical_columns].median())

    # Fill missing values of the categorical columns with the most frequent value
    categorical_columns = ['pr_status', 'pam50_+_claudin-low_subtype', 'primary_tumor_laterality', 'inferred_menopausal_state', 'her2_status', 'er_status', 'er_status_measured_by_ihc', '3-gene_classifier_subtype', 'death_from_cancer']
    for column in categorical_columns:
        df[column] = df[column].fillna(df[column].mode()[0])

    # Same treatment for every remaining object/category column
    other_categorical_columns = df.select_dtypes(include=['object', 'category']).columns.difference(categorical_columns)
    for column in other_categorical_columns:
        df[column] = df[column].fillna(df[column].mode()[0])

    # One-hot encode categorical variables
    if isTrain:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        encoded_data = encoder.fit_transform(df[categorical_columns])
    else:
        encoded_data = encoder.transform(df[categorical_columns])
    # Reuse df's index so join() lines rows up even when df is not indexed 0..n-1
    encoded_df = pd.DataFrame(
        encoded_data,
        columns=encoder.get_feature_names_out(categorical_columns),
        index=df.index,
    )
    df = df.drop(columns=categorical_columns).join(encoded_df)

    # Handling 'cellularity' with predefined mapping and filling NaNs
    if 'cellularity' in df.columns:
        mapping = {
            'Low': 1,
            'Moderate': 2,
            'High': 3,
        }
        # Unmapped values fall back to 2 ('Moderate')
        df['cellularity'] = df['cellularity'].str.strip().map(mapping).fillna(2)

    # Handling 'her2_status_measured_by_snp6' with predefined mapping and dropping rows with 'UNDEF'
    if 'her2_status_measured_by_snp6' in df.columns:
        her2_mapping = {
            'LOSS': -1,
            'NEUTRAL': 0,
            'GAIN': 1,
        }
        # Strip before comparing so padded values such as 'UNDEF ' are dropped too
        her2_status = df['her2_status_measured_by_snp6'].str.strip()
        keep_rows = her2_status != 'UNDEF'
        # Copy the filtered frame so the assignment below does not write into a slice
        df = df.loc[keep_rows].copy()
        df['her2_status_measured_by_snp6'] = her2_status.loc[keep_rows].map(her2_mapping)

    # Label encoding for the target variable (after row filtering, so it stays aligned)
    y = df.pop("cancer_type")
    if isTrain:
        label_encoder = LabelEncoder()
        y_encoded = label_encoder.fit_transform(y)
    else:
        # Reuse the training encoder; a fresh LabelEncoder cannot transform
        y_encoded = label_encoder.transform(y)
    y_binary = (y_encoded == 0).astype(int)

    return df, y_encoded, y_binary, label_encoder, encoder

# Preprocess a copy of the raw data: feature frame, encoded targets and fitted encoders
X, y, y_binary, label_encoder, encoder = data_preprocess(data.copy(), isTrain=True)

# First split only: 60% training portion, 40% held out in the *_temp variables
(
    X_train, X_temp,
    y_train, y_temp,
    y_train_binary, y_temp_binary,
) = train_test_split(X, y, y_binary, test_size=0.4, random_state=42)

# Train binary classification model
# Carve a 20% evaluation set out of the training portion
X_train_binary, X_test_binary, y_train_binary, y_test_binary = train_test_split(X_train, y_train_binary, test_size=0.2, random_state=42)

# Apply Random Over Sampler to the training data only
ros = RandomOverSampler(random_state=23)
X_train_ros, y_train_ros = ros.fit_resample(X_train_binary, y_train_binary)

# Ratio of negative to positive rows in the training data.
# NOTE: this is measured after oversampling, so the classes are already balanced.
scale_pos_weight = sum(y_train_ros == 0) / sum(y_train_ros == 1)

# Convert all columns of type 'object' to 'category'. The test frame reuses the
# training category dtypes so every level gets the same integer code in both
# DMatrix objects; levels unseen in training become missing values.
categorical_columns = X_train_ros.select_dtypes(include=['object']).columns
X_train_ros = X_train_ros.astype({column: 'category' for column in categorical_columns})
category_dtypes = X_train_ros.dtypes[categorical_columns].to_dict()
X_test_binary = X_test_binary.astype(category_dtypes)

# Create DMatrix for train and test sets
dtrain_binary = xgb.DMatrix(X_train_ros, label=y_train_ros, enable_categorical=True)
dtest_binary = xgb.DMatrix(X_test_binary, label=y_test_binary, enable_categorical=True)

# Specify parameters for XGBoost ('num_class' is omitted: it only applies to multi:* objectives)
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'rmse',
    'device': 'cuda',
    'learning_rate': 0.1,
    'max_depth': 6,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'lambda': 1,
    'alpha': 0.01,
    'scale_pos_weight': scale_pos_weight,  # Applying class weight
    'num_parallel_tree': 5,
}

# Train the model
binary_model = xgb.train(params, dtrain_binary, num_boost_round=100)

# Make predictions: binary:logistic returns probabilities, thresholded at 0.5
y_pred_binary = binary_model.predict(dtest_binary)
y_pred_binary = (y_pred_binary > 0.5).astype(int)

# Evaluate model
print("Binary Model Accuracy:", accuracy_score(y_test_binary, y_pred_binary))
print("Binary Model Confusion Matrix:\n", confusion_matrix(y_test_binary, y_pred_binary))
print("Binary Model Classification Report:\n", classification_report(y_test_binary, y_pred_binary))

# Prepare and train multi-class model (target: 'cellularity')
# Re-run preprocessing to get a fresh feature frame
processed_data, y_encoded, y_binary, label_encoder, encoder = data_preprocess(data.copy(), isTrain=True)

# data_preprocess can drop rows; rebuild a 0..n-1 index because the encoded
# target below is a NumPy array that is looked up by position
processed_data.reset_index(drop=True, inplace=True)

if 'cellularity' not in processed_data.columns:
    raise KeyError("'cellularity' column is required as the multi-class target")

# Ensure no NaN values or unexpected categories before encoding
print("Unique values in cellularity before encoding:", processed_data['cellularity'].unique())

# Use 'cellularity' as the target and encode it to 0..k-1 as XGBoost expects
y = processed_data['cellularity'].copy()
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
print("Classes found by LabelEncoder:", label_encoder.classes_)

# Filter data to the known cellularity levels
filtered_data = processed_data[processed_data['cellularity'].isin([1, 2, 3])]
filtered_y_encoded = y_encoded[filtered_data.index]

# Confirming the filtered targets
print("Unique y values after filtering:", np.unique(filtered_y_encoded))

# The target must not stay among the features, otherwise the model just reads the answer
X_multi = filtered_data.drop(columns=['cellularity'])

# Proceed with train-test split and resampling
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    X_multi, filtered_y_encoded, test_size=0.2, random_state=42)

# Resample the training data
ros_multi = RandomOverSampler(random_state=42)
X_train_multi_ros, y_train_multi_ros = ros_multi.fit_resample(X_train_multi, y_train_multi)

# Recheck the unique values in y_train_multi_ros
print("Unique y values in training data after resample:", np.unique(y_train_multi_ros))

# One-vs-rest class ratios, kept for inspection only (not passed to XGBoost)
scale_pos_weight_multi = {
    class_label: sum(y_train_multi_ros == class_label) / sum(y_train_multi_ros != class_label)
    for class_label in np.unique(y_train_multi_ros)
}

# Convert object columns to category type; the test frame reuses the training
# category dtypes so category codes agree between the two DMatrix objects
object_columns = X_train_multi_ros.select_dtypes(include=['object']).columns
X_train_multi_ros = X_train_multi_ros.astype({column: 'category' for column in object_columns})
category_dtypes = X_train_multi_ros.dtypes[object_columns].to_dict()
X_test_multi = X_test_multi.astype(category_dtypes)

# Create DMatrix for train and test sets
dtrain_multi = xgb.DMatrix(X_train_multi_ros, label=y_train_multi_ros, enable_categorical=True)
dtest_multi = xgb.DMatrix(X_test_multi, label=y_test_multi, enable_categorical=True)

print(np.unique(y_train_multi_ros))

# Specify parameters
params_multi = {
    'objective': 'multi:softmax',
    'num_class': len(label_encoder.classes_),
    'eval_metric': 'mlogloss',
    'device': 'cuda',
    'learning_rate': 0.1,
    'max_depth': 15,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'lambda': 1.5,
    'alpha': 0.01,
    'num_parallel_tree': 5,
}

# Train the model
multi_class_model = xgb.train(params_multi, dtrain_multi, num_boost_round=100)

# Make predictions (multi:softmax returns the predicted class index directly)
y_pred_multi = multi_class_model.predict(dtest_multi)

# Evaluate model
print("Multi-Class Model Accuracy:", accuracy_score(y_test_multi, y_pred_multi))
print("Multi-Class Model Confusion Matrix:\n", confusion_matrix(y_test_multi, y_pred_multi))
print("Multi-Class Model Classification Report:\n", classification_report(y_test_multi, y_pred_multi))

In [None]:
# Loading the necessary libraries 
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.over_sampling import RandomOverSampler
from xgboost import XGBClassifier
from xgboost.callback import TrainingCallback

In [None]:
# Section 1 - Data Preprocessing
# Load the raw dataset (path looks like a Google Colab location — adjust when running elsewhere).
# low_memory=False lets pandas infer each column's dtype from the whole file.
data = pd.read_csv('/content/data.csv', low_memory=False)

# Function to drop rows whose '*_mut' category is rarer than a frequency threshold
def remove_infrequent_categories(data, threshold=0.05):
    """Return a copy of ``data`` without rows holding rare '*_mut' categories.

    Only object-dtype columns whose name ends in ``_mut`` are inspected. The
    columns are processed one after another, and each column's frequencies are
    measured on the rows that survived the previous columns. A value is kept
    when its share of the current rows is strictly greater than ``threshold``.
    Missing values never appear in ``value_counts`` and are therefore removed.

    Args:
        data: Input DataFrame; it is not modified.
        threshold: Minimum relative frequency (exclusive) a value must have.

    Returns:
        The filtered DataFrame.
    """
    result = data.copy()
    mutation_columns = [
        name
        for name in result.select_dtypes(include='object').columns
        if name.endswith('_mut')
    ]

    for name in mutation_columns:
        frequencies = result[name].value_counts() / len(result)
        frequent_values = frequencies.index[frequencies > threshold]
        result = result.loc[result[name].isin(frequent_values)]
    return result

def drop_single_class_columns(df):
    """Return ``df`` without the columns that hold exactly one distinct value.

    ``DataFrame.nunique`` ignores missing values, so a column containing one
    value plus NaNs is dropped, while an all-NaN column (zero distinct values)
    is kept. The input frame is not modified.
    """
    constant_columns = [name for name, count in df.nunique().items() if count == 1]
    return df.drop(columns=constant_columns)

# Function to one-hot encode specified categorical columns
def one_hot_encode_columns(df, columns, encoder, isTrain):
    """One-hot encode ``columns`` of ``df`` and append the indicator columns.

    Args:
        df: DataFrame containing every name listed in ``columns``.
        columns: List of column names to encode.
        encoder: Fitted encoder exposing ``transform`` and
            ``get_feature_names_out``. Ignored and replaced when ``isTrain``
            is true.
        isTrain: When true, a new ``OneHotEncoder`` is fitted on ``df[columns]``;
            otherwise the supplied ``encoder`` is applied as is.

    Returns:
        Tuple ``(frame, encoder)``: ``frame`` is ``df`` with ``columns`` removed
        and the indicator columns added; ``encoder`` is the encoder that was used.

    Raises:
        ValueError: If ``isTrain`` is false and ``encoder`` is None.
    """
    if isTrain:
        encoder = OneHotEncoder(sparse_output=False)
        encoded_data = encoder.fit_transform(df[columns])
    else:
        if encoder is None:
            raise ValueError("a fitted encoder is required when isTrain is False")
        encoded_data = encoder.transform(df[columns])

    # Reuse df's index: join() aligns on index labels, so a default RangeIndex
    # here would misalign rows (and introduce NaNs) for any non-default index.
    encoded_df = pd.DataFrame(
        encoded_data,
        columns=encoder.get_feature_names_out(columns),
        index=df.index,
    )
    return df.drop(columns=columns).join(encoded_df), encoder

# Main preprocessing function
def data_preprocess(df, encoder=None, isTrain=True, label_encoder=None):
    """Clean, impute and encode the dataset and extract the targets.

    Steps, in order: drop single-valued columns, impute numeric columns
    (mean for the four named columns, median for the rest), impute categorical
    columns with their mode, one-hot encode the named categorical columns,
    map 'cellularity' and 'her2_status_measured_by_snp6' to integers (rows
    whose her2 value is 'UNDEF' are dropped), and label-encode 'cancer_type'.

    Args:
        df: Raw DataFrame; must contain 'cancer_type' and the hard-coded
            numerical and categorical columns.
        encoder: Fitted ``OneHotEncoder`` from a previous training call.
            Required when ``isTrain`` is false; ignored otherwise.
        isTrain: When true, new encoders are fitted on ``df``.
        label_encoder: Fitted ``LabelEncoder`` from a previous training call.
            Required when ``isTrain`` is false; ignored otherwise.

    Returns:
        Tuple ``(features, y_encoded, y_binary, label_encoder, encoder)`` where
        ``y_encoded`` holds the integer class of 'cancer_type' and ``y_binary``
        is 1 where that class is 0 (the first class of the label encoder).

    Raises:
        ValueError: If ``isTrain`` is false and a fitted encoder is missing.
    """
    if not isTrain and (encoder is None or label_encoder is None):
        raise ValueError(
            "isTrain=False requires the fitted encoder and label_encoder "
            "returned by the training call"
        )

    df = drop_single_class_columns(df)

    # Fill missing values for the main numerical columns with the column mean
    numerical_columns = ['neoplasm_histologic_grade', 'mutation_count', 'tumor_size', 'tumor_stage']
    df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].mean())

    # Fill missing values of every other numerical column with the column median
    other_numerical_columns = df.select_dtypes(include=[np.number]).columns.difference(numerical_columns)
    df[other_numerical_columns] = df[other_numerical_columns].fillna(df[other_numerical_columns].median())

    # Fill missing values of the categorical columns with the most frequent value
    categorical_columns = ['pr_status', 'pam50_+_claudin-low_subtype', 'primary_tumor_laterality', 'inferred_menopausal_state', 'her2_status', 'er_status', 'er_status_measured_by_ihc', '3-gene_classifier_subtype', 'death_from_cancer']
    for column in categorical_columns:
        df[column] = df[column].fillna(df[column].mode()[0])

    # Same treatment for every remaining object/category column
    other_categorical_columns = df.select_dtypes(include=['object', 'category']).columns.difference(categorical_columns)
    for column in other_categorical_columns:
        df[column] = df[column].fillna(df[column].mode()[0])

    # One-hot encode categorical variables
    if isTrain:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        encoded_data = encoder.fit_transform(df[categorical_columns])
    else:
        encoded_data = encoder.transform(df[categorical_columns])
    # Reuse df's index so join() lines rows up even when df is not indexed 0..n-1
    encoded_df = pd.DataFrame(
        encoded_data,
        columns=encoder.get_feature_names_out(categorical_columns),
        index=df.index,
    )
    df = df.drop(columns=categorical_columns).join(encoded_df)

    # Handling 'cellularity' with predefined mapping and filling NaNs
    if 'cellularity' in df.columns:
        mapping = {
            'Low': 1,
            'Moderate': 2,
            'High': 3,
        }
        # Unmapped values fall back to 2 ('Moderate')
        df['cellularity'] = df['cellularity'].str.strip().map(mapping).fillna(2)

    # Handling 'her2_status_measured_by_snp6' with predefined mapping and dropping rows with 'UNDEF'
    if 'her2_status_measured_by_snp6' in df.columns:
        her2_mapping = {
            'LOSS': -1,
            'NEUTRAL': 0,
            'GAIN': 1,
        }
        # Strip before comparing so padded values such as 'UNDEF ' are dropped too
        her2_status = df['her2_status_measured_by_snp6'].str.strip()
        keep_rows = her2_status != 'UNDEF'
        # Copy the filtered frame so the assignment below does not write into a slice
        df = df.loc[keep_rows].copy()
        df['her2_status_measured_by_snp6'] = her2_status.loc[keep_rows].map(her2_mapping)

    # Label encoding for the target variable (after row filtering, so it stays aligned)
    y = df.pop("cancer_type")
    if isTrain:
        label_encoder = LabelEncoder()
        y_encoded = label_encoder.fit_transform(y)
    else:
        # Reuse the training encoder; a fresh LabelEncoder cannot transform
        y_encoded = label_encoder.transform(y)
    y_binary = (y_encoded == 0).astype(int)

    return df, y_encoded, y_binary, label_encoder, encoder

# Preprocess a copy of the raw data: feature frame, encoded targets and fitted encoders
X, y, y_binary, label_encoder, encoder = data_preprocess(data.copy(), isTrain=True)

# First split only: 60% training portion, 40% held out in the *_temp variables
# (divided into test and validation halves further down)
(
    X_train, X_temp,
    y_train, y_temp,
    y_train_binary, y_temp_binary,
) = train_test_split(X, y, y_binary, test_size=0.4, random_state=42)

In [None]:
# Section 2 - Train binary classification model
# Keep 80% of the training portion for fitting.
# NOTE(review): the other 20% is discarded here — confirm that is intended.
X_train_binary, _, y_train_binary, _ = train_test_split(X_train, y_train_binary, test_size=0.2, random_state=42)
# Divide the held-out 40% evenly into test and validation sets
X_test_binary, X_val_binary, y_test_binary, y_val_binary = train_test_split(X_temp, y_temp_binary, test_size=0.5, random_state=42)

# Apply Random Over Sampler to the training data only
ros = RandomOverSampler(random_state=23)
X_train_ros, y_train_ros = ros.fit_resample(X_train_binary, y_train_binary)

# Ratio of negative to positive rows in the training data.
# NOTE: this is measured after oversampling, so the classes are already balanced.
scale_pos_weight = sum(y_train_ros == 0) / sum(y_train_ros == 1)

# Convert all columns of type 'object' to 'category'. Test and validation frames
# reuse the training category dtypes so every level gets the same integer code
# in all three DMatrix objects; levels unseen in training become missing values.
categorical_columns = X_train_ros.select_dtypes(include=['object']).columns
X_train_ros = X_train_ros.astype({column: 'category' for column in categorical_columns})
category_dtypes = X_train_ros.dtypes[categorical_columns].to_dict()
X_test_binary = X_test_binary.astype(category_dtypes)
X_val_binary = X_val_binary.astype(category_dtypes)

# Create DMatrix for train, test and validation sets
dtrain_binary = xgb.DMatrix(X_train_ros, label=y_train_ros, enable_categorical=True)
dtest_binary = xgb.DMatrix(X_test_binary, label=y_test_binary, enable_categorical=True)
dval_binary = xgb.DMatrix(X_val_binary, label=y_val_binary, enable_categorical=True)

# Specify parameters for XGBoost ('num_class' is omitted: it only applies to multi:* objectives)
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'rmse',
    'device': 'cuda',
    'learning_rate': 0.1,
    'max_depth': 6,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'lambda': 1,
    'alpha': 0.01,
    'scale_pos_weight': scale_pos_weight,  # Applying class weight
    'num_parallel_tree': 5,
}

# Callback that records the latest value of every evaluation metric after each boosting round
class EvaluationHistoryCallback(TrainingCallback):
    """Collect per-round evaluation metrics during ``xgb.train``.

    After training, ``evaluation_results`` maps ``"<dataset>-<metric>"``
    (for example ``"eval-rmse"``) to the list of values seen, one per round.
    """

    def __init__(self):
        super().__init__()
        self.evaluation_results = {}

    def after_iteration(self, model, epoch, evals_log):
        """Append the newest value of each metric; never requests a stop."""
        for dataset_name, metrics in (evals_log or {}).items():
            for metric_name, history in metrics.items():
                key = f"{dataset_name}-{metric_name}"
                # history holds every round so far; only the last entry is new
                self.evaluation_results.setdefault(key, []).append(history[-1])
        return False  # False tells XGBoost to continue training

# Initialize callback
eval_history = EvaluationHistoryCallback()

# Train the model with evaluation set and custom callback.
# Early stopping watches the last entry of eval_set ('eval').
eval_set = [(dtrain_binary, 'train'), (dval_binary, 'eval')]
binary_model = xgb.train(params, dtrain_binary, num_boost_round=100, evals=eval_set, early_stopping_rounds=10, verbose_eval=True, callbacks=[eval_history])

# Access the stored evaluation results
results = eval_history.evaluation_results
train_rmse = results.get('train-rmse', [])
eval_rmse = results.get('eval-rmse', [])
epochs = len(train_rmse)
x_axis = range(0, epochs)

# Plot learning curves
fig, ax = plt.subplots()
ax.plot(x_axis, train_rmse, label='Train RMSE')
ax.plot(x_axis, eval_rmse, label='Validation RMSE')
ax.legend()
ax.set_xlabel('Boosting round')
ax.set_ylabel('RMSE')
ax.set_title('Learning Curve for Binary Model')
plt.show()

# Make predictions with the best round only: xgb.train returns the model from
# the last round, and Booster.predict would otherwise also use the trees
# added after the best validation score.
best_rounds = binary_model.best_iteration + 1
y_pred_binary = binary_model.predict(dtest_binary, iteration_range=(0, best_rounds))
y_pred_binary = np.round(y_pred_binary)  # Convert probabilities to binary output

# Evaluate model
print("Binary Model Accuracy:", accuracy_score(y_test_binary, y_pred_binary))
print("Binary Model Confusion Matrix:\n", confusion_matrix(y_test_binary, y_pred_binary))
print("Binary Model Classification Report:\n", classification_report(y_test_binary, y_pred_binary))

In [None]:
# Section 3 - Prepare and train multi-class model (target: 'cellularity')
# Re-run preprocessing to get a fresh feature frame
processed_data, y_encoded, y_binary, label_encoder, encoder = data_preprocess(data.copy(), isTrain=True)

# data_preprocess can drop rows; rebuild a 0..n-1 index because the encoded
# target below is a NumPy array that is looked up by position
processed_data.reset_index(drop=True, inplace=True)

if 'cellularity' not in processed_data.columns:
    raise KeyError("'cellularity' column is required as the multi-class target")

# Ensure no NaN values or unexpected categories before encoding
print("Unique values in cellularity before encoding:", processed_data['cellularity'].unique())

# Use 'cellularity' as the target and encode it to 0..k-1 as XGBoost expects
y = processed_data['cellularity'].copy()
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
print("Classes found by LabelEncoder:", label_encoder.classes_)

# Filter data to the known cellularity levels
filtered_data = processed_data[processed_data['cellularity'].isin([1, 2, 3])]
filtered_y_encoded = y_encoded[filtered_data.index]

# Confirming the filtered targets
print("Unique y values after filtering:", np.unique(filtered_y_encoded))

# The target must not stay among the features, otherwise the model just reads the answer
X_multi = filtered_data.drop(columns=['cellularity'])

# 80% train, then the remaining 20% divided evenly into test and validation
X_train_multi, X_temp_multi, y_train_multi, y_temp_multi = train_test_split(X_multi, filtered_y_encoded, test_size=0.2, random_state=42)
X_test_multi, X_val_multi, y_test_multi, y_val_multi = train_test_split(X_temp_multi, y_temp_multi, test_size=0.5, random_state=42)

# Resample the training data
ros_multi = RandomOverSampler(random_state=42)
X_train_multi_ros, y_train_multi_ros = ros_multi.fit_resample(X_train_multi, y_train_multi)

# Recheck the unique values in y_train_multi_ros
print("Unique y values in training data after resample:", np.unique(y_train_multi_ros))

# One-vs-rest class ratios, kept for inspection only (not passed to XGBoost)
scale_pos_weight_multi = {
    class_label: sum(y_train_multi_ros == class_label) / sum(y_train_multi_ros != class_label)
    for class_label in np.unique(y_train_multi_ros)
}

# Convert object columns to category type; test and validation frames reuse the
# training category dtypes so category codes agree across all DMatrix objects
object_columns = X_train_multi_ros.select_dtypes(include=['object']).columns
X_train_multi_ros = X_train_multi_ros.astype({column: 'category' for column in object_columns})
category_dtypes = X_train_multi_ros.dtypes[object_columns].to_dict()
X_test_multi = X_test_multi.astype(category_dtypes)
X_val_multi = X_val_multi.astype(category_dtypes)

print(np.unique(y_train_multi_ros))

# Specify parameters
params_multi = {
    'objective': 'multi:softmax',
    'num_class': len(label_encoder.classes_),
    'eval_metric': 'mlogloss',
    'device': 'cuda',
    'learning_rate': 0.1,
    'max_depth': 15,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'lambda': 1.5,
    'alpha': 0.01,
    'num_parallel_tree': 5,
}

# Prepare DMatrix for training, validation and test
dtrain_multi = xgb.DMatrix(X_train_multi_ros, label=y_train_multi_ros, enable_categorical=True)
dval_multi = xgb.DMatrix(X_val_multi, label=y_val_multi, enable_categorical=True)
dtest_multi = xgb.DMatrix(X_test_multi, label=y_test_multi, enable_categorical=True)

# Check the sizes right before training
print("Training size:", dtrain_multi.num_row(), "Labels:", len(y_train_multi_ros))
print("Validation size:", dval_multi.num_row(), "Labels:", len(y_val_multi))

# Initialize callback
eval_history_multi = EvaluationHistoryCallback()

# Train the model with evaluation set and custom callback.
# Early stopping watches the last entry of eval_set_multi ('eval').
eval_set_multi = [(dtrain_multi, 'train'), (dval_multi, 'eval')]
multi_class_model = xgb.train(params_multi, dtrain_multi, num_boost_round=100, evals=eval_set_multi, early_stopping_rounds=10, verbose_eval=True, callbacks=[eval_history_multi])

# Access the stored evaluation results
results_multi = eval_history_multi.evaluation_results
train_mlogloss = results_multi.get('train-mlogloss', [])
eval_mlogloss = results_multi.get('eval-mlogloss', [])
epochs_multi = len(train_mlogloss)
x_axis_multi = range(0, epochs_multi)

# Plot learning curves
fig, ax = plt.subplots()
ax.plot(x_axis_multi, train_mlogloss, label='Train MLogLoss')
ax.plot(x_axis_multi, eval_mlogloss, label='Validation MLogLoss')
ax.legend()
ax.set_xlabel('Boosting round')
ax.set_ylabel('MLogLoss')
ax.set_title('Learning Curve for Multi-Class Model')
plt.show()

# Make predictions with the best round only: xgb.train returns the model from
# the last round, and Booster.predict would otherwise also use the trees
# added after the best validation score.
best_rounds_multi = multi_class_model.best_iteration + 1
y_pred_multi = multi_class_model.predict(dtest_multi, iteration_range=(0, best_rounds_multi))

# Evaluate model
print("Multi-Class Model Accuracy:", accuracy_score(y_test_multi, y_pred_multi))
print("Multi-Class Model Confusion Matrix:\n", confusion_matrix(y_test_multi, y_pred_multi))
print("Multi-Class Model Classification Report:\n", classification_report(y_test_multi, y_pred_multi))


In [None]:
# Section 4 - Tuning the first Model
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

# Candidate values for each XGBoost hyperparameter.
# The full Cartesian product is 4 * 4 * 3 * 3 * 3 * 3 * 3 * 3 = 11,664 combinations.
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 5, 10],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'lambda': [1, 1.5, 2],
    'alpha': [0, 0.1, 0.2],
}

# Function to perform manual cross-validation
def manual_cv(params, n_splits=3):
    """Return the mean ROC AUC of ``params`` over a shuffled K-fold split.

    The folds are drawn from the module-level ``X_train_ros`` / ``y_train_ros``.
    NOTE(review): that data is oversampled, so duplicated rows can land in both
    the training and the held-out part of a fold.

    Args:
        params: XGBoost booster parameters. An optional ``'num_boost_round'``
            entry sets the number of boosting rounds (default 100) and is not
            forwarded to the booster.
        n_splits: Number of folds.

    Returns:
        The mean AUC across folds.
    """
    # 'num_boost_round' is an argument of xgb.train, not a booster parameter
    booster_params = {key: value for key, value in params.items() if key != 'num_boost_round'}
    num_boost_round = params.get('num_boost_round', 100)

    # Positional indexing below must work whether the labels are an array or a Series
    labels = np.asarray(y_train_ros)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    auc_scores = []

    for train_index, test_index in kf.split(X_train_ros):
        X_train_kf, X_test_kf = X_train_ros.iloc[train_index], X_train_ros.iloc[test_index]
        y_train_kf, y_test_kf = labels[train_index], labels[test_index]

        dtrain = xgb.DMatrix(X_train_kf, label=y_train_kf, enable_categorical=True)
        dtest = xgb.DMatrix(X_test_kf, label=y_test_kf, enable_categorical=True)

        model = xgb.train(booster_params, dtrain, num_boost_round=num_boost_round)
        # AUC ranks scores, so it needs the probabilities rather than rounded labels
        y_score = model.predict(dtest)
        auc_scores.append(roc_auc_score(y_test_kf, y_score))

    return np.mean(auc_scores)

# Iterate over combinations of parameters
from itertools import product

# Boosting rounds used for every candidate and for the final model
NUM_BOOST_ROUND = 100

best_score = float('-inf')
best_params = None

for combination in product(*param_grid.values()):
    params = dict(zip(param_grid.keys(), combination))
    params['objective'] = 'binary:logistic'
    params['eval_metric'] = 'logloss'
    params['scale_pos_weight'] = scale_pos_weight  # Keep the scale_pos_weight from your original setup
    # Same GPU selection as the earlier cells ('gpu_hist' is deprecated in XGBoost 2.x)
    params['tree_method'] = 'hist'
    params['device'] = 'cuda'
    params['num_boost_round'] = NUM_BOOST_ROUND  # read by manual_cv

    current_score = manual_cv(params)
    if current_score > best_score:
        best_score = current_score
        best_params = params

if best_params is None:
    raise RuntimeError("no parameter combination produced a valid cross-validation score")

# Print best parameters and score
print("Best Score:", best_score)
print("Best Parameters:", best_params)

# Train the model with the best parameters; 'num_boost_round' is an argument
# of xgb.train, so it is kept out of the booster parameters
final_booster_params = {key: value for key, value in best_params.items() if key != 'num_boost_round'}
final_dtrain = xgb.DMatrix(X_train_ros, label=y_train_ros, enable_categorical=True)
final_model = xgb.train(final_booster_params, final_dtrain, num_boost_round=best_params['num_boost_round'])

# Evaluate the final model (enable_categorical is required for 'category' columns)
final_dtest = xgb.DMatrix(X_test_binary, label=y_test_binary, enable_categorical=True)
y_pred_final = final_model.predict(final_dtest)
y_pred_final = np.round(y_pred_final)  # Convert probabilities to binary output

print("Final Model Accuracy:", accuracy_score(y_test_binary, y_pred_final))
print("Final Model Confusion Matrix:\n", confusion_matrix(y_test_binary, y_pred_final))
print("Final Model Classification Report:\n", classification_report(y_test_binary, y_pred_final))