In [None]:
# Import the needed libraries 
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Read the raw dataset into a DataFrame. low_memory=False makes pandas parse
# the file in one pass instead of in chunks.
data = pd.read_csv(filepath_or_buffer='/content/data.csv', low_memory=False)

In [None]:
# Function to drop rows whose '*_mut' category is rarer than a threshold
def remove_infrequent_categories(data, threshold=0.05):
    """Return a copy of ``data`` without rows holding rare ``*_mut`` categories.

    Every object-dtype column whose name ends in ``'_mut'`` is processed in
    turn. For each one, only rows whose value accounts for more than
    ``threshold`` of the rows *currently remaining* are kept, so the filters
    compound from one column to the next. Rows with a missing value in such a
    column are dropped too, because missing values never appear among the
    counted categories.

    Args:
        data: Input DataFrame; it is not modified.
        threshold: Minimum share of rows (exclusive) a category must have.

    Returns:
        The filtered DataFrame.
    """
    result = data.copy()
    mutation_columns = [
        name
        for name in result.select_dtypes(include='object').columns
        if name.endswith('_mut')
    ]

    for name in mutation_columns:
        # Share of the remaining rows taken by each category of this column.
        shares = result[name].value_counts() / len(result)
        frequent_values = shares.index[shares > threshold]
        keep_rows = result[name].isin(frequent_values)
        result = result.loc[keep_rows]
    return result

def drop_single_class_columns(df):
    """Return ``df`` without the columns that hold exactly one distinct value.

    Distinct values are counted with ``DataFrame.nunique()``, which ignores
    missing values by default: a column made of one value plus NaNs is
    dropped, while an all-NaN column (zero distinct values) is kept.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A new DataFrame with the single-valued columns removed.
    """
    constant_columns = [
        name for name, distinct in df.nunique().items() if distinct == 1
    ]
    return df.drop(columns=constant_columns)

# Function to one-hot encode specified categorical columns
def one_hot_encode_columns(df, columns, encoder, isTrain):
    """Replace ``columns`` of ``df`` with their one-hot encoded representation.

    Args:
        df: Input DataFrame; it is not modified.
        columns: List of column names to encode.
        encoder: Already fitted encoder, used only when ``isTrain`` is False.
            When ``isTrain`` is True this argument is ignored and a new
            encoder is fitted.
        isTrain: True to fit a new ``OneHotEncoder`` on ``df[columns]``,
            False to reuse ``encoder``.

    Returns:
        Tuple ``(encoded_df, encoder)``: the frame with ``columns`` replaced
        by the one-hot columns, and the encoder that produced them.

    Raises:
        ValueError: If ``isTrain`` is False and no encoder is supplied.
    """
    if isTrain:
        # handle_unknown='ignore' matches the encoder built in data_preprocess:
        # categories unseen during fitting encode as all zeros instead of
        # raising when the encoder is reused later.
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        encoded_data = encoder.fit_transform(df[columns])
    else:
        if encoder is None:
            raise ValueError("A fitted encoder is required when isTrain is False.")
        encoded_data = encoder.transform(df[columns])

    # The encoded frame must carry df's index: join() aligns on index labels,
    # so a default 0..n-1 index would misalign (or blank out) the one-hot
    # columns whenever df has been filtered, shuffled or split.
    encoded_df = pd.DataFrame(
        encoded_data,
        columns=encoder.get_feature_names_out(columns),
        index=df.index,
    )
    return df.drop(columns=columns).join(encoded_df), encoder

# Main preprocessing function
def data_preprocess(df, encoder=None, isTrain=True, label_encoder=None):
    """Clean the raw frame, encode its features and split off the target.

    Steps, in order: drop the identifier columns, drop single-valued columns,
    impute numeric columns (mean for four named clinical columns, median for
    the rest), impute text columns with their most frequent value, one-hot
    encode the listed categorical columns, map ``cellularity`` and
    ``her2_status_measured_by_snp6`` to integers, and label-encode the
    ``cancer_type`` target.

    Args:
        df: Raw DataFrame. It must contain ``patient_id``, ``cohort``,
            ``cancer_type`` and the columns named in the lists below. It is
            not modified.
        encoder: Fitted one-hot encoder. Required when ``isTrain`` is False,
            ignored (a new one is fitted) when ``isTrain`` is True.
        isTrain: True to fit new encoders on ``df``; False to reuse the
            supplied ``encoder`` and ``label_encoder``.
        label_encoder: Fitted ``LabelEncoder`` for the target. Required when
            ``isTrain`` is False, ignored when ``isTrain`` is True.

    Returns:
        Tuple ``(df, y_encoded, y_binary, label_encoder, encoder)`` where
        ``df`` is the feature frame without the target, ``y_encoded`` the
        integer-encoded target, ``y_binary`` an int array that is 1 where
        ``y_encoded == 0`` and 0 elsewhere, and the two encoders used.

    Raises:
        ValueError: If ``isTrain`` is False and ``encoder`` or
            ``label_encoder`` is missing.
    """
    if not isTrain and (encoder is None or label_encoder is None):
        raise ValueError(
            "Fitted 'encoder' and 'label_encoder' are required when isTrain is False."
        )

    df = df.drop(['patient_id', 'cohort'], axis=1)
    # NOTE(review): the single-valued columns are recomputed from whatever
    # frame is passed in, so a non-training frame may lose a different set of
    # columns than the training frame did - confirm this is acceptable.
    df = drop_single_class_columns(df)

    # Fill missing values for numerical columns
    # NOTE(review): all imputation statistics below come from the frame being
    # processed, also when isTrain is False.
    numerical_columns = ['neoplasm_histologic_grade', 'mutation_count', 'tumor_size', 'tumor_stage']
    df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].mean())

    # Identify and fill missing values for all other numerical columns just in case
    other_numerical_columns = df.select_dtypes(include=[np.number]).columns.difference(numerical_columns)
    df[other_numerical_columns] = df[other_numerical_columns].fillna(df[other_numerical_columns].median())

    # Columns that get one-hot encoded further down
    categorical_columns = ['pr_status', 'pam50_+_claudin-low_subtype', 'primary_tumor_laterality', 'inferred_menopausal_state', 'her2_status', 'er_status', 'er_status_measured_by_ihc', '3-gene_classifier_subtype', 'death_from_cancer']
    # Every remaining text/category column; this includes the 'cancer_type'
    # target when it is stored as text.
    # NOTE(review): that means missing target labels are imputed with the most
    # frequent class - confirm this is intended.
    other_categorical_columns = df.select_dtypes(include=['object', 'category']).columns.difference(categorical_columns)

    # Fill missing categorical values with the most frequent value. A column
    # with no observed value at all has no mode and is left untouched instead
    # of raising on mode()[0].
    for column in list(categorical_columns) + list(other_categorical_columns):
        modes = df[column].mode()
        if not modes.empty:
            df[column] = df[column].fillna(modes.iloc[0])

    # One-hot encode categorical variables
    if isTrain:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        encoded_data = encoder.fit_transform(df[categorical_columns])
    else:
        encoded_data = encoder.transform(df[categorical_columns])
    # Share df's index so join() lines the encoded rows up with their source
    # rows whatever index the incoming frame carries.
    encoded_df = pd.DataFrame(
        encoded_data,
        columns=encoder.get_feature_names_out(categorical_columns),
        index=df.index,
    )
    df = df.drop(categorical_columns, axis=1).join(encoded_df)

    # Handling 'cellularity' with a predefined ordinal mapping
    if 'cellularity' in df.columns:
        df['cellularity'] = df['cellularity'].str.strip()  # Strip whitespace
        mapping = {
            'Low': 1,
            'Moderate': 2,
            'High': 3,
        }
        # Labels outside the mapping (and any value still missing) become 3,
        # i.e. they are treated as 'High'.
        df['cellularity'] = df['cellularity'].map(mapping).fillna(3)

    # Handling 'her2_status_measured_by_snp6' with predefined mapping and dropping rows with 'UNDEF'
    if 'her2_status_measured_by_snp6' in df.columns:
        her2_mapping = {
            'LOSS': -1,
            'NEUTRAL': 0,
            'GAIN': 1,
        }
        # Strip before comparing so whitespace-padded 'UNDEF' values are
        # dropped too, and work on an explicit copy so the assignment below
        # does not write into a slice of the previous frame.
        her2_status = df['her2_status_measured_by_snp6'].str.strip()
        keep_rows = her2_status != 'UNDEF'
        df = df.loc[keep_rows].copy()
        df['her2_status_measured_by_snp6'] = her2_status.loc[keep_rows].map(her2_mapping)

    # Label encoding for the target variable. Only a training call fits a new
    # encoder; other calls reuse the fitted one that was passed in.
    y = df.pop("cancer_type")
    if isTrain:
        label_encoder = LabelEncoder()
        y_encoded = label_encoder.fit_transform(y)
    else:
        y_encoded = label_encoder.transform(y)
    # Binary target: 1 for the class encoded as 0, 0 for every other class.
    y_binary = (y_encoded == 0).astype(int)

    return df, y_encoded, y_binary, label_encoder, encoder

In [None]:
# Run the preprocessing pipeline in training mode: this fits new encoders and
# returns the feature frame, both target encodings and the fitted encoders.
X, y, y_binary, label_encoder, encoder = data_preprocess(df=data, isTrain=True)

In [None]:
# Hold out 40% of the rows (the *_temp outputs); the remaining 60% form the
# training portion. Features and both targets are split in one call so their
# rows stay aligned.
(
    X_train, X_temp,
    y_train, y_temp,
    y_train_binary, y_temp_binary,
) = train_test_split(X, y, y_binary, test_size=0.4, random_state=42)

In [None]:
# Train binary classification model
# Carve an evaluation subset out of the training portion. The binary target is
# rebuilt from y_train (class 0 vs. the rest, the same rule data_preprocess
# uses for y_binary) instead of being read from y_train_binary: this cell
# reassigns y_train_binary, so reading it here would make a second run of the
# cell fail with mismatched sample counts.
X_train_binary, X_test_binary, y_train_binary, y_test_binary = train_test_split(
    X_train, (y_train == 0).astype(int), test_size=0.2, random_state=42
)

# Apply Random Over Sampler to the training data only
ros = RandomOverSampler(random_state=23)
X_train_ros, y_train_ros = ros.fit_resample(X_train_binary, y_train_binary)

# Class ratio of the *resampled* labels. With the sampler's default strategy
# the classes are already balanced, so this is expected to be 1.0 and the
# weight does not compensate a second time.
scale_pos_weight = np.sum(y_train_ros == 0) / np.sum(y_train_ros == 1)

# Convert all columns of type 'object' to 'category'. The evaluation frame
# reuses the category dtypes learned on the training frame so that a given
# value maps to the same category code in both; values never seen in training
# become missing.
categorical_columns = X_train_ros.select_dtypes(include=['object']).columns
X_train_ros = X_train_ros.astype({column: 'category' for column in categorical_columns})
X_test_binary = X_test_binary.astype(
    {column: X_train_ros[column].dtype for column in categorical_columns}
)

# Create DMatrix for train and test sets
dtrain_binary = xgb.DMatrix(X_train_ros, label=y_train_ros, enable_categorical=True)
dtest_binary = xgb.DMatrix(X_test_binary, label=y_test_binary, enable_categorical=True)

# Specify parameters for XGBoost
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',  # classification metric matching the objective
    'device': 'cuda',  # NOTE(review): requires a CUDA-capable runtime
    'learning_rate': 0.1,
    'max_depth': 6,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'lambda': 1,
    'alpha': 0.01,
    'scale_pos_weight': scale_pos_weight,  # Applying class weight
    'num_parallel_tree': 5,
}

# Train the model
binary_model = xgb.train(params, dtrain_binary, num_boost_round=100)

# Make predictions: the model outputs probabilities of the positive class,
# which are thresholded at 0.5 to obtain hard 0/1 labels.
y_pred_binary = binary_model.predict(dtest_binary)
y_pred_binary = (y_pred_binary > 0.5).astype(int)

# Evaluate model on the evaluation subset
print("Binary Model Accuracy:", accuracy_score(y_test_binary, y_pred_binary))
print("Binary Model Confusion Matrix:\n", confusion_matrix(y_test_binary, y_pred_binary))
print("Binary Model Classification Report:\n", classification_report(y_test_binary, y_pred_binary))

In [None]:
# Prepare and train multi-class model
# Keep only the samples that are not in class 0 (the class the binary model
# separates from the rest).
# NOTE(review): this filters the full X / y rather than the training portion
# from the earlier hold-out split - confirm that is intended.
not_class_zero = y != 0
X_multiclass = X[not_class_zero]
y_multiclass = y[not_class_zero]

# Re-index the remaining labels to 0..k-1. The encoder is fitted on all
# remaining labels *before* splitting so that a rare class landing only in the
# test split cannot make transform() fail.
# NOTE(review): this rebinds label_encoder, replacing the encoder returned by
# data_preprocess; keep a reference to that one first if it is still needed.
label_encoder = LabelEncoder()
y_multiclass_encoded = label_encoder.fit_transform(y_multiclass)
num_classes_multi = len(label_encoder.classes_)

# Split the data
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    X_multiclass, y_multiclass_encoded, test_size=0.2, random_state=42
)

# Convert object columns to 'category'. The test frame reuses the category
# dtypes learned on the training frame so that category codes agree between
# the two; values never seen in training become missing.
object_columns = X_train_multi.select_dtypes(include=['object']).columns
X_train_multi = X_train_multi.astype({column: 'category' for column in object_columns})
X_test_multi = X_test_multi.astype(
    {column: X_train_multi[column].dtype for column in object_columns}
)

# Create DMatrix
dtrain_multi = xgb.DMatrix(X_train_multi, label=y_train_multi, enable_categorical=True)
dtest_multi = xgb.DMatrix(X_test_multi, label=y_test_multi, enable_categorical=True)

# Specify parameters
params_multi = {
    'objective': 'multi:softmax',
    'num_class': num_classes_multi,  # derived from the labels, not hard-coded
    'eval_metric': 'mlogloss',  # classification metric matching the objective
    'device': 'cuda',  # NOTE(review): requires a CUDA-capable runtime
    'learning_rate': 0.1,
    'max_depth': 15,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'lambda': 1.5,
    'alpha': 0.01,
    'num_parallel_tree': 5,
}

# Train the model
multi_class_model = xgb.train(params_multi, dtrain_multi, num_boost_round=100)

# Make predictions: multi:softmax returns the predicted class index as a
# float, so cast it to int to match the encoded labels.
y_pred_multi = multi_class_model.predict(dtest_multi).astype(int)

# Evaluate model
print("Multi-Class Model Accuracy:", accuracy_score(y_test_multi, y_pred_multi))
print("Multi-Class Model Confusion Matrix:\n", confusion_matrix(y_test_multi, y_pred_multi))
print("Multi-Class Model Classification Report:\n", classification_report(y_test_multi, y_pred_multi))