In [1]:
!pip install xgboost
!pip install category_encoders
!pip install imblearn
!pip install optuna

Collecting xgboost
  Downloading xgboost-2.1.1-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Downloading xgboost-2.1.1-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl (199.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.0/199.0 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.23.4 xgboost-2.1.1
Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting statsmodels>=0.9.0 (from category_encoders)
  Downloading statsmodels-0.14.3.tar.gz (20.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import make_scorer, matthews_corrcoef
from imblearn.pipeline import Pipeline as ImbPipeline
import pandas as pd
import optuna

In [4]:
# Load and prepare data
data = pd.read_csv('train.csv')
X = data.drop(['id', 'class'], axis=1)
y = data['class']
y = y.replace({'e': 0, 'p': 1})

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define preprocessor
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())
])

categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
])

# Define the model
xgb_model = XGBClassifier(objective='binary:logistic', random_state=42)

model_xgb = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb_model)
])

# Create a custom MCC scoring function
mcc_scorer = make_scorer(matthews_corrcoef)

# Perform cross-validation using MCC as the scoring metric
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model_xgb, X, y, cv=cv, scoring=mcc_scorer, n_jobs=-1)

# Output the cross-validation scores
print("MCC Cross-Validation Scores:", scores)
print("Mean MCC Score:", scores.mean())

MCC Cross-Validation Scores: [0.98192941 0.98260967 0.98224662 0.98207806 0.98242717]
Mean MCC Score: 0.9822581875988394


In [6]:
# Ensure numeric_features and categorical_features are defined
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# Define the objective function to tune only XGBoost parameters
#[I 2024-09-16 13:49:27,559] Trial 134 finished with value: 0.9842477601950346 and parameters: {'n_estimators': 461, 'learning_rate': 0.21387795504944024, 'max_depth': 6, 'gamma': 0.34484161489059895, 'subsample': 0.7793446727094343, 'colsample_bytree': 0.706874306505037, 'min_child_weight': 1, 'reg_alpha': 0.5577047373813019, 'reg_lambda': 0.8460006802821987, 'scale_pos_weight': 1.1257837010941643}. Best is trial 134 with value: 0.9842477601950346.previous_best_params = {'n_estimators': 360, 'learning_rate': 0.19827232379674248, 'max_depth': 7, 'gamma': 0.20025276140775589, 'subsample': 0.850893742793344, 'colsample_bytree': 0.8041876856864637, 'min_child_weight': 3, 'reg_alpha': 0.6528447892504516, 'reg_lambda': 0.1220255627695831, 'scale_pos_weight': 1.2337291491436182}

def objective(trial):
    try:
        # Tune XGBoost parameters
        xgb_params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 500),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'gamma': trial.suggest_float('gamma', 1e-8, 1.0),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0),
            'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1e-6, 100),
            'eval_metric': 'logloss'
        }

        # Keep preprocessing pipeline fixed
        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', MinMaxScaler())
        ])

        categorical_transformer = Pipeline(steps=[
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ])

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)
        ])

        # Define the model pipeline
        model_xgb = ImbPipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', XGBClassifier(objective='binary:logistic', random_state=42, **xgb_params))
        ])

        # Define MCC scorer
        mcc_scorer = make_scorer(matthews_corrcoef)

        # Perform cross-validation using MCC as the scoring metric
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        scores = cross_val_score(model_xgb, X_train, y_train, cv=cv, scoring=mcc_scorer, n_jobs=-1)

        # Return the mean MCC score for Optuna to maximize
        return scores.mean()

    except Exception as e:
        print(f"Trial failed: {e}")
        return None

# Create Optuna study to maximize MCC
study = optuna.create_study(direction='maximize')
study.enqueue_trial(previous_best_params)

# Optimize the study
study.optimize(objective, n_trials=1000, n_jobs=-1)

# Best trial results
print(f"Best trial: {study.best_trial.number}")
print(f"Best value (MCC): {study.best_value}")
print(f"Best hyperparameters: {study.best_trial.params}")

[I 2024-09-16 13:01:09,987] A new study created in memory with name: no-name-d0c7c059-9ed8-45f3-9547-b79f50b5a614
[I 2024-09-16 13:02:53,907] Trial 5 finished with value: 0.4774036168670207 and parameters: {'n_estimators': 84, 'learning_rate': 0.08960271970679161, 'max_depth': 5, 'gamma': 0.9097682158975577, 'subsample': 0.5645222857169869, 'colsample_bytree': 0.87555811022432, 'min_child_weight': 6, 'reg_alpha': 0.6147890772325207, 'reg_lambda': 0.6024241141067966, 'scale_pos_weight': 99.50538545543253}. Best is trial 5 with value: 0.4774036168670207.
[I 2024-09-16 13:03:07,282] Trial 4 finished with value: 0.5314083314545905 and parameters: {'n_estimators': 75, 'learning_rate': 0.23134885641439043, 'max_depth': 3, 'gamma': 0.5997379494524353, 'subsample': 0.8426541820018179, 'colsample_bytree': 0.6435967511813198, 'min_child_weight': 9, 'reg_alpha': 0.5224187741539766, 'reg_lambda': 0.04044385702944227, 'scale_pos_weight': 67.967411413952}. Best is trial 4 with value: 0.5314083314545

Best trial: 585
Best value (MCC): 0.9842520532855058
Best hyperparameters: {'n_estimators': 462, 'learning_rate': 0.18202911706234737, 'max_depth': 7, 'gamma': 0.32481947934960437, 'subsample': 0.8018680012555202, 'colsample_bytree': 0.999451324065874, 'min_child_weight': 10, 'reg_alpha': 0.5562835084224718, 'reg_lambda': 0.4997926133434165, 'scale_pos_weight': 0.8821317215376284}


In [9]:
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-1.1.0-py3-none-any.whl.metadata (1.8 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.9/250.9 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.5


In [10]:
import openpyxl

results = []
for trial in study.trials:
    trial_dict = trial.params.copy()  # Get hyperparameters
    trial_dict['value'] = trial.value  # Add the objective value
    trial_dict['trial_number'] = trial.number  # Add the trial number
    results.append(trial_dict)

# Convert to a Pandas DataFrame
df = pd.DataFrame(results)

# Save to an Excel file
df.to_excel('optuna_study_results.xlsx', index=False)

print("Study results saved to 'optuna_study_results.xlsx'")


Study results saved to 'optuna_study_results.xlsx'


In [12]:
optuna.visualization.plot_optimization_history(study)
optuna.visualization.plot_param_importances(study)

In [16]:
xgb_params = {'n_estimators': 462, 'learning_rate': 0.18202911706234737, 'max_depth': 7, 'gamma': 0.32481947934960437, 'subsample': 0.8018680012555202, 'colsample_bytree': 0.999451324065874, 'min_child_weight': 10, 'reg_alpha': 0.5562835084224718, 'reg_lambda': 0.4997926133434165, 'scale_pos_weight': 0.8821317215376284}
final_model = XGBClassifier(
    **xgb_params,
    objective='binary:logistic',
    random_state=42
)

In [18]:
test_data = pd.read_csv('/content/test.csv')

model_xgb = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', final_model)
])

model_xgb.fit(X_train, y_train)
test_data['class'] = model_xgb.predict(test_data.drop('id', axis=1))
test_data['class'] = test_data['class'].replace({0: 'e', 1: 'p'})
test_data[['id', 'class']].to_csv('predictions.csv', index=False)


