In [11]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler

In [12]:
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')

In [13]:
# Fill missing values for numeric columns only
numeric_columns_train = train.select_dtypes(include=['number']).columns
numeric_columns_test = test.select_dtypes(include=['number']).columns

train[numeric_columns_train] = train[numeric_columns_train].fillna(train[numeric_columns_train].median())
test[numeric_columns_test] = test[numeric_columns_test].fillna(test[numeric_columns_test].median())

In [14]:
features = train.drop(columns=['ID', 'Label']).columns
target = 'Label'
selector = SelectKBest(f_classif, k=10)
train_selected = selector.fit_transform(train[features], train[target])
test_selected = selector.transform(test[features])
selected_features = train[features].columns[selector.get_support()]

In [15]:
scaler = StandardScaler()
train[selected_features] = scaler.fit_transform(train[selected_features])
test[selected_features] = scaler.transform(test[selected_features])

In [16]:
smote = SMOTE(random_state=1221)
X_sm, y_sm = smote.fit_resample(train[selected_features], train[target])

In [17]:
xgb_model = XGBClassifier(random_state=1221)

In [18]:
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1221)
cv_scores = cross_val_score(xgb_model, X_sm, y_sm, cv=stratified_kfold, scoring='f1')
print(f"Cross-validation F1 scores: {cv_scores}")
print(f"Mean F1 score: {cv_scores.mean()}")

Cross-validation F1 scores: [0.98076923 0.98533951 0.97851113 0.98230769 0.98314176]
Mean F1 score: 0.9820138639735283


In [10]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'gamma': [0, 0.1, 0.2]
}

grid_search = GridSearchCV(xgb_model, param_grid, cv=stratified_kfold, scoring='f1', n_jobs=-1, verbose=2)
grid_search.fit(X_sm, y_sm)

Fitting 5 folds for each of 729 candidates, totalling 3645 fits


In [19]:
best_xgb_model = grid_search.best_estimator_
print("Best XGBoost parameters:", grid_search.best_params_)

Best XGBoost parameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.2, 'max_depth': 9, 'n_estimators': 300, 'subsample': 0.9}


In [20]:
best_xgb_model.fit(X_sm, y_sm)
test['Target'] = best_xgb_model.predict(test[selected_features])

In [21]:
print("Test set classification report:")
print(classification_report(test['Target'], test['Target']))

Test set classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2811
           1       1.00      1.00      1.00      1007

    accuracy                           1.00      3818
   macro avg       1.00      1.00      1.00      3818
weighted avg       1.00      1.00      1.00      3818



In [22]:
sub = test[['ID', 'Target']]
sub.to_csv('optimized_submission_cryptooooo.csv', index=False)