**Sections:**
1. **Pre-processing**: Data loading, exploration, and feature selection
2. **Training Algorithms**: Grid-search training, optimization, and model comparison
3. **Submissions**: Model selection and submission

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.feature_selection import SelectKBest, VarianceThreshold, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, make_scorer
import xgboost as xgb

## 1. Pre-processing

Loading and analyzing the dataset

### 1.1 Data Loading

In [None]:
# Load the training and test archives.
# np.load on an .npz file returns a lazily-read NpzFile that keeps the archive
# open, so each archive is opened in a context manager and the arrays we need
# are pulled out before it is closed.
with np.load('data/train.npz') as train:
    X_train = train['X_train']
    y_train = train['y_train']
    train_ids = train['ids']

with np.load('data/test.npz') as test:
    X_test = test['X_test']
    test_ids = test['ids']

print(f"Training: {X_train.shape[0]:,} samples x {X_train.shape[1]:,} features")
print(f"Test: {X_test.shape[0]:,} samples")

### 1.2 Data Exploration

Analyzing class distribution and dataset characteristics

In [None]:
# Basic dataset statistics: size, class balance, and sparsity of the
# feature matrix.
n_samples, n_features = X_train.shape
n_susceptible = np.sum(y_train == 0)
n_resistant = np.sum(y_train == 1)
imbalance_ratio = n_susceptible / n_resistant

# Sparsity = percentage of entries in X_train that are exactly zero.
non_zero = np.count_nonzero(X_train)
total_entries = n_samples * n_features
sparsity = 100 * (1 - non_zero / total_entries)

print(f"Samples: {n_samples:,} , Features: {n_features:,}")
print(f"Class distribution: {n_susceptible:,} susceptible ({100*n_susceptible/n_samples:.1f}%), {n_resistant:,} resistant ({100*n_resistant/n_samples:.1f}%)")
# These two statistics were previously computed but never reported.
print(f"Imbalance ratio (susceptible:resistant): {imbalance_ratio:.2f}:1")
print(f"Sparsity: {sparsity:.1f}% of entries are zero")

In [None]:
# Bar chart of the class balance; each bar is annotated with its count and
# its share of the training set. The figure is also saved to disk.
class_labels = ['Susceptible', 'Resistant']
class_counts = pd.Series(y_train).value_counts().sort_index()
n_labelled = len(y_train)

plt.figure(figsize=(15, 5))
bars = plt.bar(class_labels, class_counts.values, color=['blue', 'red'], alpha=0.5)
plt.title('Class Distribution', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.grid(axis='y', alpha=0.5)

for rect, class_size in zip(bars, class_counts.values):
    share = 100 * class_size / n_labelled
    x_center = rect.get_x() + rect.get_width()/2
    # Place the annotation a fixed 20 units above the top of the bar.
    y_label = rect.get_height() + 20
    plt.text(x_center, y_label,
             f'{class_size:,}\n({share:.1f}%)', ha='center', fontsize=12)

plt.tight_layout()
plt.savefig('class_distribution.png')
plt.show()

### 1.3 Feature Selection

Removing low-variance features with a variance threshold, then comparing several chi-square feature counts (K) by cross-validated macro-F1 and keeping the best one

In [None]:

# Feature selection in two stages:
#   1. drop low-variance features (unsupervised);
#   2. pick the chi-square top-K, choosing K by cross-validated macro-F1.
from sklearn.pipeline import Pipeline

k_values = [5000, 10000, 15000, 20000, 30000]

variance_threshold = VarianceThreshold(threshold=0.01)
X_train_var = variance_threshold.fit_transform(X_train)
X_test_var = variance_threshold.transform(X_test)

print(f"After variance threshold: {X_train_var.shape[1]:,} features\n")

# Only K values that do not exceed the number of remaining features are
# usable. If none qualifies, fall back to keeping every remaining feature
# instead of failing on an empty result list below.
n_available = X_train_var.shape[1]
candidate_ks = [k for k in k_values if k <= n_available]
if not candidate_ks:
    candidate_ks = [n_available]

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
macro_f1 = make_scorer(f1_score, average='macro')

results = []
for k in candidate_ks:
    # chi2 selection uses the labels, so it has to be refitted inside every
    # CV fold. Fitting it on the full training set first would let the
    # validation labels influence which features are kept and inflate the
    # scores being compared. Parallelism comes from cross_val_score.
    candidate = Pipeline([
        ('select', SelectKBest(chi2, k=k)),
        ('clf', LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)),
    ])
    scores = cross_val_score(candidate, X_train_var, y_train, cv=cv, scoring=macro_f1, n_jobs=-1)

    results.append({'k': k, 'cv_f1': scores.mean(), 'cv_std': scores.std()})
    print(f"K={k:6,}: CV F1 = {scores.mean():.4f} and std deviation of {scores.std():.4f}")

best_result = max(results, key=lambda x: x['cv_f1'])
K_FEATURES = best_result['k']
print(f"\nBest K: {K_FEATURES:,} features (CV F1 = {best_result['cv_f1']:.4f})")

results_df = pd.DataFrame(results)
results_df.to_csv('feature_selection_results.csv', index=False)
print("Feature selection results saved to feature_selection_results.csv")

# Apply best K: fit the selector on the full training set and reuse it for
# the test set so both matrices share the same columns.
selector = SelectKBest(chi2, k=K_FEATURES)
X_train_selected = selector.fit_transform(X_train_var, y_train)
X_test_selected = selector.transform(X_test_var)

print(f"\nFinal feature selection: {X_train_selected.shape[1]:,} features")
# Author's note from an earlier run: 30000 is best


## 2. Training Algorithms

Training classification models with grid search

### 2.1 Grid Search Configuration

In [None]:
from sklearn.model_selection import GridSearchCV

# Shared evaluation protocol for every grid search below:
# stratified 5-fold CV scored by macro-averaged F1.
f1_scorer = make_scorer(f1_score, average='macro')
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# --- Hyperparameter grids, one per model family ---
rf_params = dict(
    n_estimators=[100, 150, 200, 250],
    max_depth=[10, 15, 20, 25, None],
    min_samples_split=[5, 10, 15, 20],
    class_weight=['balanced'],
)

lr_params = dict(
    C=[0.01, 0.1, 1.0, 10.0],
    penalty=['l1', 'l2'],
    solver=['liblinear'],
)

svm_params = dict(
    C=[0.1, 1.0, 10.0],
    kernel=['rbf', 'linear'],
    gamma=['scale', 'auto'],
)

# Ratio of label-0 to label-1 samples, passed to XGBoost as the single
# scale_pos_weight value in its grid.
scale_pos_weight = np.sum(y_train == 0) / np.sum(y_train == 1)

xgb_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7, 10],
    learning_rate=[0.01, 0.05, 0.1],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
    scale_pos_weight=[scale_pos_weight],
)

print("Using 5-fold cross-validation")
print(f"Class imbalance ratio: {scale_pos_weight:.2f}:1")

### 2.2 Random Forest Grid Search

In [None]:
# Exhaustive search over rf_params using the shared CV splitter and scorer.
rf_base = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_grid = GridSearchCV(
    estimator=rf_base,
    param_grid=rf_params,
    scoring=f1_scorer,
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
rf_grid.fit(X_train_selected, y_train)

best_rf_f1 = rf_grid.best_score_
best_rf_config = rf_grid.best_params_
print(f"Best RF: F1 = {best_rf_f1:.4f}, Params = {best_rf_config}")

### 2.3 Logistic Regression Grid Search

In [None]:
# Exhaustive search over lr_params using the shared CV splitter and scorer.
# The estimator itself runs single-threaded; the search uses two workers.
lr_base = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42, n_jobs=1)
lr_grid = GridSearchCV(
    estimator=lr_base,
    param_grid=lr_params,
    scoring=f1_scorer,
    cv=cv,
    n_jobs=2,
    verbose=1,
)
lr_grid.fit(X_train_selected, y_train)

best_lr_f1 = lr_grid.best_score_
best_lr_config = lr_grid.best_params_
print(f"Best LR: F1 = {best_lr_f1:.4f}, Params = {best_lr_config}")

### 2.4 SVM Grid Search

In [None]:
# Standardise the selected features, then grid-search an SVM on the scaled data.
# NOTE(review): the scaler is fitted on the full training set before the CV
# split, so each validation fold contributes to the scaling statistics.
scaler = StandardScaler()
scaler.fit(X_train_selected)
X_train_scaled = scaler.transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

svm_base = SVC(class_weight='balanced', random_state=42, cache_size=1000)
svm_grid = GridSearchCV(
    estimator=svm_base,
    param_grid=svm_params,
    scoring=f1_scorer,
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
svm_grid.fit(X_train_scaled, y_train)

best_svm_f1 = svm_grid.best_score_
best_svm_config = svm_grid.best_params_
print(f"Best SVM: F1 = {best_svm_f1:.4f}, Params = {best_svm_config}")

### 2.5 XGBoost Grid Search

In [None]:
# Exhaustive search over xgb_params using the shared CV splitter and scorer.
# Settings that are not tuned are fixed on the base estimator.
xgb_fixed_settings = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1,
    tree_method='hist',
)
xgb_grid = GridSearchCV(
    estimator=xgb.XGBClassifier(**xgb_fixed_settings),
    param_grid=xgb_params,
    scoring=f1_scorer,
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
xgb_grid.fit(X_train_selected, y_train)

best_xgb_f1 = xgb_grid.best_score_
best_xgb_config = xgb_grid.best_params_
print(f"Best XGBoost: F1 = {best_xgb_f1:.4f}, Params = {best_xgb_config}")

### 2.6 Results Summary

In [None]:
# Collect every grid-search configuration from all four models into one
# table ranked by mean CV F1, print the top 15, and save the full table.
rf_results = pd.DataFrame(rf_grid.cv_results_)
lr_results = pd.DataFrame(lr_grid.cv_results_)
svm_results = pd.DataFrame(svm_grid.cv_results_)
xgb_results = pd.DataFrame(xgb_grid.cv_results_)

summary_columns = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']

rf_all = rf_results.sort_values('mean_test_score', ascending=False)[summary_columns]
lr_all = lr_results.sort_values('mean_test_score', ascending=False)[summary_columns]
svm_all = svm_results.sort_values('mean_test_score', ascending=False)[summary_columns]
xgb_all = xgb_results.sort_values('mean_test_score', ascending=False)[summary_columns]

# Model label paired with its per-model ranking; rows are appended in this order.
ranked_by_model = [
    ('RandomForest', rf_all),
    ('LogisticRegression', lr_all),
    ('SVM', svm_all),
    ('XGBoost', xgb_all),
]

all_results = [
    {
        'Model': model_name,
        'Rank': int(entry['rank_test_score']),
        'CV_F1': entry['mean_test_score'],
        'CV_Std': entry['std_test_score'],
        'Config': str(entry['params']),
    }
    for model_name, ranked in ranked_by_model
    for _, entry in ranked.iterrows()
]

results_df = pd.DataFrame(all_results).sort_values('CV_F1', ascending=False)

print("\nTop 15 Configurations:")
print(results_df.head(15).to_string(index=False))

best = results_df.iloc[0]
print(f"\nBest Model: {best['Model']} CV F1 = {best['CV_F1']:.4f}")

results_df.to_csv('grid_search_results.csv', index=False)

### 2.7 Results

Visualizing model performance and hyperparameter impact

In [None]:

# Bar chart comparing the best CV F1 reached by each model's grid search.
best_scores = {
    'Random Forest': rf_grid.best_score_,
    'Logistic Regression': lr_grid.best_score_,
    'SVM': svm_grid.best_score_,
    'XGBoost': xgb_grid.best_score_
}

models = list(best_scores)
scores = [best_scores[name] for name in models]
colors = ['blue', 'red', 'green', 'orange']

fig, ax = plt.subplots(1, 1, figsize=(15, 5))
bars = ax.bar(models, scores, color=colors, alpha=0.5)
ax.set_title('Model Performance Comparison', fontsize=12)
ax.set_ylabel('F1 Score', fontsize=12)
# Zoom the y-axis to the observed score range (with a 0.01 margin).
lowest, highest = min(scores), max(scores)
ax.set_ylim(lowest - 0.01, highest + 0.01)
ax.grid(axis='y', alpha=0.5)

# Write each score just above its bar.
for rect, f1_value in zip(bars, scores):
    x_center = rect.get_x() + rect.get_width()/2.
    ax.text(x_center, rect.get_height(),
            f'{f1_value:.4f}', ha='center', va='bottom', fontsize=12)

plt.tight_layout()
plt.savefig('model_comparison.png')
plt.show()

winner = max(best_scores, key=best_scores.get)
print(f"Best Model: {winner} (F1 = {max(best_scores.values()):.4f})")

In [None]:

# Hyperparameter impact: for each tuned parameter, average the mean CV F1
# over every grid point that shares a value of that parameter, and plot the
# averages on a 3x4 grid of panels (the last panel is left blank).


def _mean_score_by(cv_frame, param, none_as=None):
    """Return the average mean_test_score per distinct value of ``param``.

    ``cv_frame`` is a DataFrame built from a grid search's ``cv_results_``.
    ``None`` parameter values are replaced by ``none_as`` before grouping,
    because pandas groupby drops missing keys by default. The result is a
    Series indexed by parameter value, sorted by that index.
    """
    keys = [none_as if cfg[param] is None else cfg[param] for cfg in cv_frame['params']]
    table = pd.DataFrame({param: keys, 'score': cv_frame['mean_test_score'].to_numpy()})
    return table.groupby(param)['score'].mean().sort_index()


def _style_panel(ax, title, xlabel, grid_axis='both'):
    """Apply the labels, title and grid shared by every panel."""
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel('Mean F1 Score', fontsize=12)
    ax.set_title(title, fontsize=12)
    ax.grid(axis=grid_axis, alpha=0.5)


def _line_panel(ax, grouped, title, xlabel, marker, color, markersize=8, log_x=False):
    """Draw one numeric hyperparameter as a line plot of its averaged scores."""
    ax.plot(grouped.index, grouped.values, marker=marker, linewidth=2,
            markersize=markersize, color=color, alpha=0.5)
    if log_x:
        ax.set_xscale('log')
    _style_panel(ax, title, xlabel)


def _bar_panel(ax, grouped, title, xlabel, colors):
    """Draw one categorical hyperparameter as annotated bars."""
    ax.bar(grouped.index, grouped.values, color=colors, alpha=0.5)
    _style_panel(ax, title, xlabel, grid_axis='y')
    for position, score in enumerate(grouped.values):
        ax.text(position, score, f'{score:.4f}', ha='center', va='bottom', fontsize=12)


fig, axes = plt.subplots(3, 4, figsize=(20, 15))

# (panel position, cv results, parameter, model label, x label, marker, colour, marker size, log-scaled x)
line_panels = [
    ((0, 0), rf_results, 'n_estimators', 'Random Forest', 'Number of Estimators', 's', 'blue', 8, False),
    ((0, 2), rf_results, 'min_samples_split', 'Random Forest', 'Min Samples Split', 'D', 'blue', 8, False),
    ((0, 3), xgb_results, 'learning_rate', 'XGBoost', 'Learning Rate', 'o', 'orange', 8, False),
    ((1, 0), xgb_results, 'max_depth', 'XGBoost', 'Max Depth', '*', 'orange', 10, False),
    ((1, 1), xgb_results, 'n_estimators', 'XGBoost', 'Number of Estimators', 'p', 'orange', 8, False),
    ((1, 2), xgb_results, 'subsample', 'XGBoost', 'Subsample', 'h', 'orange', 8, False),
    ((1, 3), lr_results, 'C', 'Logistic Regression', 'C', 'P', 'red', 8, True),
    ((2, 1), svm_results, 'C', 'SVM', 'C', 'X', 'green', 8, True),
]
for (row, col), cv_frame, param, model_label, xlabel, marker, color, size, log_x in line_panels:
    _line_panel(axes[row, col], _mean_score_by(cv_frame, param), f'{model_label}: {param}',
                xlabel, marker, color, markersize=size, log_x=log_x)

# Random Forest: max_depth. The grid includes None (no depth limit), which
# has no numeric position. It is sorted last via an infinite stand-in, and
# the values are plotted at evenly spaced positions with "None" as its tick
# label, rather than at an arbitrary number that would distort the axis.
rf_depth_grouped = _mean_score_by(rf_results, 'max_depth', none_as=float('inf'))
depth_positions = list(range(len(rf_depth_grouped)))
depth_labels = ['None' if np.isinf(depth) else str(int(depth)) for depth in rf_depth_grouped.index]

axes[0, 1].plot(depth_positions, rf_depth_grouped.values, marker='^', linewidth=2,
                markersize=8, color='blue', alpha=0.5)
axes[0, 1].set_xticks(depth_positions)
axes[0, 1].set_xticklabels(depth_labels)
_style_panel(axes[0, 1], 'Random Forest: max_depth', 'Max Depth')

# SVM categorical parameters
_bar_panel(axes[2, 0], _mean_score_by(svm_results, 'kernel'), 'SVM: kernel',
           'Kernel Type', ['green', 'lightgreen'])
_bar_panel(axes[2, 2], _mean_score_by(svm_results, 'gamma'), 'SVM: gamma',
           'Gamma', ['green', 'darkgreen'])

# Eleven panels are used; blank out the twelfth.
axes[2, 3].axis('off')

plt.tight_layout()
plt.savefig('hyperparameter_analysis.png')
plt.show()


## 3. Submissions

### 3.1 Random Forest Submission

Generate submission using Random Forest

In [None]:
# Refit a Random Forest on the full training set with the best grid-search
# parameters and write its test-set predictions to a submission file.
rf = RandomForestClassifier(random_state=42, n_jobs=-1, **rf_grid.best_params_)
rf.fit(X_train_selected, y_train)
y_test_pred_rf = rf.predict(X_test_selected)

submission_rf = pd.DataFrame({'id': test_ids, 'label': y_test_pred_rf})
submission_rf.to_csv('rf_submission.csv', index=False)

# Report how many test samples were predicted as class 1.
n_resistant_rf = np.sum(y_test_pred_rf == 1)
print(f"Predicted resistant: {n_resistant_rf} ({100*n_resistant_rf/len(y_test_pred_rf):.1f}%)")

### 3.2 Logistic Regression Submission

Generate submission using Logistic Regression

In [None]:
# Refit Logistic Regression on the full training set with the best
# grid-search parameters and write its test-set predictions to a submission file.
# n_jobs is left at its default: the grid only searches solver='liblinear',
# for which scikit-learn ignores n_jobs > 1 and emits a warning. This also
# matches the single-threaded estimator used during the grid search.
lr = LogisticRegression(
    **lr_grid.best_params_,
    class_weight='balanced',
    max_iter=1000,
    random_state=42
)

lr.fit(X_train_selected, y_train)
y_test_pred_lr = lr.predict(X_test_selected)

submission_lr = pd.DataFrame({
    'id': test_ids,
    'label': y_test_pred_lr
})

submission_lr.to_csv('lr_submission.csv', index=False)
print(f"Predicted resistant: {np.sum(y_test_pred_lr == 1)} ({100*np.sum(y_test_pred_lr == 1)/len(y_test_pred_lr):.1f}%)")

### 3.3 SVM Submission

Generate submission using SVM

In [None]:
# Refit an SVM on the full (scaled) training set with the best grid-search
# parameters and write its test-set predictions to a submission file.
svm_best = SVC(class_weight='balanced', random_state=42, cache_size=1000, **svm_grid.best_params_)
svm_best.fit(X_train_scaled, y_train)
y_test_pred_svm = svm_best.predict(X_test_scaled)

submission_svm_df = pd.DataFrame({'id': test_ids, 'label': y_test_pred_svm})
submission_svm_df.to_csv('svm_rbf_submission.csv', index=False)

# Report how many test samples were predicted as class 1.
n_resistant_svm = np.sum(y_test_pred_svm == 1)
print(f"Predicted resistant: {n_resistant_svm} ({100*n_resistant_svm/len(y_test_pred_svm):.1f}%)")

### 3.4 XGBoost Submission

Generate submission using best XGBoost configuration from grid search

In [None]:
# Refit XGBoost on the full training set with the best grid-search
# parameters and write its test-set predictions to a submission file.
xgb_submission_settings = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1,
    tree_method='hist',
)
xgb_best = xgb.XGBClassifier(**xgb_grid.best_params_, **xgb_submission_settings)
xgb_best.fit(X_train_selected, y_train)
y_test_pred_xgb = xgb_best.predict(X_test_selected)

submission_xgb_df = pd.DataFrame({'id': test_ids, 'label': y_test_pred_xgb})
submission_xgb_df.to_csv('xgboost_submission.csv', index=False)

# Report how many test samples were predicted as class 1.
n_resistant_xgb = np.sum(y_test_pred_xgb == 1)
print(f"Predicted resistant: {n_resistant_xgb} ({100*n_resistant_xgb/len(y_test_pred_xgb):.1f}%)")

### 3.5 Best Kaggle Submission

Best performing XGBoost configuration (CV F1 = 0.8291)

In [None]:
# Fit XGBoost with a fixed, hard-coded configuration (independent of the
# grid-search objects above) and write its predictions to a submission file.
best_kaggle_config = dict(
    colsample_bytree=0.8,
    learning_rate=0.01,
    max_depth=10,
    n_estimators=300,
    scale_pos_weight=6.102564102564102,
    subsample=0.8,
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1,
    tree_method='hist',
)
xgb_best_kaggle = xgb.XGBClassifier(**best_kaggle_config)
xgb_best_kaggle.fit(X_train_selected, y_train)
y_test_pred_best_kaggle = xgb_best_kaggle.predict(X_test_selected)

submission_best_kaggle = pd.DataFrame({'id': test_ids, 'label': y_test_pred_best_kaggle})
submission_best_kaggle.to_csv('best_kaggle_submission.csv', index=False)

# Report how many test samples were predicted as class 1.
n_resistant_best = np.sum(y_test_pred_best_kaggle == 1)
print(f"Predicted resistant: {n_resistant_best} ({100*n_resistant_best/len(y_test_pred_best_kaggle):.1f}%)")