**Structure:**
1. **Pre-processing** : Data loading and exploration
2. **Feature Selection** : Dimensionality reduction
3. **Validation** : Cross-validation setup
4. **Algorithms and Optimization** : Model training and comparison
5. **Submissions** : Model selection and submission

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.feature_selection import SelectKBest, VarianceThreshold, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, make_scorer
import xgboost as xgb

---

## 1. Pre-processing

Loading and analyzing the dataset

### 1.1 Data Loading

In [None]:
# Load the training and test archives.
# np.load on an .npz file returns an NpzFile that keeps the underlying file
# handle open, so the arrays are read inside ``with`` blocks to guarantee
# the handles are released once the data is in memory.
with np.load('data/train.npz') as train:
    X_train = train['X_train']
    y_train = train['y_train']
    train_ids = train['ids']

with np.load('data/test.npz') as test:
    X_test = test['X_test']
    test_ids = test['ids']

# Quick sanity check of the loaded shapes.
print(f"Training: {X_train.shape[0]:,} samples x {X_train.shape[1]:,} features")
print(f"Test: {X_test.shape[0]:,} samples")

In [None]:
# Summarise the training matrix: size, class balance and sparsity.
n_samples, n_features = X_train.shape

# Label 0 is reported as "susceptible" and label 1 as "resistant".
n_susceptible = (y_train == 0).sum()
n_resistant = (y_train == 1).sum()
imbalance_ratio = n_susceptible / n_resistant
susceptible_pct = 100 * n_susceptible / n_samples
resistant_pct = 100 * n_resistant / n_samples

# Sparsity = percentage of matrix entries that are exactly zero.
total_entries = n_samples * n_features
non_zero = np.count_nonzero(X_train)
sparsity = 100 * (1 - non_zero / total_entries)

print(f"Samples: {n_samples:,} , Features: {n_features:,}")
print(
    f"Class distribution: {n_susceptible:,} susceptible ({susceptible_pct:.1f}%), "
    f"{n_resistant:,} resistant ({resistant_pct:.1f}%)"
)
print(f"Imbalance ratio: {imbalance_ratio:.2f}:1")
print(f"Matrix sparsity: {sparsity:.2f}%")

In [None]:
# Bar chart of the class balance; each bar is annotated with its count and
# its share of the training set.
class_counts = pd.Series(y_train).value_counts().sort_index()
# Labels are matched to the sorted class values (0, 1) by position.
# NOTE(review): assumes both classes are present in y_train -- confirm.
class_labels = ['Susceptible', 'Resistant']

plt.figure(figsize=(8, 4))
bars = plt.bar(class_labels, class_counts.values, color=['#2ecc71', '#e74c3c'], alpha=0.5)
plt.ylabel('Count', fontsize=11)
plt.title('Class Distribution', fontsize=11)
plt.grid(axis='y', alpha=0.2)

# Place the annotation just above each bar (fixed offset of 20 count units).
for bar, count in zip(bars, class_counts.values):
    percentage = 100 * count / len(y_train)
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 20,
             f'{count:,}\n({percentage:.1f}%)', ha='center', fontsize=11)

plt.tight_layout()
plt.show()

### 1.2 Exploratory Data Analysis

In [None]:
# Pick the number of chi-square-ranked features to keep, scoring each
# candidate K by cross-validated macro-F1 with a Logistic Regression.
from sklearn.pipeline import make_pipeline

k_values = [5000, 10000, 15000, 20000, 30000]

# Drop near-constant columns first. This filter never looks at the labels,
# so it is fitted once on the full training matrix.
variance_threshold = VarianceThreshold(threshold=0.01)
X_train_var = variance_threshold.fit_transform(X_train)
X_test_var = variance_threshold.transform(X_test)

print(f"After variance threshold: {X_train_var.shape[1]:,} features\n")

# The CV splitter, scorer and classifier do not depend on K: build them once.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
macro_f1 = make_scorer(f1_score, average='macro')
lr = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42, n_jobs=-1)

results = []
for k in k_values:
    # Skip sizes larger than what survived the variance filter.
    if k > X_train_var.shape[1]:
        continue

    # chi2 ranks features using y_train, so the selector is wrapped in a
    # pipeline and refitted on the training part of every fold. Selecting on
    # all rows first would leak validation labels into the CV score.
    candidate = make_pipeline(SelectKBest(chi2, k=k), lr)
    scores = cross_val_score(candidate, X_train_var, y_train, cv=cv, scoring=macro_f1, n_jobs=-1)

    results.append({'k': k, 'cv_f1': scores.mean(), 'cv_std': scores.std()})
    print(f"K={k:6,}: CV F1 = {scores.mean():.4f} ± {scores.std():.4f}")

# Fail with a clear message instead of max() raising on an empty list.
if not results:
    raise ValueError(
        f"No candidate K in {k_values} fits within the "
        f"{X_train_var.shape[1]:,} features left after variance filtering"
    )

best_result = max(results, key=lambda x: x['cv_f1'])
K_FEATURES = best_result['k']
print(f"\nBest K: {K_FEATURES:,} features (CV F1 = {best_result['cv_f1']:.4f})")

# Apply best K: refit the selector on the full training set and project both
# matrices onto the chosen columns.
# NOTE(review): later cells cross-validate on X_train_selected, which was
# selected using every training label; those CV scores are therefore
# optimistic unless the selector is also moved into their pipelines.
selector = SelectKBest(chi2, k=K_FEATURES)
X_train_selected = selector.fit_transform(X_train_var, y_train)
X_test_selected = selector.transform(X_test_var)

print(f"\nFinal feature selection: {X_train_selected.shape[1]:,} features")

---

## 2. Feature Selection

Applying filtering to reduce from 1,000,000 to 30,000 features

### 2.1 Feature Selection Strategy


In [None]:
# Histogram of the chi-square scores held by the fitted selector; the y-axis
# is logarithmic so the sparse high-score tail stays visible.
feature_scores = selector.scores_

fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(feature_scores, bins=50, color='#3498db', alpha=0.5)
ax.set_xlabel('Score', fontsize=11)
ax.set_ylabel('Frequency', fontsize=11)
ax.set_title('Feature Score Distribution', fontsize=11)
ax.set_yscale('log')
ax.grid(axis='y', alpha=0.2)

fig.tight_layout()
plt.show()

---

## 3. Validation

Setup cross-validation for model evaluation

### 3.1 Validation Strategy


In [None]:
# Evaluation protocol used by the model cells that follow:
# macro-averaged F1 over 5 shuffled, stratified folds with a fixed seed.
f1_scorer = make_scorer(f1_score, average='macro')
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

---

## 4. Algorithms & Optimization

Train and optimize classification models with grid search

### 4.1 Logistic Regression Baseline

In [None]:
# Untuned, class-weighted Logistic Regression as the reference point.
lr_baseline = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
    n_jobs=-1,
)

# Cross-validated macro-F1 on the selected features.
lr_scores = cross_val_score(
    lr_baseline,
    X_train_selected,
    y_train,
    cv=cv,
    scoring=f1_scorer,
    n_jobs=-1,
)
baseline_score = lr_scores.mean()

print(f"Logistic Regression Baseline: CV F1 = {baseline_score:.4f}")

# Keep a copy fitted on the full training set.
lr_baseline.fit(X_train_selected, y_train)

### 4.2 Grid Search Configuration

In [None]:
# Hyper-parameter grids and the shared evaluation protocol for the grid
# searches in the following cells.
from sklearn.model_selection import GridSearchCV

# Re-declared here so this cell can be run without the validation cell.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_scorer = make_scorer(f1_score, average='macro')

rf_params = {
    'n_estimators': [100, 150, 200, 250],
    'max_depth': [10, 15, 20, 25, None],
    'min_samples_split': [5, 10, 15, 20],
    'class_weight': ['balanced']
}

# liblinear is the solver used here because it supports both l1 and l2.
lr_params = {
    'C': [0.01, 0.1, 1.0, 10.0],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

# One sub-grid per kernel: gamma is only used by the rbf kernel, so crossing
# it with the linear kernel would refit identical models for every gamma.
svm_params = [
    {
        'kernel': ['linear'],
        'C': [0.1, 1.0, 10.0]
    },
    {
        'kernel': ['rbf'],
        'C': [0.1, 1.0, 10.0],
        'gamma': ['scale', 'auto']
    }
]

# Calculate scale_pos_weight for class imbalance (negatives / positives)
scale_pos_weight = np.sum(y_train == 0) / np.sum(y_train == 1)

xgb_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
    'scale_pos_weight': [scale_pos_weight]
}

# Report the fold count from the splitter itself so the message cannot drift.
print(f"Using {cv.get_n_splits()}-fold cross-validation")
print(f"Class imbalance ratio: {scale_pos_weight:.2f}:1")

### 4.3 Random Forest Grid Search

In [None]:
# Exhaustive search over rf_params, scored by cross-validated macro-F1.
rf_estimator = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_grid = GridSearchCV(
    estimator=rf_estimator,
    param_grid=rf_params,
    scoring=f1_scorer,
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
rf_grid.fit(X_train_selected, y_train)

rf_best_f1 = rf_grid.best_score_
rf_best_config = rf_grid.best_params_
print(f"Best RF: F1 = {rf_best_f1:.4f}, Params = {rf_best_config}")

### 4.4 Logistic Regression Grid Search

In [None]:
# Exhaustive search over lr_params, scored by cross-validated macro-F1.
# The estimator itself takes no n_jobs: every candidate in lr_params uses the
# liblinear solver, for which n_jobs has no effect and only triggers a
# warning on each fit. Parallelism comes from GridSearchCV's own n_jobs.
lr_grid = GridSearchCV(
    LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42),
    lr_params,
    cv=cv,
    scoring=f1_scorer,
    n_jobs=-1,
    verbose=1
)

lr_grid.fit(X_train_selected, y_train)

print(f"Best LR: F1 = {lr_grid.best_score_:.4f}, Params = {lr_grid.best_params_}")

### 4.5 SVM Grid Search

In [None]:
# Standardise the selected features, then grid-search the SVM on them.
# NOTE(review): the scaler statistics come from the full training set, so
# every CV fold shares them; wrapping scaler + SVC in a pipeline would keep
# the folds independent (the grid keys would then need an ``svc__`` prefix).
scaler = StandardScaler().fit(X_train_selected)
X_train_scaled = scaler.transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

svm_estimator = SVC(class_weight='balanced', random_state=42, cache_size=1000)
svm_grid = GridSearchCV(
    estimator=svm_estimator,
    param_grid=svm_params,
    scoring=f1_scorer,
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
svm_grid.fit(X_train_scaled, y_train)

svm_best_f1 = svm_grid.best_score_
svm_best_config = svm_grid.best_params_
print(f"Best SVM: F1 = {svm_best_f1:.4f}, Params = {svm_best_config}")

### 4.6 XGBoost Grid Search

In [None]:
# Exhaustive search over xgb_params, scored by cross-validated macro-F1.
# Settings that are not searched are fixed on the base estimator.
xgb_estimator = xgb.XGBClassifier(
    tree_method='hist',
    objective='binary:logistic',
    eval_metric='logloss',
    n_jobs=-1,
    random_state=42,
)
xgb_grid = GridSearchCV(
    estimator=xgb_estimator,
    param_grid=xgb_params,
    scoring=f1_scorer,
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
xgb_grid.fit(X_train_selected, y_train)

xgb_best_f1 = xgb_grid.best_score_
xgb_best_config = xgb_grid.best_params_
print(f"Best XGBoost: F1 = {xgb_best_f1:.4f}, Params = {xgb_best_config}")

### 4.7 Results Summary

In [None]:
# Merge the cross-validation results of the four grid searches into a single
# leaderboard, print the top configurations and save the full table to CSV.
SUMMARY_COLUMNS = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']


def _rank_configurations(cv_results):
    """Return the summary columns of a cv_results_ frame, best mean score first."""
    return cv_results.sort_values('mean_test_score', ascending=False)[SUMMARY_COLUMNS]


def _leaderboard_rows(model_name, ranked):
    """Convert one model's ranked configurations into leaderboard records."""
    return [
        {
            'Model': model_name,
            # Rank within this model's own grid search, not across models.
            'Rank': int(row['rank_test_score']),
            'CV_F1': row['mean_test_score'],
            'CV_Std': row['std_test_score'],
            'Config': str(row['params'])
        }
        for _, row in ranked.iterrows()
    ]


rf_results = pd.DataFrame(rf_grid.cv_results_)
lr_results = pd.DataFrame(lr_grid.cv_results_)
svm_results = pd.DataFrame(svm_grid.cv_results_)
xgb_results = pd.DataFrame(xgb_grid.cv_results_)

rf_all = _rank_configurations(rf_results)
lr_all = _rank_configurations(lr_results)
svm_all = _rank_configurations(svm_results)
xgb_all = _rank_configurations(xgb_results)

all_results = []
for model_name, ranked in [
    ('RandomForest', rf_all),
    ('LogisticRegression', lr_all),
    ('SVM', svm_all),
    ('XGBoost', xgb_all),
]:
    all_results.extend(_leaderboard_rows(model_name, ranked))

# Global ordering across all models by mean CV macro-F1.
results_df = pd.DataFrame(all_results).sort_values('CV_F1', ascending=False)

print("\nTop 15 Configurations:")
print(results_df.head(15).to_string(index=False))

best = results_df.iloc[0]
print(f"\nBest Model: {best['Model']} - CV F1 = {best['CV_F1']:.4f}")

results_df.to_csv('grid_search_results.csv', index=False)

## 5. Submissions

---

### 5.1 Random Forest Submission

Generate submission using Random Forest

In [None]:
# Rebuild the Random Forest from the grid-search winner's hyper-parameters.
rf = RandomForestClassifier(random_state=42, n_jobs=-1, **rf_grid.best_params_)

# Re-score the chosen configuration with the shared CV protocol.
rf_scores = cross_val_score(
    rf, X_train_selected, y_train,
    cv=cv, scoring=f1_scorer, n_jobs=-1,
)

# Train on the full training set and label the test set.
rf.fit(X_train_selected, y_train)
y_test_pred_rf = rf.predict(X_test_selected)

submission_rf = pd.DataFrame({'id': test_ids, 'label': y_test_pred_rf})
submission_rf.to_csv('rf_submission.csv', index=False)

# Share of test samples predicted as class 1 (resistant).
rf_resistant_count = np.sum(y_test_pred_rf == 1)
rf_resistant_pct = 100*rf_resistant_count/len(y_test_pred_rf)
print(f"Predicted resistant: {rf_resistant_count} ({rf_resistant_pct:.1f}%)")

---

### 5.2 SVM Submission

Generate submission using SVM

In [None]:
# Rebuild the SVM from the grid-search winner's hyper-parameters.
svm_best = SVC(
    class_weight='balanced',
    random_state=42,
    cache_size=1000,
    **svm_grid.best_params_,
)

# Re-score the chosen configuration on the standardised features.
svm_best_scores = cross_val_score(
    svm_best, X_train_scaled, y_train,
    cv=cv, scoring=f1_scorer, n_jobs=-1,
)

# Train on the full (scaled) training set and label the scaled test set.
svm_best.fit(X_train_scaled, y_train)
y_test_pred_svm = svm_best.predict(X_test_scaled)

submission_svm_df = pd.DataFrame({'id': test_ids, 'label': y_test_pred_svm})
# NOTE(review): the file name says "rbf" but the kernel comes from the grid
# search and may be linear -- confirm the name is still accurate.
submission_svm_df.to_csv('svm_rbf_submission.csv', index=False)

# Share of test samples predicted as class 1 (resistant).
svm_resistant_count = np.sum(y_test_pred_svm == 1)
svm_resistant_pct = 100*svm_resistant_count/len(y_test_pred_svm)
print(f"Predicted resistant: {svm_resistant_count} ({svm_resistant_pct:.1f}%)")

---

### 5.3 XGBoost Submission

Generate submission using best XGBoost configuration from grid search

In [None]:
# Rebuild XGBoost from the grid-search winner's hyper-parameters; the fixed
# settings match the estimator that was searched.
xgb_best = xgb.XGBClassifier(
    tree_method='hist',
    objective='binary:logistic',
    eval_metric='logloss',
    n_jobs=-1,
    random_state=42,
    **xgb_grid.best_params_,
)

# Re-score the chosen configuration with the shared CV protocol.
xgb_best_scores = cross_val_score(
    xgb_best, X_train_selected, y_train,
    cv=cv, scoring=f1_scorer, n_jobs=-1,
)

# Train on the full training set and label the test set.
xgb_best.fit(X_train_selected, y_train)
y_test_pred_xgb = xgb_best.predict(X_test_selected)

submission_xgb_df = pd.DataFrame({'id': test_ids, 'label': y_test_pred_xgb})
submission_xgb_df.to_csv('xgboost_submission.csv', index=False)

# Share of test samples predicted as class 1 (resistant).
xgb_resistant_count = np.sum(y_test_pred_xgb == 1)
xgb_resistant_pct = 100*xgb_resistant_count/len(y_test_pred_xgb)
print(f"Predicted resistant: {xgb_resistant_count} ({xgb_resistant_pct:.1f}%)")

---

### 5.4 Best Kaggle Submission

Best performing XGBoost configuration (CV F1 = 0.8291)

In [None]:
# Frozen XGBoost configuration, written out explicitly instead of being read
# from the grid search.
# NOTE(review): scale_pos_weight is hard-coded; presumably it is the
# negative/positive ratio of the training labels -- verify against the data.
best_kaggle_config = dict(
    n_estimators=300,
    max_depth=10,
    learning_rate=0.01,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=6.102564102564102,
)
xgb_best_kaggle = xgb.XGBClassifier(
    tree_method='hist',
    objective='binary:logistic',
    eval_metric='logloss',
    n_jobs=-1,
    random_state=42,
    **best_kaggle_config,
)

# Re-score the frozen configuration with the shared CV protocol.
xgb_best_kaggle_scores = cross_val_score(
    xgb_best_kaggle, X_train_selected, y_train,
    cv=cv, scoring=f1_scorer, n_jobs=-1,
)

# Train on the full training set and label the test set.
xgb_best_kaggle.fit(X_train_selected, y_train)
y_test_pred_best_kaggle = xgb_best_kaggle.predict(X_test_selected)

submission_best_kaggle = pd.DataFrame({'id': test_ids, 'label': y_test_pred_best_kaggle})
submission_best_kaggle.to_csv('best_kaggle_submission.csv', index=False)

# Share of test samples predicted as class 1 (resistant).
kaggle_resistant_count = np.sum(y_test_pred_best_kaggle == 1)
kaggle_resistant_pct = 100*kaggle_resistant_count/len(y_test_pred_best_kaggle)
print(f"Predicted resistant: {kaggle_resistant_count} ({kaggle_resistant_pct:.1f}%)")