In [1]:
import pandas as pd
import numpy as np
import optuna
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score

# 1. Load Data
# Read the training split and the test split from the current working
# directory, in that order.
print("Loading data...")
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# 2. Feature Engineering
def engineer_features(df, top_countries=None, n_top_countries=15):
    """Return a copy of ``df`` with derived trip-level features added.

    Columns written on the copy:
      * ``num_females`` / ``num_males`` - missing values replaced by 0.
      * ``total_people`` - ``num_females + num_males``.
      * ``total_nights`` - ``mainland_stay_nights + island_stay_nights``
        (stays NaN when either part is missing).
      * ``is_alone`` - 1 when ``total_people <= 1``, otherwise 0.
      * ``country_grouped`` - ``country`` with every value outside the kept
        set replaced by ``'Other'`` (missing countries also become
        ``'Other'``).

    Args:
        df: Input frame containing ``num_females``, ``num_males``,
            ``mainland_stay_nights``, ``island_stay_nights`` and ``country``.
        top_countries: Optional collection of country values to keep as-is.
            Pass the set learned from the training frame when transforming
            another frame so both end up with the same categories. When
            ``None`` (the default), the ``n_top_countries`` most frequent
            countries of ``df`` itself are used.
        n_top_countries: How many countries to keep when ``top_countries``
            is not supplied.

    Returns:
        A new DataFrame; the input frame is left unmodified.
    """
    df = df.copy()

    # Companion counts: a missing count is treated as zero companions.
    df['num_females'] = df['num_females'].fillna(0)
    df['num_males'] = df['num_males'].fillna(0)
    df['total_people'] = df['num_females'] + df['num_males']

    # Total stay duration across both locations.
    df['total_nights'] = df['mainland_stay_nights'] + df['island_stay_nights']

    # Solo-traveller flag (also set when no companion counts were recorded).
    df['is_alone'] = (df['total_people'] <= 1).astype(int)

    # Collapse the long tail of countries. Ranking within ``df`` is only a
    # fallback: a caller that transforms several frames should supply one
    # shared ``top_countries`` so the categories line up.
    if top_countries is None:
        top_countries = df['country'].value_counts().nlargest(n_top_countries).index
    is_kept = df['country'].isin(list(top_countries))
    df['country_grouped'] = df['country'].where(is_kept, 'Other')

    return df

print("Engineering features...")
train_eng = engineer_features(train)
test_eng = engineer_features(test)

# engineer_features ranks countries inside whichever frame it receives, so the
# test frame would otherwise keep a different set of countries than the
# training frame. Rebuild the test grouping from the training frame's 15 most
# frequent countries so both frames share the same 'country_grouped' values;
# anything else (including missing countries) becomes 'Other'.
train_top_countries = train_eng['country'].value_counts().nlargest(15).index
test_eng['country_grouped'] = test_eng['country'].where(
    test_eng['country'].isin(train_top_countries), 'Other'
)

# 3. Prepare X and y
# Rows without a target cannot be used for supervised training.
train_eng = train_eng.dropna(subset=['spend_category'])
y = train_eng['spend_category'].astype(int)
# Drop the target, the row identifier, and the raw country column (replaced
# by 'country_grouped') from the feature matrices.
X = train_eng.drop(['spend_category', 'trip_id', 'country'], axis=1)
X_test = test_eng.drop(['trip_id', 'country'], axis=1)

# 4. Preprocessing Pipeline
numeric_features = [
    'num_females',
    'num_males',
    'mainland_stay_nights',
    'island_stay_nights',
    'total_people',
    'total_nights',
]
# Every column of X that is not listed above goes through the categorical
# branch.
categorical_features = [name for name in X.columns if name not in numeric_features]

# Numeric branch: fill gaps with the column median, then standardise.
numeric_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the literal 'missing', then one-hot
# encode into a dense array; categories unseen at fit time are ignored.
categorical_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_steps, numeric_features),
        ('cat', categorical_steps, categorical_features),
    ])

# 5. Define Optuna Objective for SVM
def objective(trial):
    """Score one sampled SVC configuration for Optuna.

    Samples kernel, C, gamma and class_weight (plus degree when the kernel is
    'poly'), wraps the resulting SVC behind the module-level ``preprocessor``
    and evaluates it on the module-level ``X`` / ``y``.

    Args:
        trial: Optuna trial used to draw the hyperparameters.

    Returns:
        Mean accuracy over a 3-fold stratified, shuffled cross-validation.
    """
    svc_params = {
        'kernel': trial.suggest_categorical('kernel', ['rbf', 'linear', 'poly']),
        # Regularisation strength, explored on a log scale.
        'C': trial.suggest_float('C', 1e-3, 100.0, log=True),
        'gamma': trial.suggest_categorical('gamma', ['scale', 'auto']),
        'class_weight': trial.suggest_categorical('class_weight', [None, 'balanced']),
    }

    # The polynomial degree is only sampled for the 'poly' kernel; otherwise
    # the fixed value 3 is passed through.
    if svc_params['kernel'] == 'poly':
        svc_params['degree'] = trial.suggest_int('degree', 2, 4)
    else:
        svc_params['degree'] = 3

    # max_iter caps the solver's iterations to bound tuning time.
    svm = SVC(random_state=42, max_iter=2000, **svc_params)

    candidate = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', svm),
    ])

    # Three folds keep the cost of each trial down.
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    fold_accuracy = cross_val_score(
        candidate, X, y, cv=folds, scoring='accuracy', n_jobs=-1
    )
    return fold_accuracy.mean()

# 6. Run Optimization
print("Starting Optuna optimization (this may take longer due to SVM complexity)...")
# Seed the sampler so the sequence of suggested hyperparameters is the same on
# every run, matching the fixed random_state=42 used for the CV splitter and
# the SVC. TPESampler is Optuna's default sampler, so only the seed changes.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
# Fewer trials for SVM because it's slower to train
study.optimize(objective, n_trials=20)

# Report the best-scoring trial and the hyperparameters it used.
print("\nBest trial:")
trial = study.best_trial
print(f"  Value (Accuracy): {trial.value:.4f}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# 7. Retrain Best Model on Full Data
print("\nRetraining best model on full dataset...")

best_params = trial.params
# 'degree' is only recorded for trials that sampled the poly kernel; fall
# back to 3 when it is absent.
degree = best_params['degree'] if 'degree' in best_params else 3

# Hyperparameters copied straight from the winning trial.
tuned_kwargs = {
    name: best_params[name]
    for name in ('kernel', 'C', 'gamma', 'class_weight')
}

# probability=True makes predict_proba available on the fitted model.
final_model = SVC(
    degree=degree,
    probability=True,
    random_state=42,
    **tuned_kwargs,
)

final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', final_model),
])

# Fit preprocessing and classifier together on all labelled rows.
final_pipeline.fit(X, y)

# 8. Predict and Save
print("Predicting on Test set...")
test_predictions = final_pipeline.predict(X_test)

# Two-column submission: the identifier from the raw test frame alongside the
# predicted class for the same row.
filename = 'submission_svm_optuna.csv'
submission = pd.DataFrame({
    'trip_id': test['trip_id'],
    'spend_category': test_predictions,
})
submission.to_csv(filename, index=False)
print(f"Submission saved to '{filename}'")

  from .autonotebook import tqdm as notebook_tqdm
[I 2025-11-27 18:45:32,625] A new study created in memory with name: no-name-ba731302-2222-42fd-a14a-bb198cbdec06


Loading data...
Engineering features...
Starting Optuna optimization (this may take longer due to SVM complexity)...


[I 2025-11-27 18:45:41,778] Trial 0 finished with value: 0.5344689087238951 and parameters: {'kernel': 'rbf', 'C': 0.04839981001041211, 'gamma': 'auto', 'class_weight': 'balanced'}. Best is trial 0 with value: 0.5344689087238951.
[I 2025-11-27 18:45:45,518] Trial 1 finished with value: 0.5608570473856814 and parameters: {'kernel': 'poly', 'C': 31.81055504611549, 'gamma': 'scale', 'class_weight': None, 'degree': 3}. Best is trial 1 with value: 0.5608570473856814.
[I 2025-11-27 18:45:54,337] Trial 2 finished with value: 0.11600634060110776 and parameters: {'kernel': 'rbf', 'C': 0.001062894834292721, 'gamma': 'scale', 'class_weight': 'balanced'}. Best is trial 1 with value: 0.5608570473856814.
[I 2025-11-27 18:45:58,902] Trial 3 finished with value: 0.38906496855564154 and parameters: {'kernel': 'poly', 'C': 0.2895343009187311, 'gamma': 'auto', 'class_weight': None, 'degree': 4}. Best is trial 1 with value: 0.5608570473856814.
[I 2025-11-27 18:46:03,067] Trial 4 finished with value: 0.426


Best trial:
  Value (Accuracy): 0.7451
  Params: 
    kernel: rbf
    C: 1.0902105201619523
    gamma: scale
    class_weight: None

Retraining best model on full dataset...
Predicting on Test set...
Submission saved to 'submission_svm_optuna.csv'
