In [1]:
import pandas as pd
import numpy as np
import optuna
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score

# 1. Load Data
# Read the labelled training split and the unlabelled test split from the
# current working directory.
print("Loading data...")
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# 2. Feature Engineering
def engineer_features(df, *, top_countries=None, n_top_countries=15):
    """Return a copy of ``df`` with the engineered trip features added.

    Added columns:
      * ``total_people``    - ``num_females + num_males`` (missing counts -> 0)
      * ``total_nights``    - ``mainland_stay_nights + island_stay_nights``
      * ``is_alone``        - 1 when ``total_people <= 1``, else 0
      * ``country_grouped`` - ``country`` with every value outside the kept
        set (including missing values) replaced by ``'Other'``

    Args:
        df: Frame holding ``num_females``, ``num_males``,
            ``mainland_stay_nights``, ``island_stay_nights`` and ``country``.
            It is not modified.
        top_countries: Optional collection of country values to keep as-is.
            Pass the set derived from the training frame when transforming
            another frame so both share one category vocabulary. When
            ``None`` (default) the most frequent countries of ``df`` itself
            are used.
        n_top_countries: How many of the most frequent countries to keep
            when ``top_countries`` is ``None``.

    Returns:
        A new DataFrame with the extra columns; ``num_females`` and
        ``num_males`` have their missing values filled with 0.
    """
    df = df.copy()

    # Create total companions (a missing count is treated as zero people)
    df['num_females'] = df['num_females'].fillna(0)
    df['num_males'] = df['num_males'].fillna(0)
    df['total_people'] = df['num_females'] + df['num_males']

    # Create total stay duration (stays NaN if either component is missing)
    df['total_nights'] = df['mainland_stay_nights'] + df['island_stay_nights']

    # Interaction: Is the traveler alone?
    df['is_alone'] = (df['total_people'] <= 1).astype(int)

    # Simplify high cardinality countries
    if top_countries is None:
        top_countries = df['country'].value_counts().nlargest(n_top_countries).index
    # Vectorised membership test; NaN is never "in" the kept set, so missing
    # countries fall into 'Other' exactly like any rare country.
    is_top = df['country'].isin(top_countries)
    df['country_grouped'] = df['country'].where(is_top, 'Other')

    return df

print("Engineering features...")
train_eng = engineer_features(train)
test_eng = engineer_features(test)

# The country grouping must use ONE vocabulary for both splits. Left alone,
# the test frame would be grouped by its own 15 most frequent countries, so a
# country could be kept in one split and collapsed to 'Other' in the other,
# and the fitted one-hot columns would not line up with the test categories.
# Rebuild the test column from the top-15 countries of the training data.
train_top_countries = train['country'].value_counts().nlargest(15).index
test_eng['country_grouped'] = test['country'].where(
    test['country'].isin(train_top_countries), 'Other'
)

# 3. Prepare X and y
# Rows without a target cannot be used for supervised training.
train_eng = train_eng.dropna(subset=['spend_category'])
y = train_eng['spend_category'].astype(int)
# 'trip_id' and the raw 'country' column are dropped from the features;
# 'country_grouped' is kept in their place.
X = train_eng.drop(['spend_category', 'trip_id', 'country'], axis=1)
X_test = test_eng.drop(['trip_id', 'country'], axis=1)

# 4. Preprocessing Pipeline
# Columns scaled as numbers; every other column of X is one-hot encoded.
numeric_features = [
    'num_females',
    'num_males',
    'mainland_stay_nights',
    'island_stay_nights',
    'total_people',
    'total_nights',
]
categorical_features = [name for name in X.columns if name not in numeric_features]

# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: gaps become their own 'missing' level; categories not
# seen during fit are ignored at transform time instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ])

# 5. Define Optuna Objective
def objective(trial):
    """Score one logistic-regression configuration for Optuna.

    Samples ``solver``, ``C``, ``class_weight`` and (for liblinear only)
    ``penalty`` from ``trial``, wraps the classifier behind the shared
    ``preprocessor`` and returns the mean accuracy of a 5-fold stratified
    cross-validation on the module-level ``X`` / ``y``.
    """
    # The suggest_* calls stay in this order: solver, C, class_weight, penalty.
    solver = trial.suggest_categorical('solver', ['liblinear', 'lbfgs'])
    C = trial.suggest_float('C', 1e-4, 10.0, log=True)  # inverse regularization strength
    class_weight = trial.suggest_categorical('class_weight', [None, 'balanced'])

    # Only liblinear gets a choice of penalty; lbfgs is always run with l2.
    penalty = (
        trial.suggest_categorical('penalty', ['l1', 'l2'])
        if solver == 'liblinear'
        else 'l2'
    )

    classifier_params = {
        'solver': solver,
        'C': C,
        'penalty': penalty,
        'class_weight': class_weight,
        'max_iter': 1000,  # generous iteration budget for the solver
        'random_state': 42,
    }
    candidate = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(**classifier_params)),
    ])

    # Fixed, shuffled stratified folds so every trial is scored on the same splits.
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_accuracy = cross_val_score(candidate, X, y, cv=folds, scoring='accuracy')
    return fold_accuracy.mean()

# 6. Run Optimization
print("Starting Optuna optimization...")
# Every other random component in this script is pinned with random_state=42,
# but an unseeded sampler would still make the search (and therefore the
# chosen hyperparameters and the submission) differ from run to run.
# TPESampler is the sampler create_study uses by default; only the seed is new.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50) # Run 50 trials

# Report the best cross-validated accuracy and the parameters that produced it.
print("\nBest trial:")
trial = study.best_trial
print(f"  Value (Accuracy): {trial.value:.4f}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# 7. Retrain Best Model on Full Data
print("\nRetraining best model on full dataset...")

best_params = trial.params
solver = best_params['solver']
# 'penalty' is only recorded for liblinear trials; lbfgs always uses l2, and
# l2 is also the fallback when the key is absent.
penalty = 'l2' if solver == 'lbfgs' else best_params.get('penalty', 'l2')

final_model = LogisticRegression(
    penalty=penalty,
    C=best_params['C'],
    solver=solver,
    class_weight=best_params['class_weight'],
    random_state=42,
    max_iter=1000,
)

# Same preprocessing as during the search, now fitted on all labelled rows.
final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', final_model),
])
final_pipeline.fit(X, y)

# 8. Predict and Save
print("Predicting on Test set...")
test_predictions = final_pipeline.predict(X_test)

# Two-column submission: the original trip ids next to the predicted class.
filename = 'submission_logistic_optuna.csv'
submission = test[['trip_id']].assign(spend_category=test_predictions)
submission.to_csv(filename, index=False)
print(f"Submission saved to '{filename}'")

  from .autonotebook import tqdm as notebook_tqdm
[I 2025-11-27 18:42:15,429] A new study created in memory with name: no-name-7688ff2e-8592-46fb-94c4-e2caf2b2cf6a


Loading data...
Engineering features...
Starting Optuna optimization...


[I 2025-11-27 18:42:16,109] Trial 0 finished with value: 0.669175911251981 and parameters: {'solver': 'lbfgs', 'C': 0.0013772527395816965, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.669175911251981.
[I 2025-11-27 18:42:17,225] Trial 1 finished with value: 0.743581616481775 and parameters: {'solver': 'lbfgs', 'C': 0.011856615148795267, 'class_weight': None}. Best is trial 1 with value: 0.743581616481775.
[I 2025-11-27 18:42:18,660] Trial 2 finished with value: 0.7007131537242473 and parameters: {'solver': 'lbfgs', 'C': 0.3083214778983625, 'class_weight': 'balanced'}. Best is trial 1 with value: 0.743581616481775.
[I 2025-11-27 18:42:19,005] Trial 3 finished with value: 0.4948494453248811 and parameters: {'solver': 'liblinear', 'C': 0.00015575458009932792, 'class_weight': None, 'penalty': 'l1'}. Best is trial 1 with value: 0.743581616481775.
[I 2025-11-27 18:42:19,740] Trial 4 finished with value: 0.7305863708399366 and parameters: {'solver': 'liblinear', 'C': 0.028137957


Best trial:
  Value (Accuracy): 0.7479
  Params: 
    solver: lbfgs
    C: 0.07342928888633839
    class_weight: None

Retraining best model on full dataset...
Predicting on Test set...
Submission saved to 'submission_logistic_optuna.csv'
