In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample

# --- CONFIGURATION ---
# Soft-voting weights, one per base model, in the order
# [Logistic Regression, SVM, MLP]. Edit these to experiment: a larger weight
# gives that model's predicted probabilities more influence on the vote
# (e.g. [1, 2, 2] makes SVM and MLP each count twice as much as LR).
ENSEMBLE_WEIGHTS = [1, 3, 2]

# Number of stratified folds used for validation.
N_FOLDS = 5
# ---------------------

# 1. Load Data
# All three CSVs are read from the current working directory.
print("Loading data...")
train, test, submission_sample = map(
    pd.read_csv,
    ('train.csv', 'test.csv', 'sample_submission.csv'),
)

# 2. Feature Engineering Function
def engineer_features(df, top_countries=None, n_top_countries=15):
    """Return a copy of *df* with the engineered trip features added.

    The input frame is never modified. Columns added or overwritten on the
    returned copy:

    * ``num_females`` / ``num_males`` -- missing values replaced with 0.
    * ``total_people`` -- ``num_females + num_males``.
    * ``total_nights`` -- ``mainland_stay_nights + island_stay_nights``
      (stays NaN when either operand is NaN; no imputation happens here).
    * ``is_alone`` -- 1 when ``total_people <= 1``, else 0.
    * ``country_grouped`` -- ``country`` where it is one of the kept
      countries, otherwise the literal ``'Other'`` (missing countries also
      become ``'Other'``).

    Args:
        df: Frame containing ``num_females``, ``num_males``,
            ``mainland_stay_nights``, ``island_stay_nights`` and ``country``.
        top_countries: Optional collection of country values to keep as-is.
            When ``None`` (the default) the ``n_top_countries`` most frequent
            values of ``df['country']`` are used. Pass the set derived from
            the training frame when transforming validation/test data so
            both frames share the same grouping.
        n_top_countries: How many of the most frequent countries to keep when
            ``top_countries`` is not supplied.

    Returns:
        A new DataFrame with the columns described above.
    """
    df = df.copy()

    # Companion counts: a missing count is treated as "none reported".
    df['num_females'] = df['num_females'].fillna(0)
    df['num_males'] = df['num_males'].fillna(0)
    df['total_people'] = df['num_females'] + df['num_males']

    # Total stay duration across both legs of the trip.
    df['total_nights'] = df['mainland_stay_nights'] + df['island_stay_nights']

    # Flag trips with at most one person in the party.
    df['is_alone'] = (df['total_people'] <= 1).astype(int)

    # Collapse the long tail of the high-cardinality country column.
    if top_countries is None:
        top_countries = (
            df['country'].value_counts().nlargest(n_top_countries).index
        )
    is_kept = df['country'].isin(top_countries)
    df['country_grouped'] = df['country'].where(is_kept, 'Other')

    return df

print("Engineering features...")
train_eng = engineer_features(train)
test_eng = engineer_features(test)

# engineer_features ranks countries within whichever frame it receives, so the
# test frame's 'country_grouped' would be built from the test set's own top 15.
# Rebuild it from the TRAINING top 15 so train and test share one grouping:
# a country the model saw as 'Other' must also be 'Other' at prediction time.
train_top_countries = train['country'].value_counts().nlargest(15).index
test_eng['country_grouped'] = test['country'].where(
    test['country'].isin(train_top_countries), 'Other'
)

# 3. Prepare X and y
# Rows without a target cannot be used for supervised training.
train_eng = train_eng.dropna(subset=['spend_category'])
y = train_eng['spend_category'].astype(int)
# Drop the target, the trip_id column and the raw country column
# (the grouped version is kept instead).
X = train_eng.drop(['spend_category', 'trip_id', 'country'], axis=1)
X_test = test_eng.drop(['trip_id', 'country'], axis=1)

# 4. Preprocessing Pipeline
# Columns scaled as continuous values. Every other column of X -- including
# the engineered 'is_alone' and 'country_grouped' -- is one-hot encoded.
numeric_features = [
    'num_females',
    'num_males',
    'mainland_stay_nights',
    'island_stay_nights',
    'total_people',
    'total_nights',
]
categorical_features = [
    name for name in X.columns if name not in numeric_features
]

# Numeric branch: fill gaps with the column median, then standardise.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: missing values become their own 'missing' level;
# categories unseen during fit are ignored (encoded as all zeros).
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_features),
        ('cat', categorical_pipeline, categorical_features),
    ])

# 5. Define Models
# Three base learners; all are seeded with random_state=42.

# Model 1: Logistic Regression (liblinear solver, balanced class weights).
clf_lr = LogisticRegression(
    C=0.5, solver='liblinear', class_weight='balanced',
    max_iter=1000, random_state=42,
)

# Model 2: RBF-kernel SVM. probability=True enables predict_proba, which the
# soft-voting ensemble defined below relies on.
clf_svm = SVC(
    C=2.0, kernel='rbf', gamma='scale', class_weight='balanced',
    probability=True, random_state=42,
)

# Model 3: Neural Network with three hidden layers. Early stopping holds out
# 10% of each training set to decide when to stop.
clf_mlp = MLPClassifier(
    hidden_layer_sizes=(256, 128, 64), activation='relu',
    solver='adam', alpha=0.001, learning_rate='adaptive',
    max_iter=500,
    early_stopping=True, validation_fraction=0.1,
    random_state=42,
)

# 6. Create Voting Classifier with Configurable Weights
print(f"Initializing Ensemble with Weights: {ENSEMBLE_WEIGHTS}")

# Order must match ENSEMBLE_WEIGHTS: [Logistic Regression, SVM, MLP].
base_estimators = [('lr', clf_lr), ('svm', clf_svm), ('mlp', clf_mlp)]

# Soft voting averages the models' predicted class probabilities,
# weighted by ENSEMBLE_WEIGHTS.
ensemble = VotingClassifier(
    estimators=base_estimators,
    voting='soft',
    weights=ENSEMBLE_WEIGHTS,
)

# Create Pipeline
# Preprocessing lives inside the pipeline, so it is re-fitted whenever the
# pipeline is fitted (including on each cross-validation training fold).
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', ensemble),
])

# 7. K-Fold Validation
print(f"\nPerforming {N_FOLDS}-Fold Stratified Cross-Validation...")
cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

# Fit the ensemble once per fold and keep the out-of-fold predictions.
# Running cross_val_score and then cross_val_predict would train every fold
# twice on identical splits; one pass serves both the per-fold accuracies and
# the classification report.
y_pred_cv = cross_val_predict(model_pipeline, X, y, cv=cv)

# Per-fold accuracy. With a fixed integer random_state, cv.split reproduces
# the same partitions cross_val_predict used, so each score is computed on
# exactly one fold's held-out rows. y is indexed positionally (.iloc) because
# its index labels need not be contiguous after the dropna above.
scores = np.array([
    accuracy_score(y.iloc[test_idx], y_pred_cv[test_idx])
    for _, test_idx in cv.split(X, y)
])

print(f"CV Scores per fold: {scores}")
print(f"Mean CV Accuracy: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

# Full classification report over the stitched out-of-fold predictions.
print("\nGenerating CV Classification Report...")
print(classification_report(y, y_pred_cv))

# 8. Final Training & Submission
# Refit on every labelled row so the submitted model uses all training data.
print("\nRetraining model on the FULL dataset for final submission...")
model_pipeline.fit(X, y)

print("Predicting on Test set...")
test_preds = model_pipeline.predict(X_test)

# One row per test trip: its id plus the predicted spend category.
submission = test[['trip_id']].assign(spend_category=test_preds)

filename = 'submission_ensemble_cv.csv'
submission.to_csv(filename, index=False)
print(f"Submission saved to '{filename}'")

Loading data...
Engineering features...
Initializing Ensemble with Weights: [1, 3, 2]

Performing 5-Fold Stratified Cross-Validation...
CV Scores per fold: [0.74286846 0.75396197 0.75435816 0.74366086 0.76347068]
Mean CV Accuracy: 0.7517 (+/- 0.0153)

Generating CV Classification Report...
              precision    recall  f1-score   support

           0       0.83      0.85      0.84      6245
           1       0.67      0.75      0.71      4911
           2       0.63      0.36      0.46      1464

    accuracy                           0.75     12620
   macro avg       0.71      0.65      0.67     12620
weighted avg       0.75      0.75      0.74     12620


Retraining model on the FULL dataset for final submission...
Predicting on Test set...
Submission saved to 'submission_ensemble_cv.csv'
