In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# 1. Load Data
# Both CSV files are read via relative paths, i.e. from the working directory.
print("Loading data...")
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# 2. Feature Engineering
def engineer_features(df, top_countries=None):
    """Return a copy of ``df`` with the derived trip features added.

    Columns written on the copy:

    * ``num_females`` / ``num_males`` -- missing values replaced with 0.
    * ``total_people`` -- ``num_females + num_males``.
    * ``total_nights`` -- ``mainland_stay_nights + island_stay_nights``
      (remains NaN when either part is missing).
    * ``country_grouped`` -- ``country`` when it is one of the kept
      countries, otherwise ``'Other'``. Missing countries also end up as
      ``'Other'``.

    Args:
        df: DataFrame holding ``num_females``, ``num_males``,
            ``mainland_stay_nights``, ``island_stay_nights`` and ``country``.
            The input frame is not modified.
        top_countries: Optional list-like of country values that keep their
            own category. When ``None`` (the default, identical to the
            previous behaviour) the 15 most frequent countries of ``df``
            itself are used. Pass the list derived from the training frame
            when transforming validation/test data so that all frames share
            the same set of categories.

    Returns:
        A new DataFrame with the columns above added or overwritten.
    """
    df = df.copy()

    # Create total companions
    df['num_females'] = df['num_females'].fillna(0)
    df['num_males'] = df['num_males'].fillna(0)
    df['total_people'] = df['num_females'] + df['num_males']

    # Create total stay duration
    df['total_nights'] = df['mainland_stay_nights'] + df['island_stay_nights']

    # Simplify high cardinality countries
    if top_countries is None:
        # value_counts() ignores NaN, so a missing country is never "top".
        top_countries = df['country'].value_counts().nlargest(15).index
    is_top_country = df['country'].isin(top_countries)
    # Vectorised replacement for the per-row lambda: keep the value where it
    # is a top country, substitute 'Other' everywhere else (NaN included).
    df['country_grouped'] = df['country'].where(is_top_country, 'Other')

    return df

print("Engineering features...")
train_eng = engineer_features(train)
test_eng = engineer_features(test)

# By default engineer_features ranks countries within the frame it is given,
# so the test set would get its own top-15 list. Re-bucket the test countries
# with the list taken from train so both frames share one category vocabulary
# (otherwise a country the model saw as its own category in train can show up
# as 'Other' in test, and vice versa).
train_top_countries = train['country'].value_counts().nlargest(15).index
test_eng['country_grouped'] = test['country'].where(
    test['country'].isin(train_top_countries), 'Other')

# 3. Prepare X and y
# Only rows that actually carry a label can be used for training.
has_target = train_eng['spend_category'].notna()
train_eng = train_eng[has_target]
y = train_eng['spend_category'].astype(int)

# Columns that are not fed to the model: the trip identifier and the raw
# country (the engineered 'country_grouped' column is kept instead).
non_feature_columns = ['trip_id', 'country']
X = train_eng.drop(columns=['spend_category'] + non_feature_columns)
X_test = test_eng.drop(columns=non_feature_columns)

# 4. Preprocessing Pipeline
numeric_features = [
    'num_females', 'num_males',
    'mainland_stay_nights', 'island_stay_nights',
    'total_people', 'total_nights',
]
# Every other column of X is treated as categorical (original column order kept).
categorical_features = X.columns[~X.columns.isin(numeric_features)].tolist()

# Numeric branch: fill gaps with the median, then standardise.
numeric_branch = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: missing values become their own 'missing' level, then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_branch = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_branch, numeric_features),
        ('cat', categorical_branch, categorical_features),
    ])

# 5. Define SVM Model
# RBF-kernel SVC; class_weight='balanced' matters here because class 2 is rare.
svc_settings = {
    'kernel': 'rbf',
    'C': 1.0,
    'class_weight': 'balanced',
    'random_state': 42,
}
model = SVC(**svc_settings)

# Full pipeline: preprocessing followed by the classifier.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', model),
])

# 6. Validation
# Stratified 80/20 hold-out split so the class proportions of y are kept in
# both parts.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

print("Training SVM...")
pipeline.fit(X_train, y_train)

print("Validating...")
val_preds = pipeline.predict(X_val)
# Report overall accuracy first, then the per-class precision/recall/F1 table.
print(f"Validation Accuracy: {accuracy_score(y_val, val_preds):.4f}")
validation_report = classification_report(y_val, val_preds)
print(validation_report)

# 7. Final Training & Submission
# Refit the same pipeline on every labelled row before scoring the test set.
print("Retraining on full dataset...")
pipeline.fit(X, y)

print("Predicting on Test set...")
test_predictions = pipeline.predict(X_test)

# Submission layout: the test trip ids next to their predicted category.
submission = test[['trip_id']].assign(spend_category=test_predictions)

submission.to_csv('submission_svm.csv', index=False)
print("Submission saved to 'submission_svm.csv'")

Loading data...
Engineering features...
Training SVM...
Validating...
Validation Accuracy: 0.7195
              precision    recall  f1-score   support

           0       0.84      0.83      0.84      1249
           1       0.68      0.58      0.63       982
           2       0.46      0.71      0.56       293

    accuracy                           0.72      2524
   macro avg       0.66      0.71      0.67      2524
weighted avg       0.73      0.72      0.72      2524

Retraining on full dataset...
Predicting on Test set...
Submission saved to 'submission_svm.csv'
