# Spaceship Titanic Classification w/ Support Vector Classifier

#### Dataset: https://www.kaggle.com/competitions/spaceship-titanic/overview
##### Dataset License: https://creativecommons.org/licenses/by/4.0/

###### Author: Cody Weaver

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.svm import SVC
from ssvm import SoftSVM
from kernel_ssvm import KSVM

### Load training set

In [2]:
train_set = pd.read_csv('../data/train.csv')
print(train_set.head())
print(train_set.dtypes)

# convert bool labels to -1, 1
def convert_labels(df, label_col='Transported'):
    return df[label_col].apply(lambda l: 1 if l else -1)

train_set['Transported'] = convert_labels(train_set)

  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
0     0001_01     Europa     False  B/0/P  TRAPPIST-1e  39.0  False   
1     0002_01      Earth     False  F/0/S  TRAPPIST-1e  24.0  False   
2     0003_01     Europa     False  A/0/S  TRAPPIST-1e  58.0   True   
3     0003_02     Europa     False  A/0/S  TRAPPIST-1e  33.0  False   
4     0004_01      Earth     False  F/1/S  TRAPPIST-1e  16.0  False   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck               Name  \
0          0.0        0.0           0.0     0.0     0.0    Maham Ofracculy   
1        109.0        9.0          25.0   549.0    44.0       Juanna Vines   
2         43.0     3576.0           0.0  6715.0    49.0      Altark Susent   
3          0.0     1283.0         371.0  3329.0   193.0       Solam Susent   
4        303.0       70.0         151.0   565.0     2.0  Willy Santantines   

   Transported  
0        False  
1         True  
2        False  
3        False  
4         True  
Pa

### Classification using amenities data only

In [3]:
amenities_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

amenities_data = train_set.copy()

def normalize_amenity_data(df, col_names):
    # normalize to mean zero and unit variance
    for column in amenities_columns:
        column_mean = amenities_data[column].mean()
        column_std = amenities_data[column].std()
        normalize = lambda x: (x - column_mean) / column_std
        amenities_data[column] = normalize(amenities_data[column])

    # fill in missing values for amenities
    amenities_data[amenities_columns] = amenities_data[amenities_columns].fillna(0)
    
    return amenities_data[amenities_columns]

amenities_data[amenities_columns] = normalize_amenity_data(train_set, amenities_columns)

### Cross-Validation

In [4]:
model = SVC()

X = amenities_data[amenities_columns].to_numpy()
y = amenities_data['Transported'].to_numpy()

print(cross_val_score(model, X, y, cv=5).mean())

0.7882230637953772


# Classification using more features

In [5]:
def normalize_numerical_feats(df, cols):
    numerical_cols = df[cols].copy()

    for col in cols:
        mean = numerical_cols[col].mean()
        std = numerical_cols[col].std()
        
        normalize = lambda x: (x - mean ) / std
        numerical_cols[col] = numerical_cols[col].apply(normalize)

    # fill in missing values with mean (0)
    numerical_cols[cols].fillna(0)

    return numerical_cols

def convert_bool_feats(df, cols):
    boolean_cols = df[cols].copy()

    convert_bool = lambda x: 1 if x else 0

    for col in cols:
        boolean_cols[col] = boolean_cols[col].apply(convert_bool)

    # fill in missing values with 0
    boolean_cols.fillna(0)

    return boolean_cols

def prepare_dataset(dataset, training):
    if training:
        data = pd.DataFrame(dataset[['PassengerId', 'Transported']])
    else:
        data = pd.DataFrame(dataset[['PassengerId']])

    # normalize continuous numerical features
    NUMERICAL_FEATS = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    data[NUMERICAL_FEATS] = normalize_numerical_feats(dataset, NUMERICAL_FEATS)
    # convert boolean values to 0, 1
    BOOLEAN_FEATS = ['CryoSleep', 'VIP']
    data[BOOLEAN_FEATS] = convert_bool_feats(dataset, BOOLEAN_FEATS)

    # convert nominal feats to numerical feats
    # home planet
    home_planet_dummies = pd.get_dummies(dataset['HomePlanet'], dtype=np.float64)
    home_planet_dummies.fillna(0)
    data[home_planet_dummies.columns] = home_planet_dummies

    # destination
    destination_dummies = pd.get_dummies(dataset['Destination'], dtype=np.float64)
    destination_dummies.fillna(0)
    data[destination_dummies.columns] = destination_dummies

    # cabin (exclude cabin numbers)
    cabin = dataset['Cabin']
    cabin = cabin.apply(lambda cabin: str(cabin).split('/'))
    cabin = pd.DataFrame(
        cabin.to_list(), 
        index=cabin.index,
        columns=['Deck', 'CabinNum', 'Side'])

    deck = cabin['Deck']
    side = cabin['Side']
    
    deck_dummies = pd.get_dummies(deck, dtype=np.float64)
    deck_dummies = deck_dummies.drop('nan', axis=1)
    deck_dummies.fillna(0)
    data[deck_dummies.columns] = deck_dummies

    side_dummies = pd.get_dummies(side, dtype=np.float64)
    side_dummies.fillna(0)
    data[side_dummies.columns] = side_dummies

    data = data.fillna(0)

    return data

training_data = prepare_dataset(train_set, training=True)

# Cross-Validation

In [6]:
model = SVC()

X = training_data[training_data.columns.difference(['Transported', 'PassengerId'])].to_numpy()
y = training_data['Transported'].to_numpy()

print(cross_val_score(model, X, y, cv=5))

[0.77918344 0.79700978 0.79758482 0.81012658 0.80379747]


# Predicting on Test Set

Reaches around 80% accuracy on test set

In [7]:
# load raw test dataset
test_set = pd.read_csv('../data/test.csv')

test_data = prepare_dataset(test_set, training=False)

X_test = test_data[test_data.columns.difference(['PassengerId'])].to_numpy()

model = SVC()
model.fit(X, y)

predictions = model.predict(X_test)
predictions = pd.DataFrame(predictions.tolist(), columns=['Transported'])

submission_data = pd.concat([test_set['PassengerId'].copy(), predictions], axis=1)
submission_data['Transported'] = submission_data['Transported'].apply(lambda l: True if l == 1 else False)
submission_data.to_csv('../submissions/svc_03-21.csv', index=False)

### Custom SVM

In [8]:
X = training_data[training_data.columns.difference(['Transported', 'PassengerId'])].to_numpy()
y = training_data['Transported'].to_numpy()

def cross_validate(X, Y, num_folds, ModelClass):
    cv_scores = []
    X = X
    Y = Y

    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=101)

    for i, (train_idx, test_idx) in enumerate(kfold.split(X, Y)):
        X_train = X[train_idx]
        Y_train = Y[train_idx]

        X_test = X[test_idx]
        Y_test = Y[test_idx]

        model = ModelClass()
        model.fit(X_train, Y_train, lamb=1e-1, num_iters=10000)

        scores = model.evaluate(X_test, Y_test)
        cv_scores.append(scores)

    acc = np.mean(cv_scores)
    print('Cross-Validated Scores - acc: %.2f' % acc)

cross_validate(X, y, 5, SoftSVM)

Cross-Validated Scores - acc: 0.78


### Predict on test set

Reaches about 77% accuracy on test set

In [9]:
model = SoftSVM()
model.fit(X, y)

predictions = model.predict(X_test)
predictions = pd.DataFrame(predictions.tolist(), columns=['Transported'])

submission_data = pd.concat([test_set['PassengerId'].copy(), predictions], axis=1)
submission_data['Transported'] = submission_data['Transported'].apply(lambda l: True if l == 1 else False)
submission_data.to_csv('../submissions/ssvm_03-21.csv', index=False)

In [10]:
cross_validate(X, y, 5, KSVM)

Cross-Validated Scores - acc: 0.73


In [11]:
model = KSVM()
model.fit(X, y, num_iters=10000)

predictions = model.predict(X_test)
predictions = pd.DataFrame(predictions.tolist(), columns=['Transported'])

submission_data = pd.concat([test_set['PassengerId'].copy(), predictions], axis=1)
submission_data['Transported'] = submission_data['Transported'].apply(lambda l: True if l == 1 else False)
submission_data.to_csv('../submissions/ksvm.csv', index=False)