# Spaceship Titanic Classification w/ Binary Logistic Regression

#### Dataset: https://www.kaggle.com/competitions/spaceship-titanic/overview
##### Dataset License: https://creativecommons.org/licenses/by/4.0/

###### Author: Cody Weaver

### Load and Process Dataset

In [1]:
import pandas as pd
import numpy as np
from lr import LogisticRegressionModel
from sklearn.model_selection import KFold

### Training Set

In [2]:
# load raw train dataset
train_set = pd.read_csv('../data/train.csv')
print(train_set.head())
print(train_set.dtypes)

# convert bool labels to 0-1
def convert_labels(df, label_col='Transported'):
    return df[label_col].apply(lambda l: 1 if l else 0)

train_set['Transported'] = convert_labels(train_set)

  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
0     0001_01     Europa     False  B/0/P  TRAPPIST-1e  39.0  False   
1     0002_01      Earth     False  F/0/S  TRAPPIST-1e  24.0  False   
2     0003_01     Europa     False  A/0/S  TRAPPIST-1e  58.0   True   
3     0003_02     Europa     False  A/0/S  TRAPPIST-1e  33.0  False   
4     0004_01      Earth     False  F/1/S  TRAPPIST-1e  16.0  False   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck               Name  \
0          0.0        0.0           0.0     0.0     0.0    Maham Ofracculy   
1        109.0        9.0          25.0   549.0    44.0       Juanna Vines   
2         43.0     3576.0           0.0  6715.0    49.0      Altark Susent   
3          0.0     1283.0         371.0  3329.0   193.0       Solam Susent   
4        303.0       70.0         151.0   565.0     2.0  Willy Santantines   

   Transported  
0        False  
1         True  
2        False  
3        False  
4         True  
Pa

### Classification using amount billed for amenities only

#### Process and Normalize data

In [3]:
amenities_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

amenities_data = train_set.copy()

def normalize_amenity_data(df, col_names):
    # normalize to mean zero and unit variance
    for column in amenities_columns:
        column_mean = amenities_data[column].mean()
        column_std = amenities_data[column].std()
        normalize = lambda x: (x - column_mean) / column_std
        amenities_data[column] = normalize(amenities_data[column])

    # fill in missing values for amenities
    amenities_data[amenities_columns] = amenities_data[amenities_columns].fillna(0)
    
    return amenities_data[amenities_columns]

amenities_data[amenities_columns] = normalize_amenity_data(train_set, amenities_columns)

#### Train Model and validate model on amenities dataset

In [4]:
num_epochs = 5
num_folds = 5

def cross_validate(X, Y, num_epochs, num_folds, ModelClass):
    # shuffle data
    X = X.sample(frac=1).reset_index(drop=True)

    cv_scores = []
    X = X.to_numpy()
    Y = Y.to_numpy()

    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=101)

    for i, (train_idx, test_idx) in enumerate(kfold.split(X, Y)):
        X_train = X[train_idx]
        Y_train = Y[train_idx]

        X_test = X[test_idx]
        Y_test = Y[test_idx]

        model = ModelClass(silent=True)
        model.fit(
            X_train,
            Y_train,
            max_iter=100,
        )

        scores = model.evaluate(X_test, Y_test)
        cv_scores.append(scores)

    acc = np.mean(cv_scores)
    print('Cross-Validated Scores - acc: %.2f' % acc)

cross_validate(
    amenities_data[amenities_columns],
    amenities_data['Transported'],
    num_epochs,
    num_folds,
    LogisticRegressionModel
)

Cross-Validated Scores - acc: 0.49


Overall, using only the amenities billing data performs poorly.

# Logistic Regression

Logistic Regression using every feature, except for PassengerId, Name, and CabinNum.

In [5]:
def normalize_numerical_feats(df, cols):
    numerical_cols = df[cols].copy()

    for col in cols:
        mean = numerical_cols[col].mean()
        std = numerical_cols[col].std()
        
        normalize = lambda x: (x - mean ) / std
        numerical_cols[col] = numerical_cols[col].apply(normalize)

    # fill in missing values with mean (0)
    numerical_cols[cols].fillna(0)

    return numerical_cols

def convert_bool_feats(df, cols):
    boolean_cols = df[cols].copy()

    convert_bool = lambda x: 1 if x else 0

    for col in cols:
        boolean_cols[col] = boolean_cols[col].apply(convert_bool)

    # fill in missing values with 0
    boolean_cols.fillna(0)

    return boolean_cols

def prepare_dataset(dataset, training):
    if training:
        data = pd.DataFrame(dataset[['PassengerId', 'Transported']])
    else:
        data = pd.DataFrame(dataset[['PassengerId']])

    # normalize continuous numerical features
    NUMERICAL_FEATS = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    data[NUMERICAL_FEATS] = normalize_numerical_feats(dataset, NUMERICAL_FEATS)
    # convert boolean values to 0, 1
    BOOLEAN_FEATS = ['CryoSleep', 'VIP']
    data[BOOLEAN_FEATS] = convert_bool_feats(dataset, BOOLEAN_FEATS)

    # convert nominal feats to numerical feats
    # home planet
    home_planet_dummies = pd.get_dummies(dataset['HomePlanet'], dtype=np.float64)
    home_planet_dummies.fillna(0)
    data[home_planet_dummies.columns] = home_planet_dummies

    # destination
    destination_dummies = pd.get_dummies(dataset['Destination'], dtype=np.float64)
    destination_dummies.fillna(0)
    data[destination_dummies.columns] = destination_dummies

    # cabin (exclude cabin numbers)
    cabin = dataset['Cabin']
    cabin = cabin.apply(lambda cabin: str(cabin).split('/'))
    cabin = pd.DataFrame(
        cabin.to_list(), 
        index=cabin.index,
        columns=['Deck', 'CabinNum', 'Side'])

    deck = cabin['Deck']
    side = cabin['Side']
    
    deck_dummies = pd.get_dummies(deck, dtype=np.float64)
    deck_dummies = deck_dummies.drop('nan', axis=1)
    deck_dummies.fillna(0)
    data[deck_dummies.columns] = deck_dummies

    side_dummies = pd.get_dummies(side, dtype=np.float64)
    side_dummies.fillna(0)
    data[side_dummies.columns] = side_dummies

    data = data.fillna(0)

    return data

data = prepare_dataset(train_set, training=True)

### Train Model

In [6]:
# feature column names
feature_data = data[data.columns.difference([
    'PassengerId', 'Transported'
])]

model_dim = len(feature_data.columns)

model = LogisticRegressionModel(model_dim, silent=True)
model.fit(feature_data.to_numpy(), data['Transported'].to_numpy(), max_iter=20, fit_bias=True)
print(model.evaluate(feature_data.to_numpy(), data['Transported'].to_numpy()))

print(model.weights)
print(model.bias)

0.7584263200276085
[ 4.19499253e-01 -7.46235699e-01 -4.30652883e-01  1.24216340e+00
  2.40729714e+00  1.14349414e+01 -5.65235063e-01 -3.20035933e+00
 -6.81521356e+00  2.99014566e+00 -1.70940782e+00  3.99821677e+00
 -6.33264513e-01  5.24339222e-01 -3.23621736e+00 -6.89090295e-01
 -4.90770202e+00 -6.62454379e-04  1.91331827e+00 -8.03864238e+00
 -3.18379268e-02 -3.41063379e+00 -3.07327091e-01 -7.40169695e+00]
-3.785819188691545


### Predict on test set

In [7]:
# load raw test dataset
test_set = pd.read_csv('../data/test.csv')

test_data = prepare_dataset(test_set, training=False)

test_features = test_data[test_data.columns.difference(['PassengerId'])]

predictions = model.predict(test_features.to_numpy())
predictions = pd.DataFrame(predictions.tolist(), columns=['Transported'])

submission_data = pd.concat([test_set['PassengerId'].copy(), predictions], axis=1)
submission_data['Transported'] = submission_data['Transported'].apply(lambda l: True if l else False)
submission_data.to_csv('../submissions/lr_03-20.csv', index=False)