### Imports

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw training split from the local competition folder.
df = pd.read_csv(filepath_or_buffer='spaceship-titanic/train.csv')

### Basic Feature Engineering

In [3]:
def process_dataframe(df):
    """Return a copy of ``df`` with numeric flag columns and cabin parts added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``CryoSleep``, ``VIP`` and ``Cabin``.
        ``Cabin`` is split on ``'/'``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched, so re-running a
        cell that calls this function always starts from the same input.
        ``CryoSleep`` and ``VIP`` are cast to float (True -> 1.0,
        False -> 0.0, missing stays NaN). ``CabinDeck``, ``CabinNum`` and
        ``CabinSide`` hold the first, second and third ``'/'``-separated
        piece of ``Cabin`` as strings (NaN where ``Cabin`` is missing or has
        too few pieces).

    Raises
    ------
    ValueError
        If a flag column holds a value that cannot be cast to float.
    """
    # Work on a copy so the caller's DataFrame is not mutated as a side effect.
    df = df.copy()

    # Cast explicitly instead of `replace({True: 1, False: 0})`: that relied on
    # pandas silently downcasting the object column to numeric, which is
    # deprecated as of pandas 2.2. The cast is also a no-op on values that are
    # already 1.0 / 0.0, so calling this function twice is safe.
    for flag_col in ('CryoSleep', 'VIP'):
        df[flag_col] = df[flag_col].astype(float)

    # Split the cabin string once and pick the pieces by position.
    cabin_parts = df['Cabin'].str.split('/')
    for position, part_col in enumerate(('CabinDeck', 'CabinNum', 'CabinSide')):
        df[part_col] = cabin_parts.str.get(position)

    return df

In [4]:
# Apply the shared feature-engineering step to the training frame.
df = process_dataframe(df=df)

In [5]:
def select_target_and_features(df, is_train_data=True):
    """Build the model feature matrix and, for training data, the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns listed below, plus
        ``Transported`` when ``is_train_data`` is true.
    is_train_data : bool, default True
        When true, the ``Transported`` column is returned alongside the
        features.

    Returns
    -------
    (X, y) when ``is_train_data`` is true, otherwise just X.
        X is the selected columns passed through ``pd.get_dummies`` with
        ``drop_first=True``; y is ``df['Transported']``.

    Notes
    -----
    ``pd.get_dummies`` only creates columns for categories present in the
    frame it receives, so the resulting columns depend on the data passed in.
    """
    feature_columns = [
        'HomePlanet', 'CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt',
        'ShoppingMall', 'Spa', 'VRDeck', 'CabinSide',
    ]
    features = pd.get_dummies(df[feature_columns], drop_first=True)

    # Guard clause: data without labels only yields the features.
    if not is_train_data:
        return features

    return features, df['Transported']

In [6]:
# Split the processed training frame into the feature matrix and the target.
X, y = select_target_and_features(df, is_train_data=True)

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=99,
    test_size=0.2,
)

### Handling the missing values

In [9]:
from sklearn.impute import SimpleImputer

In [10]:
# Imputer that replaces NaN entries with the mean of their column.
imp = SimpleImputer(strategy='mean', missing_values=np.nan)

In [11]:
# Learn the column means from the training split only, then fill both splits
# with those same statistics so nothing from validation leaks into the fit.
imp.fit(X_train)
X_train = imp.transform(X_train)
X_val = imp.transform(X_val)

### Building out a model

In [12]:
from sklearn.ensemble import RandomForestClassifier

In [13]:
# Seed the forest so its random choices, and therefore the validation report
# and the submission file, are reproducible on Restart & Run All. The seed
# matches the one used for the train/validation split.
rf = RandomForestClassifier(random_state=99)

In [14]:
# Train the forest on the imputed training split.
rf.fit(X=X_train, y=y_train)

In [15]:
# Predict labels for the held-out validation rows.
y_pred = rf.predict(X=X_val)

### Evaluating performance

In [16]:
from sklearn.metrics import classification_report

In [17]:
# Per-class precision / recall / F1 on the held-out validation split.
val_report = classification_report(y_true=y_val, y_pred=y_pred)
print(val_report)

              precision    recall  f1-score   support

       False       0.78      0.81      0.79       854
        True       0.81      0.78      0.79       885

    accuracy                           0.79      1739
   macro avg       0.79      0.79      0.79      1739
weighted avg       0.79      0.79      0.79      1739



In [18]:
# Refit the same forest on the training and validation rows stacked together,
# so the model used for the test predictions has seen all labelled data.
X_all = np.concatenate([X_train, X_val])
y_all = np.concatenate([y_train, y_val])
rf.fit(X_all, y_all)

### Preparing the Test Dataset

In [19]:
# Load the unlabelled test split from the local competition folder.
df_test = pd.read_csv(filepath_or_buffer='spaceship-titanic/test.csv')

In [21]:
# Apply the same feature-engineering step that was used on the training frame.
df_test = process_dataframe(df=df_test)

In [23]:
# Build the test features, then align them to the training feature columns.
# pd.get_dummies only emits columns for categories present in the frame it is
# given, so a category missing from (or new in) the test set would otherwise
# change the column set/order that `imp` and `rf` were fitted on. Dummy columns
# absent from the test set are added as all-zero; unseen ones are dropped.
X_test = select_target_and_features(df_test, is_train_data=False)
X_test = X_test.reindex(columns=X.columns, fill_value=0)

In [27]:
# Fill missing test values with the already-fitted imputer (transform only, no re-fit).
X_test = imp.transform(X=X_test)

### Predicting on the Test Data

In [36]:
# Predict the label for every row of the imputed test features.
test_preds_array = rf.predict(X=X_test)

### Creating the Submission Data

In [41]:
# Pair each passenger id with its prediction. The prediction Series is given
# df_test's index explicitly because pd.concat(axis=1) aligns rows by index
# label, not by position; this keeps ids and predictions matched even if
# df_test does not carry a default 0..n-1 index. The unnamed Series still
# becomes column 0.
submission_df = pd.concat(
    [df_test.PassengerId, pd.Series(test_preds_array, index=df_test.index)],
    axis=1,
)

In [45]:
# Give the prediction column (labelled 0) its submission name.
submission_df = submission_df.rename({0: 'Transported'}, axis='columns')

In [46]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv(path_or_buf='results.csv', index=False)

In [47]:
# Preview the first five rows of the submission.
submission_df.head(n=5)

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False
