# Spaceship Titanic

## Data Cleaning and Pipeline

In [1005]:
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Load the raw training data. All cleaning below is done on the copy `temp`,
# so `train` / `df5` are not modified by the steps that follow.
# NOTE(review): the path is machine-specific; the file must exist at this location.
train = pd.read_csv('/Users/adityajoshi/Documents/VisualStudioCode/VSC/Kaggle Comps/Spaceship Titanic/train-SpaceshipTitanic.csv')
df5 = pd.DataFrame(train)
temp = df5.copy()

# PassengerId has the form "gggg_pp". Store the two parts as the integer
# columns `group` and `pp`, and remove the original id column.
id_parts = temp.pop("PassengerId").str.split(r'_+', expand=True)
temp["group"] = id_parts[0].astype(int)
temp["pp"] = id_parts[1].astype(int)


# Age: fill missing values with the column median, then round and store as int
age_median = temp["Age"].median()
temp["Age"] = temp["Age"].fillna(age_median).round().astype(int)

# Passengers with no recorded name get the placeholder "FirstName LastName",
# so they all end up with the surname "LastName".
temp["Name"] = temp["Name"].fillna("FirstName LastName")

# Keep only the surname: the last whitespace-separated token of each name.
# Taking the final token row by row tolerates single-word and multi-part names,
# and avoids building a first_name column that is never used.
temp["last_name"] = temp["Name"].str.split().str[-1]
temp.drop(columns=["Name"], inplace=True)

# Service spending: a missing amount is assumed to mean the passenger did not
# use that service, so every NaN in these columns is replaced with 0.
service_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
temp[service_cols] = temp[service_cols].fillna(0)


# Cabin has the form "Deck/Num/Side". A missing cabin is treated as
# "Unknown/0/Unknown", which gives it its own Deck/Side level and Num 0.
cabin_parts = temp["Cabin"].fillna("Unknown/0/Unknown").str.split("/", expand=True)
temp["Deck"] = cabin_parts[0]
# Store the cabin number as an integer, like `group` and `pp` from PassengerId,
# so it is usable as a numeric feature instead of a column of digit strings.
temp["Num"] = cabin_parts[1].astype(int)
temp["Side"] = cabin_parts[2]

# One-hot encode Deck and Side (every level kept, no baseline dropped),
# then discard the raw Cabin column.
temp_pandas_encoded = pd.get_dummies(temp, columns=['Deck', 'Side'], drop_first=False)
temp_pandas_encoded.drop(columns=["Cabin"], inplace=True)
temp = temp_pandas_encoded.copy()

# A missing Destination takes the passenger's HomePlanet value
# (it stays NaN when HomePlanet is missing as well).
dest_missing = temp["Destination"].isna()
temp.loc[dest_missing, "Destination"] = temp.loc[dest_missing, "HomePlanet"]

# CryoSleep unknown but at least one service amount is non-zero: assume the
# passenger was awake, so CryoSleep becomes False.
spent_on_any_service = temp[service_cols].ne(0).any(axis=1)
q = temp[temp["CryoSleep"].isna() & spent_on_any_service]
temp.loc[q.index, "CryoSleep"] = False


# CryoSleep still unknown and every service amount is exactly 0: assume the
# passenger was in cryosleep, so CryoSleep becomes True.
spent_on_no_service = temp[service_cols].eq(0).all(axis=1)
v = temp[temp["CryoSleep"].isna() & spent_on_no_service]
temp.loc[v.index, "CryoSleep"] = True


# VIP imputation. The rules run in order, and each one only touches rows whose
# VIP value is still missing when it runs.

# Rule 1: the minimum Age observed among VIP passengers was 18, so a passenger
# younger than 18 with unknown VIP status is marked non-VIP.
q1 = temp[temp["VIP"].isna() & (temp["Age"] < 18)]
temp.loc[q1.index, "VIP"] = False

# Rule 2: no VIP passenger was observed on deck G, so a deck-G passenger with
# unknown VIP status is marked non-VIP.
q2 = temp[temp["VIP"].isna() & (temp["Deck_G"] != 0)]
temp.loc[q2.index, "VIP"] = False

# Rule 3: for the remaining rows, compare the passenger's average spend across
# the service columns with the mean of that per-row average among known VIPs
# and among known non-VIPs. The passenger is marked VIP only when their value
# is strictly closer to the VIP mean.
vip_true_mean = temp.loc[temp["VIP"].eq(True), service_cols].mean(axis=1).mean()
vip_false_mean = temp.loc[temp["VIP"].eq(False), service_cols].mean(axis=1).mean()
mask_missing = temp["VIP"].isna()
row_avg_spend = temp.loc[mask_missing, service_cols].mean(axis=1)
distance_to_vip = (row_avg_spend - vip_true_mean).abs()
distance_to_non_vip = (row_avg_spend - vip_false_mean).abs()
temp.loc[mask_missing, "VIP"] = distance_to_vip < distance_to_non_vip



# Any HomePlanet / Destination still missing becomes the category "Unknown";
# both columns are then one-hot encoded with every level kept.
temp = temp.fillna({"HomePlanet": "Unknown", "Destination": "Unknown"})
temp_pandas_encoded = pd.get_dummies(temp, columns=['HomePlanet', 'Destination'])
temp = temp_pandas_encoded.copy()

# Convert every boolean column to 0/1 integers.
# CryoSleep and VIP originally contained missing values, so pandas typically
# holds them as object-dtype columns of Python bools, which a `dtype == bool`
# check does not match. infer_objects() first promotes object columns that
# contain only bools to real bool dtype; other object columns are left alone.
temp = temp.infer_objects()
bool_cols = temp.select_dtypes(include="bool").columns
temp = temp.astype({name: int for name in bool_cols})




temp.columns



Index(['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall',
       'Spa', 'VRDeck', 'Transported', 'group', 'pp', 'last_name', 'Num',
       'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G',
       'Deck_T', 'Deck_Unknown', 'Side_P', 'Side_S', 'Side_Unknown',
       'HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars',
       'HomePlanet_Unknown', 'Destination_55 Cancri e', 'Destination_Earth',
       'Destination_Europa', 'Destination_Mars', 'Destination_PSO J318.5-22',
       'Destination_TRAPPIST-1e', 'Destination_Unknown'],
      dtype='object')