In [1]:
# Import packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

In [2]:
# Load the train and test splits from CSV files in the working directory

train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Show the column labels of the raw training frame
train.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')

In [4]:
def split_cols(df):
    '''
    Adds engineered columns derived from PassengerId, Cabin, Name and the spend columns.

    New columns:
        PassengerGroup, PassengerNum: the two '_'-separated parts of PassengerId
        CabinDeck, CabinNum, CabinSide: the three '/'-separated parts of Cabin
        FirstName, LastName: first and second whitespace-separated words of Name
        TotalSpend: sum of the five spend columns (NaN if any of them is NaN)

    Missing Cabin / Name values stay missing (NaN) in the derived columns.

    Arg: takes dataframe (modified in place)
    Returns: the same dataframe with the new columns added
    '''
    cabin_pattern = re.compile('[A-Za-z]/[0-9]+/[A-Za-z]')

    def cabin_part(cabin, position):
        # Only values that look like deck/num/side are split; anything else
        # (e.g. NaN) is passed through unchanged
        text = str(cabin)
        return text.split('/')[position] if cabin_pattern.search(text) else cabin

    def name_part(name, position):
        # Non-string names (NaN) must not be stringified, otherwise a missing
        # name turns into the literal 'nan'. Names with too few words are
        # passed through unchanged.
        words = name.split() if isinstance(name, str) else []
        return words[position] if len(words) > position else name

    id_parts = df['PassengerId'].str.split('_')
    df['PassengerGroup'] = id_parts.str[0]
    df['PassengerNum'] = id_parts.str[1]

    df['CabinDeck'] = df['Cabin'].apply(cabin_part, position=0)
    df['CabinNum'] = df['Cabin'].apply(cabin_part, position=1)
    df['CabinSide'] = df['Cabin'].apply(cabin_part, position=2)

    df['FirstName'] = df['Name'].apply(name_part, position=0)
    df['LastName'] = df['Name'].apply(name_part, position=1)

    # Plain addition on purpose: a missing value in any spend column makes the total NaN
    df['TotalSpend'] = df['RoomService'] + df['FoodCourt'] + df['ShoppingMall'] + df['Spa'] + df['VRDeck']

    return df


In [5]:
# Add the engineered columns to the training frame, then take an independent
# working copy so later imputation can be compared against `train`
train = split_cols(train)
df = train.copy()

In [6]:
def fill_cryo_na(df):
    '''
    Replaces NaN CryoSleep for passengers with spending > 0 with False

    Arg: takes dataframe (modified in place)
    Returns: the same dataframe
    '''
    spend_cols = [
        'RoomService',
        'FoodCourt',
        'ShoppingMall',
        'Spa',
        'VRDeck']

    # True where at least one spend column is strictly positive
    # (a NaN spend compares as False, so it never counts as spending)
    has_spend = (df[spend_cols] > 0).any(axis=1)

    # The comparison is evaluated before the `&`; writing
    # `isnull() & df[col] > 0` would instead parse as `(isnull() & df[col]) > 0`
    needs_fill = df['CryoSleep'].isnull() & has_spend

    # If any spending is found, change CryoSleep to False; otherwise keep it as is
    df['CryoSleep'] = np.where(needs_fill, False, df['CryoSleep'])

    return df
    

In [7]:
fill_cryo_na(df)

# Number of CryoSleep NaNs removed by the fill: null count before minus null count after
cryo_na_before = int(train['CryoSleep'].isnull().sum())
cryo_na_after = int(df['CryoSleep'].isnull().sum())
print("CryoSleep NA filled:", cryo_na_before - cryo_na_after)

CryoSleep NA filled: 119


In [15]:
def fillna_planet(df):
    '''
    Fills missing home planet based on EDA conclusions:

    1. Only Europa passengers are on decks A, B, C, or T
    2. Only those on deck G are from Earth
    3. High majority of those with PSO J318.5-22 destination are from Earth (89%)

    Rows whose HomePlanet is already known are never modified.

    Arg: takes dataframe (modified in place)
    Returns: updated dataframe

    '''

    # Decks A, B, C, T only have Europa passengers
    # Deck D has Mars/Europa only
    # Deck E has all planets
    # Deck F has Earth/Mars
    # People in deck G are all from Earth

    europa_decks = ['A', 'B', 'C', 'T']

    deck = df['CabinDeck']

    # Only rows with a missing HomePlanet are candidates for filling
    missing = df['HomePlanet'].isnull()

    # Rule 1 takes priority over the Earth rules, mirroring an if/elif chain
    to_europa = missing & deck.isin(europa_decks)

    # Rule 2 (deck G) or rule 3 (PSO destination, except deck D which has no
    # Earth passengers). A missing deck compares as != 'D', so rule 3 still applies.
    pso_bound = (df['Destination'] == 'PSO J318.5-22') & (deck != 'D')
    to_earth = missing & ~to_europa & ((deck == 'G') | pso_bound)

    df['HomePlanet'] = df['HomePlanet'].mask(to_europa, 'Europa').mask(to_earth, 'Earth')

    return df

In [9]:
# Impute HomePlanet on the working copy, then report how many NaNs were removed
df = fillna_planet(df)

planet_na_before = int(train['HomePlanet'].isnull().sum())
planet_na_after = int(df['HomePlanet'].isnull().sum())
print("Home Planet NA filled:", planet_na_before - planet_na_after)

Home Planet NA filled: 97


In [10]:
# Throws "truth value of a Series is ambiguous" error: `and` cannot combine two Series element-wise; use `&` instead
# train['HomePlanet'] = np.where(
#   (train['HomePlanet'].isnull()) and (train['CabinDeck'] == 'T'), 'Europa', df['CabinDeck'])

In [11]:
def fillna_cabin(df):
    '''
    Assigns deck G to passengers without a recognised cabin deck who are
    from Earth and in cryo sleep. All other rows keep their CabinDeck value.

    Arg: takes dataframe (modified in place)
    Returns: updated dataframe
    '''
    known_decks = ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'T')

    # A deck outside the known letters (e.g. NaN) is treated as missing
    df['CabinDeck'] = [
        'G' if (deck not in known_decks and planet == 'Earth' and asleep == True) else deck
        for deck, planet, asleep in zip(df['CabinDeck'], df['HomePlanet'], df['CryoSleep'])
    ]

    return df

In [12]:
# Impute CabinDeck on the working copy, then report how many NaNs were removed
df = fillna_cabin(df)

deck_na_before = int(train['CabinDeck'].isnull().sum())
deck_na_after = int(df['CabinDeck'].isnull().sum())
print("Cabin deck NA filled:", deck_na_before - deck_na_after)

Cabin deck NA filled: 36


In [13]:
# Count rows that contain at least one NaN, before and after imputation.
# The second figure is printed as a fraction of the originally incomplete rows.
rows_na_before = int(train.isnull().any(axis=1).sum())
rows_na_after = int(df.isnull().any(axis=1).sum())
rows_na_filled = rows_na_before - rows_na_after
print("Total NA filled:", rows_na_filled)
print("Total % NA filled:", rows_na_filled / rows_na_before)

Total NA filled: 174
Total % NA filled: 0.08337326305701964


In [14]:
# Rows of the working copy that still contain at least one NaN
df[df.isnull().any(axis=1)]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,Name,Transported,PassengerGroup,PassengerNum,CabinDeck,CabinNum,CabinSide,FirstName,LastName,TotalSpend
7,0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,...,Candra Jacostaffey,True,0006,02,G,0,S,Candra,Jacostaffey,
10,0008_02,Europa,True,B/1/P,TRAPPIST-1e,34.0,False,0.0,0.0,,...,Altardr Flatic,True,0008,02,B,1,P,Altardr,Flatic,
15,0012_01,Earth,False,,TRAPPIST-1e,31.0,False,32.0,0.0,876.0,...,Justie Pooles,False,0012,01,,,,Justie,Pooles,908.0
16,0014_01,Mars,False,F/3/P,55 Cancri e,27.0,False,1286.0,122.0,,...,Flats Eccle,False,0014,01,F,3,P,Flats,Eccle,
23,0020_03,Earth,True,E/0/S,55 Cancri e,29.0,False,0.0,0.0,,...,Mollen Mcfaddennon,False,0020,03,E,0,S,Mollen,Mcfaddennon,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8665,9247_01,Earth,True,G/1491/S,TRAPPIST-1e,33.0,False,0.0,0.0,0.0,...,Phia Cleang,False,9247,01,G,1491,S,Phia,Cleang,
8666,9248_01,,False,F/1792/S,55 Cancri e,38.0,,28.0,1208.0,973.0,...,Gian Perle,True,9248,01,F,1792,S,Gian,Perle,2416.0
8667,9250_01,Europa,False,E/597/P,TRAPPIST-1e,29.0,False,0.0,2972.0,,...,Chain Reedectied,True,9250,01,E,597,P,Chain,Reedectied,
8674,9257_01,,False,F/1892/P,TRAPPIST-1e,13.0,False,39.0,0.0,1085.0,...,Ties Apple,False,9257,01,F,1892,P,Ties,Apple,1148.0
