In [2]:
# Import packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load data

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
# Training columns
train.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')

In [6]:
# Store columns
features = test.columns
labels = train['Transported']

In [7]:
print('Training size:', len(train))
print('Test size:', len(test))

Training size: 8693
Test size: 4277


In [8]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [33]:
train.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

In [41]:
train.isnull().values.sum()

2324

In [9]:
# Check which rows have null values
nan_rows = train[train.isnull().T.any()]

nan_rows

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
7,0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,,Candra Jacostaffey,True
10,0008_02,Europa,True,B/1/P,TRAPPIST-1e,34.0,False,0.0,0.0,,0.0,0.0,Altardr Flatic,True
15,0012_01,Earth,False,,TRAPPIST-1e,31.0,False,32.0,0.0,876.0,0.0,0.0,Justie Pooles,False
16,0014_01,Mars,False,F/3/P,55 Cancri e,27.0,False,1286.0,122.0,,0.0,0.0,Flats Eccle,False
23,0020_03,Earth,True,E/0/S,55 Cancri e,29.0,False,0.0,0.0,,0.0,0.0,Mollen Mcfaddennon,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8667,9250_01,Europa,False,E/597/P,TRAPPIST-1e,29.0,False,0.0,2972.0,,28.0,188.0,Chain Reedectied,True
8674,9257_01,,False,F/1892/P,TRAPPIST-1e,13.0,False,39.0,0.0,1085.0,24.0,0.0,Ties Apple,False
8675,9259_01,Earth,,F/1893/P,TRAPPIST-1e,44.0,False,1030.0,1015.0,0.0,11.0,,Annah Gilleyons,True
8684,9274_01,,True,G/1508/P,TRAPPIST-1e,23.0,False,0.0,0.0,0.0,0.0,0.0,Chelsa Bullisey,True


## PassengerId - no nulls

In [12]:
nan_rows[nan_rows['PassengerId'].isnull()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported


## HomePlanet - 201 rows with null value

Possible to guess by destination?

* TRAPPIST-1e = most common home planet is Earth (52% of passengers with same destination), then split somewhat evenly between Mars and Europa
* 55 Cancri e = most common home planet is Europa (49.2% of passengers with same destination), then Earth, then Mars
* PSO J318.5-22 = most common home planet is Earth (89% of passengers with same destination)

Maybe guess by family member / travelling partner:
* Shared first numbers in passenger_id - should have a match if PassengerID ends with _02+

Otherwise: input a value that indicates missing info?



In [29]:
train[train['PassengerId'] == '0064_01']

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
58,0064_01,Mars,True,F/14/S,TRAPPIST-1e,15.0,False,0.0,0.0,0.0,0.0,0.0,,True


In [13]:
nan_rows[nan_rows['HomePlanet'].isnull()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
59,0064_02,,True,E/3/S,TRAPPIST-1e,33.0,False,0.0,0.0,,0.0,0.0,Colatz Keen,True
113,0119_01,,False,A/0/P,TRAPPIST-1e,39.0,False,0.0,2344.0,0.0,65.0,6898.0,Batan Coning,False
186,0210_01,,True,D/6/P,55 Cancri e,24.0,False,0.0,0.0,,0.0,0.0,Arraid Inicont,True
225,0242_01,,False,F/46/S,TRAPPIST-1e,18.0,False,313.0,1.0,691.0,283.0,0.0,Almone Sté,False
234,0251_01,,True,C/11/S,55 Cancri e,54.0,False,0.0,0.0,0.0,0.0,0.0,Diphah Amsive,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8515,9084_01,,False,E/582/P,TRAPPIST-1e,25.0,False,1258.0,0.0,22.0,19.0,0.0,Jurs Mone,False
8613,9194_01,,False,E/603/S,55 Cancri e,53.0,False,0.0,4017.0,0.0,13.0,3147.0,,False
8666,9248_01,,False,F/1792/S,55 Cancri e,38.0,,28.0,1208.0,973.0,207.0,0.0,Gian Perle,True
8674,9257_01,,False,F/1892/P,TRAPPIST-1e,13.0,False,39.0,0.0,1085.0,24.0,0.0,Ties Apple,False


In [24]:
nan_rows['Destination'][nan_rows['HomePlanet'].isnull()].value_counts()

TRAPPIST-1e      150
55 Cancri e       31
PSO J318.5-22     16
Name: Destination, dtype: int64

In [22]:
train['HomePlanet'].value_counts()

Earth     4602
Europa    2131
Mars      1759
Name: HomePlanet, dtype: int64

In [23]:
train['Destination'].value_counts()

TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
Name: Destination, dtype: int64

In [15]:
train['HomePlanet'][train['Destination'] == 'TRAPPIST-1e'].value_counts()

Earth     3101
Mars      1475
Europa    1189
Name: HomePlanet, dtype: int64

In [16]:
train['HomePlanet'][train['Destination'] == '55 Cancri e'].value_counts()

Europa    886
Earth     690
Mars      193
Name: HomePlanet, dtype: int64

In [17]:
train['HomePlanet'][train['Destination'] == 'PSO J318.5-22'].value_counts()

Earth     712
Mars       49
Europa     19
Name: HomePlanet, dtype: int64

## CryoSleep - 217 rows with null value

* Anyone with any value in VIP, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck cannot have been in Cryo sleep - mark these people true
* Cannot necessarily do the same for reverse (87 rows with 0.0 values for all columns) - spending is not high even for awake passengers

In [55]:
nan_rows[nan_rows['CryoSleep'].isnull()].sort_values('FoodCourt')

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
92,0099_02,Earth,,G/12/P,TRAPPIST-1e,2.0,False,0.0,0.0,0.0,0.0,0.0,Thewis Connelson,True
5270,5625_02,Mars,,E/352/P,PSO J318.5-22,49.0,False,0.0,0.0,0.0,0.0,0.0,Cros Nane,False
5388,5756_04,Earth,,F/1194/P,TRAPPIST-1e,23.0,False,0.0,0.0,0.0,0.0,0.0,,False
5531,5897_01,Europa,,C/221/S,55 Cancri e,25.0,False,0.0,0.0,0.0,0.0,0.0,,True
5588,5952_01,Mars,,F/1134/S,55 Cancri e,22.0,False,0.0,0.0,0.0,0.0,0.0,Rants Giba,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7896,8431_02,Europa,,B/277/P,55 Cancri e,26.0,False,1.0,4680.0,25.0,564.0,13.0,Zinoces Myling,False
1770,1882_01,Europa,,C/71/S,55 Cancri e,18.0,False,0.0,16263.0,72.0,0.0,15.0,Magnon Antcal,True
3232,3476_01,Earth,,G/571/P,55 Cancri e,4.0,False,0.0,,0.0,0.0,0.0,Gabrie Joycerton,True
5370,5736_01,Earth,,F/1189/P,TRAPPIST-1e,23.0,False,1.0,,1.0,0.0,618.0,Ton Maynardner,False


In [36]:
# People in cryo sleep cannot spend money on ship services
train[train['CryoSleep'] == True].describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,2955.0,2969.0,2967.0,2941.0,2972.0,2975.0
mean,27.405415,0.0,0.0,0.0,0.0,0.0
std,15.080469,0.0,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,18.0,0.0,0.0,0.0,0.0,0.0
50%,26.0,0.0,0.0,0.0,0.0,0.0
75%,37.0,0.0,0.0,0.0,0.0,0.0
max,78.0,0.0,0.0,0.0,0.0,0.0


In [37]:
# Spending is still low even if you weren't in cryo sleep
train[train['CryoSleep'] == False].describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,5343.0,5328.0,5329.0,5335.0,5326.0,5320.0
mean,29.651319,350.146772,713.004316,270.586504,486.09294,475.716165
std,14.07554,803.08032,1970.547985,741.756155,1396.233751,1404.174304
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,20.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,3.0,5.0,2.0,7.0,4.0
75%,38.0,390.25,537.0,242.0,354.75,356.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [38]:
# There are $0 spenders even if awake
train[train['CryoSleep'] == False]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8686,9275_02,Europa,False,A/97/P,TRAPPIST-1e,32.0,False,1.0,1146.0,0.0,50.0,34.0,Diram Conable,False
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [64]:
# There are 87 rows with no spending and null CryoSleep
nan_rows[nan_rows['CryoSleep'].isnull()][nan_rows['RoomService'] == 0][nan_rows['FoodCourt'] == 0][nan_rows['ShoppingMall'] == 0][nan_rows['Spa'] == 0][nan_rows['VRDeck'] == 0]

  nan_rows[nan_rows['CryoSleep'].isnull()][nan_rows['RoomService'] == 0][nan_rows['FoodCourt'] == 0][nan_rows['ShoppingMall'] == 0][nan_rows['Spa'] == 0][nan_rows['VRDeck'] == 0]


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
92,0099_02,Earth,,G/12/P,TRAPPIST-1e,2.0,False,0.0,0.0,0.0,0.0,0.0,Thewis Connelson,True
175,0198_01,Earth,,G/30/P,PSO J318.5-22,52.0,False,0.0,0.0,0.0,0.0,0.0,Jeroy Cookson,True
266,0290_03,Europa,,B/7/S,TRAPPIST-1e,43.0,False,0.0,0.0,0.0,0.0,0.0,Dhenar Excialing,True
392,0433_01,Europa,,B/20/P,55 Cancri e,27.0,False,0.0,0.0,0.0,0.0,0.0,Hekark Mormonized,True
626,0653_01,Mars,,E/45/S,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Rionk Paska,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8501,9076_01,Earth,,G/1461/S,TRAPPIST-1e,32.0,False,0.0,0.0,0.0,0.0,0.0,Shawne Simonsents,False
8612,9193_01,Earth,,G/1483/S,PSO J318.5-22,35.0,False,0.0,0.0,0.0,0.0,0.0,Camily Howence,False
8620,9197_01,Europa,,C/308/P,55 Cancri e,44.0,False,0.0,0.0,0.0,0.0,0.0,Bellus Platch,True
8651,9227_05,Earth,,G/1498/P,TRAPPIST-1e,8.0,False,0.0,0.0,0.0,0.0,0.0,Hard Hinglendez,False


## Cabin - 199 rows with null values

* Doubt there will be any contextual clues other than if they were a paired or grouped passenger, ID ends with _02+
* Check passenger id families and assign same cabin number for at least general location, if cabin location is important for transported outcome

In [65]:
nan_rows[nan_rows['Cabin'].isnull()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
15,0012_01,Earth,False,,TRAPPIST-1e,31.0,False,32.0,0.0,876.0,0.0,0.0,Justie Pooles,False
93,0101_01,Mars,True,,TRAPPIST-1e,31.0,False,0.0,0.0,0.0,0.0,0.0,Book Trad,True
103,0110_01,Europa,False,,TRAPPIST-1e,32.0,False,0.0,410.0,6.0,3929.0,764.0,Graviph Aloubtled,False
222,0239_01,Mars,False,,TRAPPIST-1e,37.0,False,637.0,0.0,0.0,92.0,319.0,Diedow Resty,False
227,0244_01,Mars,True,,TRAPPIST-1e,43.0,False,0.0,0.0,0.0,0.0,0.0,Froos Sad,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8209,8772_02,Europa,False,,55 Cancri e,53.0,False,0.0,1127.0,0.0,3939.0,,Naosura Motled,False
8475,9057_01,Europa,False,,55 Cancri e,36.0,True,132.0,3479.0,0.0,3786.0,0.0,Coxan Statch,False
8485,9069_03,Europa,True,,55 Cancri e,25.0,False,0.0,0.0,0.0,0.0,0.0,Bath Brakeng,True
8509,9081_03,Earth,True,,TRAPPIST-1e,1.0,False,0.0,0.0,0.0,0.0,0.0,Beula Clemondsey,False


## Destination - 182 rows with null values

Probably take similar approach to home planet:
* There is correlation between planet and destination
* Grouped passengers likely have the same destination
* Add an value representing unknown

In [66]:
nan_rows[nan_rows['Destination'].isnull()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
47,0045_02,Mars,True,F/10/P,,19.0,False,0.0,0.0,0.0,0.0,0.0,Mass Chmad,True
128,0138_02,Earth,False,E/5/P,,34.0,False,0.0,22.0,0.0,564.0,207.0,Monah Gambs,False
139,0152_01,Earth,False,F/32/P,,41.0,False,0.0,0.0,0.0,0.0,607.0,Andan Estron,False
347,0382_01,,False,G/64/P,,23.0,False,348.0,0.0,0.0,4.0,368.0,Blanie Floydendley,False
430,0462_01,Earth,True,G/67/S,,50.0,False,0.0,0.0,0.0,0.0,0.0,Ronia Sosanturney,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8372,8956_02,Earth,True,G/1453/P,,20.0,False,0.0,0.0,0.0,0.0,0.0,Erina Bonnondry,True
8551,9130_01,Mars,True,F/1765/S,,41.0,False,0.0,0.0,0.0,0.0,0.0,Blers Corte,True
8616,9195_02,Mars,True,F/1779/S,,33.0,False,0.0,0.0,0.0,0.0,0.0,Ceros Purle,True
8621,9197_02,Europa,False,C/308/P,,41.0,True,0.0,7964.0,0.0,3238.0,5839.0,Aludram Platch,False


## Age - 179 rows with null values

Assign value representing unknown - there are likely no contextual clues that would give us an exact age
* grouped passengers may be similar age, but maybe not

In [67]:
nan_rows[nan_rows['Age'].isnull()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
50,0052_01,Earth,False,G/6/S,TRAPPIST-1e,,False,4.0,0.0,2.0,4683.0,0.0,Elaney Hubbarton,False
64,0068_01,Mars,False,E/4/S,TRAPPIST-1e,,False,793.0,0.0,2.0,253.0,0.0,Cinst Binie,False
137,0149_01,Earth,True,G/27/S,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Billya Hubbarrison,True
181,0202_02,Europa,False,A/2/P,55 Cancri e,,False,0.0,2433.0,,878.0,443.0,Vegas Embleng,True
184,0206_01,Europa,False,C/9/S,55 Cancri e,,False,2.0,1720.0,12.0,1125.0,122.0,Nuson Brugashed,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8274,8835_01,Earth,True,G/1425/S,TRAPPIST-1e,,False,0.0,0.0,0.0,0.0,0.0,Shalle Bartines,False
8301,8862_03,Europa,True,C/329/S,TRAPPIST-1e,,False,0.0,0.0,0.0,0.0,0.0,Alchib Myling,True
8374,8956_04,Earth,False,G/1453/P,TRAPPIST-1e,,False,194.0,1.0,10.0,629.0,0.0,Krisa Bonnondry,False
8407,8988_01,Earth,True,G/1448/S,TRAPPIST-1e,,False,0.0,0.0,0.0,0.0,0.0,Maen Fowlesterez,True


## VIP - 203 rows with null values

VIPs are not very common: 2.34% of training dataset
* Might be able to just fill in as False, unless feature is important

VIPs have different spending patterns:
* More likely to be awake (roughly 10% of VIPs are in cryo sleep vs. 35% of regular passengers)
* Very unlikely to have no spending
* Could set a threshold in spending, like 1000 in a category, and then assign VIP or not based on threshold

Could also verify by grouped passengers?

In [68]:
nan_rows[nan_rows['VIP'].isnull()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
38,0036_01,Earth,False,F/8/S,55 Cancri e,15.0,,0.0,492.0,48.0,20.0,401.0,Marina Leodger,False
102,0108_03,Earth,False,G/19/S,TRAPPIST-1e,0.0,,0.0,0.0,0.0,0.0,0.0,Oline Handertiz,True
145,0165_01,Mars,True,F/37/P,TRAPPIST-1e,35.0,,0.0,0.0,0.0,0.0,0.0,Graven Anche,True
228,0244_02,Mars,True,F/47/S,55 Cancri e,14.0,,0.0,0.0,0.0,0.0,0.0,Tous Sad,True
566,0593_01,Mars,False,D/24/P,TRAPPIST-1e,,,43.0,152.0,182.0,1.0,2005.0,Hon Kra,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8494,9074_01,Earth,True,G/1460/S,TRAPPIST-1e,0.0,,0.0,0.0,,0.0,0.0,Adamie Trerady,True
8512,9081_06,Earth,False,F/1858/P,PSO J318.5-22,16.0,,0.0,0.0,761.0,0.0,0.0,Daryla Clemondsey,False
8542,9122_01,Earth,True,G/1469/S,55 Cancri e,55.0,,0.0,0.0,0.0,0.0,0.0,Paulas Schmondez,False
8630,9205_03,Europa,True,B/300/P,TRAPPIST-1e,52.0,,0.0,0.0,0.0,0.0,0.0,Propent Brakeng,True


In [69]:
train[train['VIP'] == True].describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,198.0,195.0,193.0,194.0,197.0,195.0
mean,37.449495,473.615385,1811.393782,247.726804,760.71066,1234.85641
std,11.611957,1098.363713,3758.336442,587.449429,1622.153873,2453.060817
min,18.0,0.0,0.0,0.0,0.0,0.0
25%,29.0,0.0,0.0,0.0,0.0,0.0
50%,34.0,0.0,287.0,0.0,39.0,30.0
75%,44.0,528.5,2191.0,161.5,843.0,1377.5
max,73.0,8243.0,29813.0,3700.0,13902.0,12424.0


In [70]:
train[train['VIP'] == False].describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8119.0,8118.0,8118.0,8092.0,8112.0,8108.0
mean,28.639611,217.218527,426.336536,173.876298,301.711045,282.718056
std,14.469895,644.922614,1505.278247,610.900749,1127.555366,1084.67823
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,42.0,56.75,26.0,52.0,40.0
max,79.0,14327.0,27071.0,23492.0,22408.0,24133.0


In [75]:
train[train['VIP'] == True][train['CryoSleep'] == False]

  train[train['VIP'] == True][train['CryoSleep'] == False]


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
108,0112_01,Europa,False,B/1/S,55 Cancri e,48.0,True,0.0,2537.0,87.0,17.0,13.0,Moth Cowtale,True
120,0128_01,Mars,False,D/3/S,TRAPPIST-1e,61.0,True,2353.0,334.0,9.0,316.0,2.0,Grohs Fles,False
214,0224_01,Mars,False,F/42/S,TRAPPIST-1e,32.0,True,181.0,0.0,5.0,1634.0,0.0,Blues Queen,False
291,0321_01,,False,F/61/S,TRAPPIST-1e,59.0,True,1018.0,0.0,209.0,0.0,0.0,Quites Bache,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8475,9057_01,Europa,False,,55 Cancri e,36.0,True,132.0,3479.0,0.0,3786.0,0.0,Coxan Statch,False
8614,9194_02,Europa,False,E/603/S,TRAPPIST-1e,32.0,True,1003.0,909.0,0.0,0.0,15.0,Tachba Subwor,False
8621,9197_02,Europa,False,C/308/P,,41.0,True,0.0,7964.0,0.0,3238.0,5839.0,Aludram Platch,False
8652,9230_01,Europa,False,C/342/S,TRAPPIST-1e,36.0,True,0.0,5600.0,715.0,2868.0,971.0,,True


In [72]:
train[train['VIP'] == False]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
5,0005_01,Earth,False,F/0/P,PSO J318.5-22,44.0,False,0.0,483.0,0.0,291.0,0.0,Sandie Hinetthews,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8687,9275_03,Europa,,A/97/P,TRAPPIST-1e,30.0,False,0.0,3208.0,0.0,2.0,330.0,Atlasym Conable,True
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [76]:
train[train['VIP'] == False][train['CryoSleep'] == True]

  train[train['VIP'] == False][train['CryoSleep'] == True]


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
7,0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,,Candra Jacostaffey,True
9,0008_01,Europa,True,B/1/P,55 Cancri e,14.0,False,0.0,0.0,0.0,0.0,0.0,Erraiam Flatic,True
10,0008_02,Europa,True,B/1/P,TRAPPIST-1e,34.0,False,0.0,0.0,,0.0,0.0,Altardr Flatic,True
18,0016_01,Mars,True,F/5/P,TRAPPIST-1e,45.0,False,0.0,0.0,0.0,0.0,0.0,Alus Upead,True
21,0020_01,Earth,True,E/0/S,TRAPPIST-1e,1.0,False,0.0,0.0,0.0,0.0,0.0,Almary Brantuarez,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8679,9267_02,Europa,True,E/607/S,TRAPPIST-1e,20.0,False,0.0,0.0,0.0,0.0,0.0,Sabi Opshaft,True
8680,9268_01,Earth,True,G/1505/P,TRAPPIST-1e,31.0,False,0.0,0.0,0.0,0.0,0.0,Agnesa Baldson,True
8681,9270_01,Earth,True,G/1497/S,55 Cancri e,33.0,False,0.0,0.0,0.0,0.0,0.0,Lan Mckinsond,True
8684,9274_01,,True,G/1508/P,TRAPPIST-1e,23.0,False,0.0,0.0,0.0,0.0,0.0,Chelsa Bullisey,True


## Spending - around 200 rows per column, though nulls will be in multiple columns

* If passenger is asleep, fill in 0 for any spending columns (68 rows)
* Could consider filling in 0; spending is not high in general - 50% quintile is like $5 spending
* Otherwise, maybe do an average of spending from other columns? Depends on how important the individual columns are
* Consider combining values from the spending columns into one column if values are correlated with each other and no one column is highly correlated with transported outcome



In [77]:
nan_rows[nan_rows['RoomService'].isnull()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
25,0020_05,Earth,True,E/0/S,PSO J318.5-22,1.0,False,,0.0,0.0,0.0,0.0,Mael Brantuarez,False
35,0031_03,Mars,False,F/9/P,TRAPPIST-1e,20.0,False,,0.0,1750.0,990.0,0.0,Dontch Datie,True
83,0091_01,Earth,True,G/16/S,TRAPPIST-1e,26.0,False,,0.0,0.0,0.0,0.0,Deanne Yorkland,True
132,0141_01,Mars,False,F/30/P,TRAPPIST-1e,31.0,False,,0.0,97.0,0.0,0.0,Pyrohs Harte,False
170,0193_02,Mars,False,F/41/P,TRAPPIST-1e,23.0,False,,0.0,8.0,1072.0,0.0,Frook Raf,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8425,8998_02,Earth,False,E/591/S,TRAPPIST-1e,47.0,False,,1.0,0.0,967.0,5.0,Jonaye Englence,False
8450,9026_01,Earth,True,G/1463/P,TRAPPIST-1e,58.0,False,,0.0,0.0,0.0,0.0,Mathy Boyers,True
8525,9101_01,Earth,False,F/1865/P,TRAPPIST-1e,21.0,False,,0.0,496.0,430.0,0.0,Gera Frazie,False
8534,9112_01,Mars,False,D/290/P,TRAPPIST-1e,28.0,False,,0.0,0.0,0.0,0.0,Wealke Brin,False


In [78]:
nan_rows[nan_rows['RoomService'].isnull()][nan_rows['CryoSleep'] == True]

  nan_rows[nan_rows['RoomService'].isnull()][nan_rows['CryoSleep'] == True]


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
25,0020_05,Earth,True,E/0/S,PSO J318.5-22,1.0,False,,0.0,0.0,0.0,0.0,Mael Brantuarez,False
83,0091_01,Earth,True,G/16/S,TRAPPIST-1e,26.0,False,,0.0,0.0,0.0,0.0,Deanne Yorkland,True
233,0250_01,Earth,True,G/38/S,PSO J318.5-22,47.0,False,,0.0,0.0,0.0,0.0,Camily Kramosley,False
400,0438_01,Europa,True,B/13/S,TRAPPIST-1e,40.0,False,,0.0,0.0,0.0,0.0,Dyonon Diateous,True
889,0955_01,Earth,True,G/145/S,55 Cancri e,17.0,False,,0.0,0.0,0.0,0.0,Glena Smalloney,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8312,8871_02,Earth,True,G/1432/S,PSO J318.5-22,15.0,False,,0.0,,0.0,0.0,Nadie Benney,True
8361,8944_01,Earth,True,G/1442/S,PSO J318.5-22,0.0,False,,0.0,0.0,0.0,0.0,Calvia Hobbinson,True
8380,8961_01,Earth,True,G/1455/P,TRAPPIST-1e,1.0,False,,0.0,0.0,0.0,0.0,Mariel Blancoy,True
8412,8988_06,Earth,True,G/1448/S,TRAPPIST-1e,17.0,False,,0.0,0.0,0.0,0.0,Caseye Fowlesterez,True


In [83]:
train[train['CryoSleep'] == False].describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,5343.0,5328.0,5329.0,5335.0,5326.0,5320.0
mean,29.651319,350.146772,713.004316,270.586504,486.09294,475.716165
std,14.07554,803.08032,1970.547985,741.756155,1396.233751,1404.174304
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,20.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,3.0,5.0,2.0,7.0,4.0
75%,38.0,390.25,537.0,242.0,354.75,356.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


## Name column (200 rows)

* This doesn't seem like it matters; fill in with some value representing unknown
* Unless last name or name contains some kind of context clue on other info, we can't use name for anything
* Would have to do some kind of processing of the name column to see if names indicate something like home planet, VIP, etc. for other missing info

In [79]:
nan_rows[nan_rows['Name'].isnull()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
27,0022_01,Mars,False,D/0/P,TRAPPIST-1e,21.0,False,980.0,2.0,69.0,0.0,0.0,,False
58,0064_01,Mars,True,F/14/S,TRAPPIST-1e,15.0,False,0.0,0.0,0.0,0.0,0.0,,True
65,0069_01,Earth,False,F/16/S,TRAPPIST-1e,42.0,False,887.0,0.0,9.0,6.0,0.0,,True
77,0082_03,Mars,False,F/16/P,TRAPPIST-1e,8.0,False,0.0,0.0,0.0,0.0,0.0,,True
101,0108_02,Earth,False,G/19/S,TRAPPIST-1e,31.0,False,562.0,0.0,326.0,0.0,0.0,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8629,9205_02,Europa,True,B/300/P,TRAPPIST-1e,15.0,False,0.0,0.0,0.0,0.0,0.0,,True
8631,9208_01,Earth,True,G/1485/S,TRAPPIST-1e,35.0,False,0.0,0.0,0.0,0.0,0.0,,True
8636,9218_01,Europa,True,B/353/S,55 Cancri e,43.0,False,0.0,0.0,0.0,0.0,0.0,,True
8652,9230_01,Europa,False,C/342/S,TRAPPIST-1e,36.0,True,0.0,5600.0,715.0,2868.0,971.0,,True
