# Spaceship Titanic

# LogBook

In [None]:
# 10/07/22 --> Notebook Creation

# Author: Andres Montes de Oca
# GitHub: https://github.com/AndresMontesDeOca

# Load Data and Info

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st

%matplotlib inline
sns.set(style='white', context='notebook', palette='pastel')

# Load the competition CSVs (paths are relative to the notebook's working directory)
train = pd.read_csv('Data/train.csv')
test = pd.read_csv('Data/test.csv')

# Work on an independent copy of the training set: later cells add columns to `data` in place,
# and a plain alias (`data = train`) would write those columns into `train` as well.
data = train.copy()

# DataFrame.info() prints its own summary and returns None, so it must not be wrapped in print()
data.info()
display(data.head())
# display(data.describe())
# display(data.describe(include='object'))

# Individual Features Check

## Transported (Target)

In [None]:
# Class balance of the target column
transported_counts = data['Transported'].value_counts()
print(transported_counts, '\n')

# Pie chart of the same counts (drawn on the axes created just below)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 2))
transported_counts.plot(kind='pie', autopct='%.1f%%')
plt.show()

# Boolean mask of transported passengers, kept for later cells
mask_transported = data['Transported'] == True

# Transported is well balanced

## PassengerId (Feature)

In [None]:
# Missing-value count and a first look at the raw identifiers
print('Nulls:', data['PassengerId'].isnull().sum(), '\n')
print(data['PassengerId'].head())

# We will have to split the GroupId from this Feature, there are no nulls

#### Split (PassengerId)

## GroupSize (Created Feature)

In [None]:
# GroupId and GroupSize feature generation

# The part of PassengerId before the first '_' is used as the group identifier
data['GroupId'] = data['PassengerId'].str.split('_', expand=True)[0]

# Number of passengers per GroupId (index: GroupId, values: group size)
GroupId_size_serie = data.groupby('GroupId').size().rename('GroupSize')

# Attach the size of its group to every passenger.
# `assign` + `map` overwrites an existing GroupSize column instead of merging a second one in, so the
# cell can be re-run safely (the previous pd.merge produced GroupSize_x / GroupSize_y on a second run
# and the lookups below raised KeyError). `assign` still rebinds `data` to a new DataFrame.
data = data.assign(GroupSize=data['GroupId'].map(GroupId_size_serie))

print(data['GroupSize'].value_counts())

# Plot: how many passengers travel in groups of each size
fig, ax = plt.subplots(1, 1, figsize=(5, 3))
sns.countplot(y=data['GroupSize'], palette='Set2', ax=ax)
ax.set_ylabel('')

plt.title('GroupSize')
plt.show()

# Few Passengers in big groups

## IsAlone (Created Feature)

In [None]:
# IsAlone feature generation (solo travellers)

# A passenger is flagged as alone when their group has exactly one member.
# (Alternative to test: keep only GroupSize as it is, it may improve the results)
travels_alone = data['GroupSize'] == 1
data.loc[:, 'IsAlone'] = travels_alone

print(data['IsAlone'].value_counts(), '\n')

# Two panels: share of solo travellers (left) and their split by target (right)
fig, ax = plt.subplots(1, 2, figsize=(12, 2))
pie_ax, count_ax = ax

alone_share = data['IsAlone'].value_counts(dropna=False).sort_index(ascending=False)
alone_share.plot(kind='pie', autopct='%.1f%%', ax=pie_ax, startangle=90)
pie_ax.set_ylabel('')
pie_ax.set_title('On-Board Rate')

sns.countplot(
    y=data['IsAlone'],
    hue=data['Transported'],
    ax=count_ax,
    order=[True, False],
    hue_order=[True, False],
)
count_ax.set_ylabel('')
count_ax.set_title('Count and Comparison w/Target')

fig.suptitle('Solo Travellers\n')
plt.show()

# There are more Solo Travellers, but they have lower Transported Rate

## HomePlanet (Feature)

In [None]:
# Frequency of each home planet, nulls included
homeplanet_counts = data['HomePlanet'].value_counts(dropna=False)
print(homeplanet_counts)

# Two panels: overall distribution (left) and counts split by target (right)
fig, ax = plt.subplots(1, 2, figsize=(12, 2))
pie_ax, count_ax = ax

homeplanet_counts.plot(kind='pie', autopct='%.1f%%', ax=pie_ax, startangle=90, cmap='Set2')
pie_ax.set_ylabel('')
pie_ax.set_title('Distribution')

# Bars are ordered from most to least frequent planet (nulls excluded)
planets_by_frequency = data['HomePlanet'].value_counts().index
sns.countplot(
    y=data['HomePlanet'],
    order=planets_by_frequency,
    ax=count_ax,
    hue=data['Transported'],
    hue_order=[True, False],
)
count_ax.set_ylabel('')
count_ax.set_xlabel('')
count_ax.set_title('Count and Comparison w/Target')

fig.suptitle('HomePlanet')
plt.show()

# There are 201 nulls
# Most coming from Earth
# Passengers from Europa have more chances to be Transported
# Pending: Reduce annot_size for the PieChart

### Nulls Check

In [None]:
# Rows where HomePlanet is missing
homeplanet_missing = data['HomePlanet'].isnull()
print('Nulls Count:', data[homeplanet_missing].shape[0], '\n')

# Subset of those rows whose passenger shares a group with someone else
missing_in_group = data[homeplanet_missing & ~data['IsAlone']]
display('Passengers in groups', missing_in_group[['PassengerId', 'HomePlanet', 'IsAlone']].head())
print('In groups with other passengers', missing_in_group.shape[0])

# We can get HomePlanet from GroupId for almost 50% of NA. Let's check if this is a valid approach

### Fill Nulls Check

In [None]:
# Check that every member of a group departed from the same HomePlanet

# One row per (GroupId, HomePlanet) combination present in the data, with its passenger count
group_planet_sizes = data.groupby(['GroupId', 'HomePlanet']).size()
data_grp_groupid_homeplanet = group_planet_sizes.to_frame('Count').reset_index()
display(data_grp_groupid_homeplanet.head())

# A group with two different planets would appear twice here, so a unique GroupId
# column means each group has a single HomePlanet
groupid_is_unique = data_grp_groupid_homeplanet['GroupId'].is_unique
print('Is GroupId unique?:', groupid_is_unique)

### Fill Nulls

In [None]:
# Working, but for later (kept disabled; nothing in this cell runs)

# Filling Solo Travelers with the mode
# data.loc[data.HomePlanet.isnull() & data.IsAlone, 'HomePlanet'] = data.HomePlanet.mode()[0]

# Filling Passengers in Groups with value from same GroupId
# NOTE(review): the forward fill below runs over the whole sorted column, not per GroupId, so a group
# whose members are all null would presumably inherit the previous group's HomePlanet - confirm before enabling.
# data.loc[:, 'HomePlanet'] = data.sort_values(by=['GroupId', 'HomePlanet'])['HomePlanet'].fillna(method='ffill')
# print('New Nulls Count:', data.HomePlanet.isnull().sum())

## CryoSleep (Feature)

In [None]:
# CryoSleep frequencies (nulls included), sorted by descending index
cryo_counts = data['CryoSleep'].value_counts(dropna=False).sort_index(ascending=False)
print(cryo_counts)

# Two panels: overall distribution (left) and counts split by target (right)
fig, ax = plt.subplots(1, 2, figsize=(12, 2))
pie_ax, count_ax = ax

cryo_counts.plot(kind='pie', autopct='%.1f%%', ax=pie_ax, startangle=90)
pie_ax.set_ylabel('')
pie_ax.set_title('Distribution')

sns.countplot(y=data['CryoSleep'], hue=data['Transported'], ax=count_ax, hue_order=[True, False])
count_ax.set_ylabel('')
count_ax.set_xlabel('')
count_ax.set_title('Count and Comparison w/Target')

fig.suptitle('CryoSleep')
plt.show()

# Just 1/3 are in CryoSleep
# There are 217 Nulls
# Strong correlation with Target Variable

### Correlation with VIP

In [None]:
# Restrict to passengers flagged as VIP
vip_passengers = data[data['VIP'] == True]
print('Total VIP Passengers:', vip_passengers.shape[0])

# Pie chart of CryoSleep among VIPs (nulls dropped by value_counts)
fig, ax = plt.subplots(1, 1, figsize=(4, 3))
vip_cryo_counts = vip_passengers['CryoSleep'].value_counts().sort_index(ascending=False)
vip_cryo_counts.plot(kind='pie', autopct='%.1f%%', startangle=90)
ax.set_ylabel('')
plt.suptitle('CryoSleep')
plt.show()

# There are few VIP Passengers
# Strong correlation: VIP passengers don't want to be put to sleep

### Correlation with Age

In [None]:
# Passenger counts per age: sleeping passengers (red) against everyone (blue)
fig, ax = plt.subplots(1, 1, figsize=(6, 2))

sleeping_age_counts = data.loc[data['CryoSleep'] == True, 'Age'].value_counts().sort_index()
all_age_counts = data['Age'].value_counts().sort_index()

sleeping_age_counts.plot(title='Count Passengers by Age', color='red', label='Sleeping Passengers')
all_age_counts.plot(color='blue', label='All Passengers')

plt.legend()
plt.show()

# Most Seniors (+65 years) are put to sleep, but they are few
# Same happens with kids (0 to 12 years)????
# The gap between both gets gradually reduced for adults (18 to 65 years) 

### Correlation with GroupSize

In [None]:
# Passenger counts per group size: sleeping passengers (red) against everyone (blue)
fig, ax = plt.subplots(1, 1, figsize=(6, 2))

sleeping_size_counts = data.loc[data['CryoSleep'] == True, 'GroupSize'].value_counts().sort_index()
all_size_counts = data['GroupSize'].value_counts().sort_index()

sleeping_size_counts.plot(title='Count Passengers by GroupSize', color='red', label='Sleeping Passengers')
all_size_counts.plot(color='blue', label='All Passengers')

# Fixed y ticks every 1000 passengers: 0, 1000, ..., 4000
plt.yticks(np.arange(0, 5000, 1000))
plt.legend()
plt.show()

# The bigger the GroupSize, more chances of CryoSleep?

### FillNa

In [None]:
# For later (kept disabled; nothing in this cell runs)
# print('Nulls Count Original:', data[data.CryoSleep.isnull()].shape[0], '\n')

# Complete with VIPs (90% Rate)
# NOTE(review): VIP itself contains nulls (see the VIP section), so using it directly as a boolean mask
# may misbehave - presumably `data.VIP == True` is what is wanted; verify before enabling.
# data.loc[data.CryoSleep.isnull() & data.VIP, 'CryoSleep']=False # 214 to go
# print('Nulls Count after VIPs:', data[data.CryoSleep.isnull()].shape[0], '\n')

# Complete for Kids and Seniors (to be tested)
# data.loc[data.CryoSleep.isnull() & np.logical_or(data.Age>=65, data.Age<=12), 'CryoSleep'] = True
# print('Nulls Count after Age:', data[data.CryoSleep.isnull()].shape[0], '\n')

## Cabin (Feature)

In [None]:
# Missing-value count and a first look at the raw Cabin strings
cabin = data['Cabin']
print('Nulls:', cabin.isnull().sum(), '\n')
print(cabin.head())  # PortSide/StarboardSide: Left/Right

# Split into Deck/Num/Side, 199 nulls

## Destination (Feature)

In [None]:
# Frequency of each destination, nulls included
destination_counts = data['Destination'].value_counts(dropna=False)
print(destination_counts)

# Plots: overall distribution (left) and counts split by target (right)
fig, ax = plt.subplots(1, 2, figsize=(12, 2))

# ax[0]
destination_counts.plot(kind='pie', autopct='%.1f%%', ax=ax[0], startangle=90, cmap='Set2')
ax[0].set_ylabel('')
ax[0].set_title('Distribution')

# ax[1]
# hue_order is pinned to [True, False] like every other target-comparison plot in this notebook,
# so Transported=True/False keep the same colours and legend order across figures.
sns.countplot(y=data['Destination'], order=data['Destination'].value_counts().index, ax=ax[1],
              hue=data['Transported'], hue_order=[True, False])
ax[1].set_ylabel('')
ax[1].set_title('Count and Comparison w/Target')


fig.suptitle('Destination')
plt.show()

# Similar to HomePlanet. Most passengers heading TRAPPIST-1e, 182 Nulls
# 55 Cancri e looks like they have a better Transported Rate

## Age (Feature)

In [None]:
# Summary numbers for Age: missing values, skewness and descriptive statistics
age = data['Age']
print('Nulls:', age.isnull().sum())
print("Skewness: %f" % age.skew(), '\n')  # Right
print(age.describe())

# Histogram with 5-year-wide bins
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(18, 4))
sns.histplot(data=age, binwidth=5)

plt.title('Age')
plt.show()

# Will I have to discretize this one?

## VIP (Feature)

In [None]:
# VIP frequencies, nulls included
vip_counts = data['VIP'].value_counts(dropna=False)
print(vip_counts.sort_index(ascending=False))

# Two panels: overall distribution (left) and counts split by target (right)
fig, ax = plt.subplots(1, 2, figsize=(12, 2))
pie_ax, count_ax = ax

vip_counts.plot(kind='pie', autopct='%.1f%%', startangle=90, cmap='Set2', ax=pie_ax)
pie_ax.set_ylabel('')
pie_ax.set_title('Distribution')

sns.countplot(
    y=data['VIP'],
    hue=data['Transported'],
    order=[True, False],
    hue_order=[True, False],
    ax=count_ax,
)
count_ax.set_ylabel('')
count_ax.set_title('Count and Comparison w/Target')
fig.suptitle('VIP')
plt.show()

# Few VIPS and few Nulls, nothing important here

## Name (Feature)

In [None]:
# Missing-value count and how often each name occurs
passenger_names = data['Name']
print('Nulls:', passenger_names.isnull().sum(), '\n')
print(passenger_names.value_counts())

# 200 nulls, a couple of names repeated. I'm pretty sure this Feature is useless

## RoomService (Feature)

In [None]:
# Missing-value count and descriptive statistics
room_service = data['RoomService']
print('Nulls:', room_service.isnull().sum(), '\n')
print(room_service.describe())

# Two stacked boxplots: outliers drawn in the top panel, hidden in the bottom one
fig, ax = plt.subplots(2, 1, figsize=(8, 2))
for panel, with_outliers in zip(ax, (True, False)):
    sns.boxplot(x=room_service, showfliers=with_outliers, ax=panel)
    panel.set_ylabel('')

fig.suptitle('RoomService with and without Outliers')
plt.show()

## FoodCourt (Feature)

In [None]:
# Missing-value count and descriptive statistics
food_court = data['FoodCourt']
print('Nulls:', food_court.isnull().sum(), '\n')
print(food_court.describe())

# Two stacked boxplots: outliers drawn in the top panel, hidden in the bottom one
fig, ax = plt.subplots(2, 1, figsize=(8, 2))
for panel, with_outliers in zip(ax, (True, False)):
    sns.boxplot(x=food_court, showfliers=with_outliers, ax=panel)
    panel.set_ylabel('')

fig.suptitle('FoodCourt with and without Outliers')
plt.show()

## ShoppingMall (Feature)

In [None]:
# Missing-value count and descriptive statistics
shopping_mall = data['ShoppingMall']
print('Nulls:', shopping_mall.isnull().sum(), '\n')
print(shopping_mall.describe())

# Two stacked boxplots: outliers drawn in the top panel, hidden in the bottom one
fig, ax = plt.subplots(2, 1, figsize=(8, 2))
for panel, with_outliers in zip(ax, (True, False)):
    sns.boxplot(x=shopping_mall, showfliers=with_outliers, ax=panel)
    panel.set_ylabel('')

fig.suptitle('ShoppingMall with and without Outliers')
plt.show()

## Spa (Feature)

In [None]:
# Missing-value count and descriptive statistics
spa = data['Spa']
print('Nulls:', spa.isnull().sum(), '\n')
print(spa.describe())

# Two stacked boxplots: outliers drawn in the top panel, hidden in the bottom one
fig, ax = plt.subplots(2, 1, figsize=(8, 2))
for panel, with_outliers in zip(ax, (True, False)):
    sns.boxplot(x=spa, showfliers=with_outliers, ax=panel)
    panel.set_ylabel('')

fig.suptitle('Spa with and without Outliers')
plt.show()

## VRDeck (Feature)

In [None]:
# Missing-value count and descriptive statistics
vr_deck = data['VRDeck']
print('Nulls:', vr_deck.isnull().sum(), '\n')
print(vr_deck.describe())

# Two stacked boxplots: outliers drawn in the top panel, hidden in the bottom one
fig, ax = plt.subplots(2, 1, figsize=(8, 2))
for panel, with_outliers in zip(ax, (True, False)):
    sns.boxplot(x=vr_deck, showfliers=with_outliers, ax=panel)
    panel.set_ylabel('')

fig.suptitle('VRDeck with and without Outliers')
plt.show()

## Continuous Features Comparison (PairPlot)

In [None]:
# All float64 columns except Age, with missing values replaced by 0
float_features_filled = data.select_dtypes(['float64']).drop(columns='Age').fillna(0)

# Put the target next to them so it can drive the colouring
data_plot = pd.concat([float_features_filled, data['Transported']], axis=1)

# Pairwise scatter plots coloured by target
sns.pairplot(data=data_plot, hue='Transported', hue_order=[True, False])
plt.show()

# I can't find any insights

# Correcting and Completing Nulls

In [None]:
# Column dtypes and non-null counts of the working frame, including the columns created above
data.info()