# Overview

In this notebook, we will train a baseline Random Forest model using tensorflow Decision Forests on the Spaceship Titanic.

In [1]:
import os
import warnings

warnings.filterwarnings('ignore')

os.environ['train']='/kaggle/input/spaceship-titanic/test.csv'
os.environ['test']='/kaggle/input/spaceship-titanic/train.csv'
os.environ['submission']='/kaggle/input/spaceship-titanic/sample_submission.csv'

# Loading the Data

In [9]:
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.float_format', lambda x: '%.3f'% x)

train=pd.read_csv(os.getenv('train'))
test=pd.read_csv(os.getenv('test'))
df=pd.concat([train, test], ignore_index=True).reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   12970 non-null  object 
 1   HomePlanet    12682 non-null  object 
 2   CryoSleep     12660 non-null  object 
 3   Cabin         12671 non-null  object 
 4   Destination   12696 non-null  object 
 5   Age           12700 non-null  float64
 6   VIP           12674 non-null  object 
 7   RoomService   12707 non-null  float64
 8   FoodCourt     12681 non-null  float64
 9   ShoppingMall  12664 non-null  float64
 10  Spa           12686 non-null  float64
 11  VRDeck        12702 non-null  float64
 12  Name          12676 non-null  object 
 13  Transported   8693 non-null   object 
dtypes: float64(6), object(8)
memory usage: 1.4+ MB


# Checking Information

In [None]:
df.describe()

In [13]:
def check_df(df):
    print("############# Shape #############")
    print(df.shape)
    print("############# Types #############")
    print(df.dtypes)
    print("############# Head #############")
    print(df.head(3))
    print("############# Tail #############")
    print(df.tail(3))
    print("############# NA #############")
    print(df.isnull().sum())
    print("############# Quantiles #############")
    numeric_columns=df.select_dtypes(include=['number']).columns
    # return values at the given quantile over requested axis
    print(df[numeric_columns].quantile([0,0.05, 0.50,0.95,0.99],1).T)

check_df(df)

############# Shape #############
(12970, 14)
############# Types #############
PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported      object
dtype: object
############# Head #############
  PassengerId HomePlanet CryoSleep  Cabin  Destination    Age    VIP  \
0     0013_01      Earth      True  G/3/S  TRAPPIST-1e 27.000  False   
1     0018_01      Earth     False  F/4/S  TRAPPIST-1e 19.000  False   
2     0019_01     Europa      True  C/0/S  55 Cancri e 31.000  False   

   RoomService  FoodCourt  ShoppingMall      Spa  VRDeck             Name  \
0        0.000      0.000         0.000    0.000   0.000  Nelly Carsoning   
1        0.000      9.000         0.000 2823.000   0.000   Lerome Peckers   
2        0.000   

# Visualization Dataset

In [24]:
def grab_col_names(df, cat_th=10, car_th=20):
    cat_cols=[col for col in df.columns if df[col].dtypes=="0"]
    num_but_cat=[col for col in df.columns if df[col].nunique() <cat_th and df[col].dtypes!="0"]
    cat_but_car=[col for col in df.columns if df[col].nunique()>car_th and df[col].dtypes=="0"]
    
    cat_cols=cat_cols+num_but_cat
    cat_cols=[col for col in cat_cols if col not in cat_but_car]
    
    num_cols=[col for col in df.columns if df[col].dtypes!="0"]
    num_cols=[col for col in num_cols if col not in num_but_cat]
    
    print(f"Observations: {df.shape[0]}")
    print(f"Variables: {df.shape[1]}")
    print(f"cat_cols: {len(cat_cols)}")
    print(f"num_cols: {len(num_cols)}")
    print(f"cat_but_car: {len(cat_but_car)}")
    print(f"num_but_cat: {len(num_but_cat)}")
    return cat_cols, cat_but_car, num_cols

cat_cols, cat_but_car, num_cols=grab_col_names(df)

Observations: 12970
Variables: 14
cat_cols: 5
num_cols: 9
cat_but_car: 0
num_but_cat: 5


In [22]:
def cat_summary(dataframe, col_name, plot=False):
    print(pd.DataFrame({col_name: dataframe[col_name].value_counts(),
                        "Ratio": 100 * dataframe[col_name].value_counts() / len(dataframe)}))
for col in cat_cols:
    cat_summary(df, col)

            HomePlanet  Ratio
HomePlanet                   
Earth             6865 52.930
Europa            3133 24.156
Mars              2684 20.694
           CryoSleep  Ratio
CryoSleep                  
False           8079 62.290
True            4581 35.320
               Destination  Ratio
Destination                      
TRAPPIST-1e           8871 68.396
55 Cancri e           2641 20.362
PSO J318.5-22         1184  9.129
         VIP  Ratio
VIP                
False  12401 95.613
True     273  2.105
             Transported  Ratio
Transported                    
True                4378 33.755
False               4315 33.269


In [18]:
def num_summary(dataframe, numerical_col, plot=False):
    quantiles = [0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.99]
    print(dataframe[numerical_col].describe(quantiles).T)

    if plot:
        dataframe[numerical_col].hist(bins=50)
        plt.xlabel(numerical_col)
        plt.title(numerical_col)
        plt.show()

    print("#####################################")
    
for col in num_cols:
    num_summary(df, col)

count       12970
unique      12970
top       0013_01
freq            1
Name: PassengerId, dtype: object
#####################################
count       12671
unique       9825
top       G/160/P
freq            8
Name: Cabin, dtype: object
#####################################
count   12700.000
mean       28.772
std        14.387
min         0.000
5%          4.000
10%        13.000
20%        18.000
30%        21.000
40%        24.000
50%        27.000
60%        30.000
70%        35.000
80%        41.000
90%        49.000
95%        56.000
99%        65.010
max        79.000
Name: Age, dtype: float64
#####################################
count   12707.000
mean      222.898
std       647.597
min         0.000
5%          0.000
10%         0.000
20%         0.000
30%         0.000
40%         0.000
50%         0.000
60%         0.000
70%         8.000
80%       178.000
90%       765.400
95%      1275.800
99%      3009.520
max     14327.000
Name: RoomService, dtype: float64
##########

In [19]:
def target_summary_with_cat(dataframe, target, categorical_col):
    print(pd.DataFrame({"TARGET_MEAN": dataframe.groupby(categorical_col)[target].mean()}), end="\n\n\n")

for col in cat_cols:
    target_summary_with_cat(df,"Transported",col)

           TARGET_MEAN
HomePlanet            
Earth            0.424
Europa           0.659
Mars             0.523


          TARGET_MEAN
CryoSleep            
False           0.329
True            0.818


              TARGET_MEAN
Destination              
55 Cancri e         0.610
PSO J318.5-22       0.504
TRAPPIST-1e         0.471


      TARGET_MEAN
VIP              
False       0.506
True        0.382


            TARGET_MEAN
Transported            
False             0.000
True              1.000




In [23]:
import seaborn as sns

corr=df[num_cols].corr()
corr

ValueError: could not convert string to float: 'G/3/S'