In [2]:
from IPython.display import clear_output
!pip install fancyimpute
!pip install statsmodels miceforest
clear_output()

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
from fancyimpute import IterativeImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import tensorflow as tf
from tensorflow.keras import models, Sequential, layers, Model, metrics


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
TRAIN_PATH = '/content/drive/MyDrive/Spaceship-titanic/train.csv'
TEST_PATH = '/content/drive/MyDrive/Spaceship-titanic/test.csv'
TARGET_CLASS = 'Transported'

In [6]:
#functions

def missing_value(df):
      missing_values = df.isna().sum().to_frame()
      missing_values = missing_values.rename(columns= {0: 'missing_values'})
      missing_values['% of total'] = (missing_values['missing_values'] / df.shape[0]).round(2)*100
      return missing_values

In [7]:
df = pd.read_csv(TRAIN_PATH)

In [8]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [None]:
#missing column
missing = missing_value(df)
missing

Unnamed: 0,missing_values,% of total
PassengerId,0,0.0
HomePlanet,201,2.0
CryoSleep,217,2.0
Cabin,199,2.0
Destination,182,2.0
Age,179,2.0
VIP,203,2.0
RoomService,181,2.0
FoodCourt,183,2.0
ShoppingMall,208,2.0


In [None]:
fig = px.bar(x=missing.index, y='missing_values', data_frame=missing)
fig.update_layout(width=800, height=600)
fig.show()

In [None]:
#replace null values in the dataframe

for col in df.columns:
  if df[str(col)].dtypes == 'object':
    df[str(col)].fillna(df[str(col)].mode()[0], inplace=True)
  elif df[str(col)].dtypes == 'float64':
    df[str(col)].fillna(df[str(col)].mean().round(0), inplace=True)



In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8693 non-null   object 
 2   CryoSleep     8693 non-null   bool   
 3   Cabin         8693 non-null   object 
 4   Destination   8693 non-null   object 
 5   Age           8693 non-null   float64
 6   VIP           8693 non-null   bool   
 7   RoomService   8693 non-null   float64
 8   FoodCourt     8693 non-null   float64
 9   ShoppingMall  8693 non-null   float64
 10  Spa           8693 non-null   float64
 11  VRDeck        8693 non-null   float64
 12  Name          8693 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(3), float64(6), object(5)
memory usage: 772.6+ KB


In [11]:
#create categorical column for age

map_age = {'Children': range(0,13),
           'Teenagers': range(13,18),
           'young Adult': range(18,30),
           'Middle Age': range(30,50),
           'Elderly' : range(50,100)
           }

df['Age_category'] = df['Age'].apply(lambda x:next((k for k, v in map_age.items() if x in v), 'unknown'))

Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.

In [12]:
df['Cabin'].value_counts

<bound method IndexOpsMixin.value_counts of 0          B/0/P
1          F/0/S
2          A/0/S
3          A/0/S
4          F/1/S
          ...   
8688      A/98/P
8689    G/1499/S
8690    G/1500/S
8691     E/608/S
8692     E/608/S
Name: Cabin, Length: 8693, dtype: object>

In [13]:
df[['Deck', 'Cabin_num', 'side']] = df['Cabin'].str.split('/', expand=True)

In [14]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Age_category,Deck,Cabin_num,side
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,Middle Age,B,0,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,young Adult,F,0,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,Elderly,A,0,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,Middle Age,A,0,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,Teenagers,F,1,S


In [15]:
df['Age'].value_counts()

24.0    324
18.0    320
21.0    311
19.0    293
23.0    292
       ... 
72.0      4
78.0      3
79.0      3
76.0      2
77.0      2
Name: Age, Length: 80, dtype: int64

In [16]:
df['Spa']

0          0.0
1        549.0
2       6715.0
3       3329.0
4        565.0
         ...  
8688    1643.0
8689       0.0
8690       1.0
8691     353.0
8692       0.0
Name: Spa, Length: 8693, dtype: float64

In [17]:
#replace bool with 1, 0 
bool_column = df.select_dtypes(include='bool').columns

df[bool_column] = df[bool_column].replace([True, False],[1, 0])

In [18]:
drop_data = []
for col in df.columns:
  if len(df[str(col)].unique())/len(df[str(col)]) > 0.2:
    drop_data.append(col)

In [19]:
drop_data.append('Age')

In [20]:
df_clean = df.drop(columns=drop_data, axis=1)
df_clean.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Age_category,Deck,side
0,Europa,False,TRAPPIST-1e,False,0.0,0.0,0.0,0.0,0.0,0,Middle Age,B,P
1,Earth,False,TRAPPIST-1e,False,109.0,9.0,25.0,549.0,44.0,1,young Adult,F,S
2,Europa,False,TRAPPIST-1e,True,43.0,3576.0,0.0,6715.0,49.0,0,Elderly,A,S
3,Europa,False,TRAPPIST-1e,False,0.0,1283.0,371.0,3329.0,193.0,0,Middle Age,A,S
4,Earth,False,TRAPPIST-1e,False,303.0,70.0,151.0,565.0,2.0,1,Teenagers,F,S


In [21]:
object_column = df_clean.select_dtypes(include='object').columns.to_list()

In [22]:
df_clean1 = pd.get_dummies(df_clean, columns=object_column)

In [23]:
df_clean1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 32 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   RoomService                8512 non-null   float64
 1   FoodCourt                  8510 non-null   float64
 2   ShoppingMall               8485 non-null   float64
 3   Spa                        8510 non-null   float64
 4   VRDeck                     8505 non-null   float64
 5   Transported                8693 non-null   int64  
 6   HomePlanet_Earth           8693 non-null   uint8  
 7   HomePlanet_Europa          8693 non-null   uint8  
 8   HomePlanet_Mars            8693 non-null   uint8  
 9   CryoSleep_False            8693 non-null   uint8  
 10  CryoSleep_True             8693 non-null   uint8  
 11  Destination_55 Cancri e    8693 non-null   uint8  
 12  Destination_PSO J318.5-22  8693 non-null   uint8  
 13  Destination_TRAPPIST-1e    8693 non-null   uint8

# Mice 

In [28]:
mice_imputer = IterativeImputer()
df_column = df_clean1.columns
X_imputed = mice_imputer.fit_transform(df_clean1)

In [29]:
X_imputed

array([[0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [1.090e+02, 9.000e+00, 2.500e+01, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       [4.300e+01, 3.576e+03, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       ...,
       [0.000e+00, 0.000e+00, 1.872e+03, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       [0.000e+00, 1.049e+03, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       [1.260e+02, 4.688e+03, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00]])

In [33]:
df_imputed = pd.DataFrame(X_imputed)
df_imputed.columns = df_column
df_imputed.head()
df_imputed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 32 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   RoomService                8693 non-null   float64
 1   FoodCourt                  8693 non-null   float64
 2   ShoppingMall               8693 non-null   float64
 3   Spa                        8693 non-null   float64
 4   VRDeck                     8693 non-null   float64
 5   Transported                8693 non-null   float64
 6   HomePlanet_Earth           8693 non-null   float64
 7   HomePlanet_Europa          8693 non-null   float64
 8   HomePlanet_Mars            8693 non-null   float64
 9   CryoSleep_False            8693 non-null   float64
 10  CryoSleep_True             8693 non-null   float64
 11  Destination_55 Cancri e    8693 non-null   float64
 12  Destination_PSO J318.5-22  8693 non-null   float64
 13  Destination_TRAPPIST-1e    8693 non-null   float

In [None]:
df_clean1.corr()['Transported']

CryoSleep                    0.460132
VIP                         -0.037261
RoomService                 -0.242049
FoodCourt                    0.046074
ShoppingMall                 0.010020
Spa                         -0.218791
VRDeck                      -0.204825
Transported                  1.000000
HomePlanet_Earth            -0.168845
HomePlanet_Europa            0.176916
HomePlanet_Mars              0.019544
Destination_55 Cancri e      0.108722
Destination_PSO J318.5-22    0.000092
Destination_TRAPPIST-1e     -0.096319
Age_category_Children        0.125399
Age_category_Elderly        -0.011422
Age_category_Middle Age     -0.036782
Age_category_Teenagers       0.030376
Age_category_young Adult    -0.049551
Deck_A                      -0.002623
Deck_B                       0.144733
Deck_C                       0.108193
Deck_D                      -0.034046
Deck_E                      -0.097965
Deck_F                      -0.087753
Deck_G                       0.015822
Deck_T      

In [None]:
for col in df_clean1.columns:
  fig = px.bar(df_clean1, x=str(col), color='Transported')
  fig.update_layout(width=800, height=600)
  fig.show()

In [36]:
#visualise dataset in 3d space with pca

X = df_imputed.drop('Transported', axis=1)
y = df_imputed['Transported']

n_components = 3
pca  = PCA(n_components=n_components)

x_pca = pca.fit_transform(X)
a = pca.explained_variance_ratio_
n = 1
for i in a:
  print(f'Explained variance of component {n} {i.round(2)*100}')
  n+=1


df_pca = pd.DataFrame(x_pca, columns=['comp_1', 'comp_2', 'comp_3'])
df_pca['Transported'] = y

df_pca


Explained variance of component 1 47.0
Explained variance of component 2 21.0
Explained variance of component 3 18.0


Unnamed: 0,comp_1,comp_2,comp_3,Transported
0,-590.557280,-217.129275,-23.889946,0.0
1,-418.682734,155.579006,346.495719,1.0
2,4558.108427,2624.223785,4802.370942,0.0
3,1560.317518,1503.123479,2258.130814,0.0
4,-372.697642,114.931734,394.051460,1.0
...,...,...,...,...
8688,6133.882087,-1859.982877,1109.662800,0.0
8689,-590.557713,-217.129585,-23.889978,0.0
8690,-599.127566,-198.436565,6.315686,1.0
8691,1404.243969,1734.111692,-1998.477750,0.0


In [37]:
a = pca.explained_variance_ratio_
n = 1
for i in a:
  print(f'Explained variance of  component{n} {i.round(2)*100}')
  n+=1


Explained variance of  component1 47.0
Explained variance of  component2 21.0
Explained variance of  component3 18.0


In [38]:
fig = px.scatter_3d(df_pca, x='comp_1', y='comp_2', z='comp_3', color='Transported', title='visualisation of Dataset in 3d space')
fig.show()

In [41]:
X = df_imputed.drop('Transported', axis=1)
y = df_imputed['Transported']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123, stratify=y)

scaler = StandardScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [50]:
def model():
    model = Sequential(layers=[
        layers.Input(shape=(x_train_scaled.shape[1]), dtype='float32'),
        layers.Normalization(name='normalisation'),
        layers.Dense(256, activation='relu', name='dense1'),
        layers.Dropout(0.3, name='dropout'),
        layers.Dense(512, activation='relu', name='dense2'),
        layers.Dropout(0.3, name='dropout2'),
        layers.Dense(1024, activation='relu', name='dense3'),
        layers.Dense(1, activation='sigmoid', name='output')
    ]
    )
    return model


In [51]:
model = model()

In [52]:
EPOCHS = 50
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

In [53]:
model.compile(optimizer=optimizer,
            loss='binary_crossentropy', metrics=['accuracy', 'Precision'])

In [54]:
history = model.fit(
    x_train_scaled, y_train, 
    validation_data=(x_test_scaled, y_test), 
    epochs=EPOCHS,
    batch_size=256
    )

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
