In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')



In [3]:
print("Training Data Head:")
train_data.head()

Training Data Head:


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
print("Training Data Info:")
print(train_data.info())

print("Training Data Description:")
print(train_data.describe())

print("Training Data Columns:")
print(train_data.columns)

Training Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB
None
Training Data Description:
               Age   RoomService     FoodCourt  ShoppingMall           Spa  \
count  8514.

In [5]:
print("Test Data Head:")
test_data.head()

Test Data Head:


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [6]:
print("Testing Data Info:")
print(test_data.info())

print("Testing Data Description:")
print(test_data.describe())

print("Testing Data Columns:")
print(test_data.columns)

Testing Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4190 non-null   object 
 2   CryoSleep     4184 non-null   object 
 3   Cabin         4177 non-null   object 
 4   Destination   4185 non-null   object 
 5   Age           4186 non-null   float64
 6   VIP           4184 non-null   object 
 7   RoomService   4195 non-null   float64
 8   FoodCourt     4171 non-null   float64
 9   ShoppingMall  4179 non-null   float64
 10  Spa           4176 non-null   float64
 11  VRDeck        4197 non-null   float64
 12  Name          4183 non-null   object 
dtypes: float64(6), object(7)
memory usage: 434.5+ KB
None
Testing Data Description:
               Age   RoomService     FoodCourt  ShoppingMall           Spa  \
count  4186.000000   4195.000000   4171.000000   4179.000000   417

In [7]:
print("Training Data Missing Values:")
missing_values = train_data.isnull().sum
print(train_data.isnull().sum().to_markdown(numalign="left", stralign="left"))

Training Data Missing Values:
|              | 0   |
|:-------------|:----|
| PassengerId  | 0   |
| HomePlanet   | 201 |
| CryoSleep    | 217 |
| Cabin        | 199 |
| Destination  | 182 |
| Age          | 179 |
| VIP          | 203 |
| RoomService  | 181 |
| FoodCourt    | 183 |
| ShoppingMall | 208 |
| Spa          | 183 |
| VRDeck       | 188 |
| Name         | 200 |
| Transported  | 0   |


Data Cleaning and Preprocessing

In [8]:
def fillmissing(dattaf):
    dattaf['HomePlanet'] = dattaf['HomePlanet'].fillna('Earth')
    dattaf['CryoSleep'] = dattaf['CryoSleep'].fillna(False).infer_objects(copy=False)
    dattaf['Cabin'] = dattaf['Cabin'].fillna('Unknown')
    dattaf['Destination'] = dattaf['Destination'].fillna('TRAPPIST-1e')
    dattaf['Age'] = dattaf['Age'].fillna(dattaf['Age'].median())
    dattaf['VIP'] = dattaf['VIP'].fillna(False).infer_objects(copy=False)
    dattaf = dattaf.fillna(0)
    return dattaf

In [9]:
# apply the function to the data
train_data = fillmissing(train_data)
test_data = fillmissing(test_data)

In [10]:
#convert column into strin
label = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']

for i in label:
    train_data[i] = train_data[i].astype(str)
    test_data[i] = test_data[i].astype(str)

In [11]:
# combine the data
combined = pd.concat([train_data[label], test_data[label]], axis=0)

In [12]:
# label encoding
from sklearn.preprocessing import LabelEncoder
label_encoders = {col: LabelEncoder().fit(combined[col]) for col in label}

In [13]:
for col, label_encoder in label_encoders.items():
    train_data[col] = label_encoder.transform(train_data[col])
    test_data[col] = label_encoder.transform(test_data[col])

In [14]:
X = train_data.drop(['PassengerId', 'Name', 'Transported', ['Destination']], axis=1)
y = train_data['Transported'].astype(int)

X_test = test_data.drop(['PassengerId', 'Name'], axis=1)

In [15]:
#train test split
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
train_data.head()

In [16]:
# Build the model
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(16, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# w

model.compile(optimizer='adagrad', loss='binary_crossentropy', metrics=['accuracy'])

In [17]:
early_stopping = tf.keras.callbacks.EarlyStopping(patience=10, monitor='loss')

In [18]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 16)                192       
                                                                 
 dense_1 (Dense)             (None, 64)                1088      
                                                                 
 dense_2 (Dense)             (None, 32)                2080      
                                                                 
 dense_3 (Dense)             (None, 1)                 33        
                                                                 
Total params: 3,393
Trainable params: 3,393
Non-trainable params: 0
_________________________________________________________________


In [19]:
model.fit(X_train, y_train, epochs=300, batch_size=32, validation_data=(X_val, y_val), callbacks=[early_stopping])

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

KeyboardInterrupt: 

In [None]:
# predict with accuracy
y_pred = model.predict(X_val)
y_pred = np.round(y_pred).astype(int)

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_val, y_pred)
print(f'Accuracy: {accuracy}')



Accuracy: 0.7015526164462335
