In [1]:
import pandas as pd
import numpy as np 

from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
data = pd.read_csv("train.csv")

data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
data.shape

(8693, 14)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [134]:
data.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [135]:
# All the attributes have missing values
# Must use preprocessing for all the attributes
#
# EXCLUDE attributes NAME, CABIN, PASSENGERID
#
# Cabin Attribute must be split to three attributes (delimiter: /)
# Convert VIP attribute to boolean
# CryoSleep to boolean
#
# PassengerTd -- string -> int, int (Two attributes)

In [5]:
data["VIP"].value_counts(), data["VIP"].isnull().sum()

(False    8291
 True      199
 Name: VIP, dtype: int64,
 203)

In [6]:
def fill_vip(data) :
    data["VIP"].astype(bool)
    data["VIP"].fillna(False, inplace=True)
    data["VIP"].replace([True, False], [1, 0], inplace = True)

In [7]:
data["CryoSleep"].isnull().sum(), data["CryoSleep"].value_counts()

(217,
 False    5439
 True     3037
 Name: CryoSleep, dtype: int64)

In [8]:
def fill_cryosleep(data):
    data["CryoSleep"].astype(bool)
    data["CryoSleep"].fillna(False, inplace = True)
    data["CryoSleep"].replace([True, False], [1, 0], inplace = True)

In [9]:
data["HomePlanet"].isnull().sum(), data["HomePlanet"].value_counts()

(201,
 Earth     4602
 Europa    2131
 Mars      1759
 Name: HomePlanet, dtype: int64)

In [10]:
def fill_homeplanet(data) :
    data["HomePlanet"].fillna("Earth", inplace = True)

    data["HomePlanet"].replace(["Earth", "Europa", "Mars"], [2, 3, 4], inplace = True)

In [11]:
def fill_destination(data) :
    data["Destination"].fillna("TRAPPIST-1e", inplace=True)

    data["Destination"].replace(["TRAPPIST-1e", "55 Cancri e", "PSO J318.5-22"], [2, 3, 4], inplace = True)

In [12]:
def fill_age(data):
    data["Age"].fillna(data["Age"].median(), inplace = True)

In [13]:
def fill_remain(data): 
    data.fillna(0, inplace = True)

In [14]:
fill_age(data)
fill_cryosleep(data)
fill_destination(data)
fill_homeplanet(data)
fill_vip(data)
fill_remain(data)

In [146]:
data.isna().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
Transported     0
dtype: int64

In [15]:
columns = ["HomePlanet", "CryoSleep", "Destination", "Age", 
        "VIP","RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
data_ss = data[columns]

In [16]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()

data_ss = ss.fit_transform(data_ss)

In [17]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_Y, test_Y = train_test_split(
    data_ss, data["Transported"], test_size = 0.1)

In [31]:
data["Transported"]

KeyError: 'Transported'

In [150]:
from sklearn.metrics import confusion_matrix, accuracy_score

def evaluate_model(model) :
    model.fit(train_X, train_Y)
    pred = model.predict(test_X)
    print (confusion_matrix (pred, test_Y))
    print (accuracy_score (pred, test_Y))


In [151]:
from sklearn.linear_model import LogisticRegression

log = LogisticRegression(random_state = 4)

In [174]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors = 10)

In [172]:
from sklearn.svm import SVC 

svm = SVC(C=0.1)

In [176]:
evaluate_model(log)

[[309  87]
 [117 357]]
0.7655172413793103


In [24]:
test = pd.read_csv("test.csv")

In [25]:
data = test

fill_age(data)
fill_cryosleep(data)
fill_destination(data)
fill_homeplanet(data)
fill_vip(data)
fill_remain(data)

In [179]:
test_ss = ss.transform(test[columns])

# Use all three models to predict
test["Transported"] = log.predict(test_ss)

test_res = test[["PassengerId", "Transported"]]

test_res.to_csv("test_res.csv", index=None)

In [163]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4277 non-null   int64  
 2   CryoSleep     4277 non-null   int64  
 3   Cabin         4277 non-null   object 
 4   Destination   4277 non-null   int64  
 5   Age           4277 non-null   float64
 6   VIP           4277 non-null   int64  
 7   RoomService   4277 non-null   float64
 8   FoodCourt     4277 non-null   float64
 9   ShoppingMall  4277 non-null   float64
 10  Spa           4277 non-null   float64
 11  VRDeck        4277 non-null   float64
 12  Name          4277 non-null   object 
 13  Transported   4277 non-null   bool   
dtypes: bool(1), float64(6), int64(4), object(3)
memory usage: 438.7+ KB


In [28]:
import tensorflow as tf

model = tf.keras.models.Sequential([
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(256),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(2, activation='sigmoid')
])

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer='adam',
              loss=loss_fn,
              metrics=['accuracy'])

model.fit(train_X, train_Y, epochs=20)

Epoch 1/20


  output, from_logits = _get_logits(


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x238c3891dc0>

In [34]:
def get_max_index(arr):
    p = 0
    mx = 0
    for i in range(len(arr)) :
        if arr[i] > mx :
            mx = arr[i]
            p = i
    
    return p


label_arr = []

test_ss = ss.transform(test[columns])

test_pred = model.predict(test_ss)

for i in test_pred:
    label = get_max_index(i)
    label_arr.append(["FALSE", "TRUE"][label])


test["Transported"] = pd.Series(label_arr)

test_res = test[["PassengerId", "Transported"]]

test_res.to_csv("test_res.csv", index=None)

