In [1]:
import numpy as np
import pandas as pd 

In [2]:
# Read the train and test splits from the working directory.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


# Exploratory data analysis (EDA)

In [3]:
# Number of missing values in each column of the training frame.
train_data.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

KNNImputer - "Imputation for completing missing values using k-Nearest Neighbors.
Each sample’s missing values are imputed using the mean value from n_neighbors nearest neighbors found in the training set. Two samples are close if the features that neither is missing are close."

In [9]:
from sklearn.impute import KNNImputer

# Numeric columns whose gaps are filled by k-nearest-neighbours imputation.
KNN_COLS = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'CryoSleep', 'Num']

# Constant fill value used for each categorical column.
CATEGORICAL_FILL = {
    "HomePlanet": "Earth",
    "Destination": "TRAPPIST-1e",
    "Deck": "F",
    "Side": "P",
}

# Turn the boolean columns into 0/1 so they can serve as the numeric target
# and as numeric input to the imputer (missing values stay missing).
train_data['Transported'] = train_data['Transported']*1
train_data['CryoSleep'] = train_data['CryoSleep']*1
test_data['CryoSleep'] = test_data['CryoSleep']*1

imputer = KNNImputer(n_neighbors=2, weights="uniform")

for frame in (train_data, test_data):
    # "Cabin" is split on "/" into three parts, stored as Deck, Num and Side.
    frame[["Deck", "Num", "Side"]] = frame["Cabin"].str.split("/", expand=True)
    # The split yields strings; make Num numeric explicitly instead of
    # relying on the imputer's implicit conversion.
    frame["Num"] = pd.to_numeric(frame["Num"])

# Fit the imputer on the training frame only, then reuse that fit for the
# test frame so test rows are completed from training-set neighbours
# (the original re-fitted the imputer on the test frame).
train_data[KNN_COLS] = imputer.fit_transform(train_data[KNN_COLS])
test_data[KNN_COLS] = imputer.transform(test_data[KNN_COLS])

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the chained in-place form does not update the frame when
# pandas Copy-on-Write is active.
for frame in (train_data, test_data):
    for col, fill_value in CATEGORICAL_FILL.items():
        frame[col] = frame[col].fillna(fill_value)

LabelEncoder - "Encode target labels with value between 0 and n_classes-1."

In [10]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the cabin-derived columns.
# A single encoder per column is fitted on the values of BOTH frames, so a
# given category is mapped to the same integer code in train and test.
# (Fitting separate encoders per frame, as before, can assign different codes
# to the same category whenever the two frames contain different value sets.)
for col in ("Deck", "Num", "Side"):
    le = LabelEncoder()
    le.fit(pd.concat([train_data[col], test_data[col]]))
    train_data[col] = le.transform(train_data[col])
    test_data[col] = le.transform(test_data[col])

Create a new feature called "AllSpending": the sum of the five spending columns (RoomService, ShoppingMall, FoodCourt, Spa, VRDeck).

In [11]:
# Total spend per passenger: the sum of the five spending columns.
# The same expression is used for both frames; the stray 'AllShop' column that
# the original chained assignment created on the test frame only is dropped.
train_data['AllSpending'] = (
    train_data['RoomService'] + train_data['ShoppingMall'] + train_data['FoodCourt']
    + train_data['Spa'] + train_data['VRDeck']
)
test_data['AllSpending'] = (
    test_data['RoomService'] + test_data['ShoppingMall'] + test_data['FoodCourt']
    + test_data['Spa'] + test_data['VRDeck']
)

# Feature selection, scaling

In [12]:
from sklearn.preprocessing import StandardScaler

# Columns fed to the model, and the subset of them that gets standardized.
features = ["CryoSleep","RoomService","Spa","VRDeck","Deck","Side","AllSpending"]
cols_to_normalize = ["RoomService","Spa","VRDeck","AllSpending"]

# Target vector.
y = train_data["Transported"]

# Fit the scaler on the training frame only, then apply that same fit to
# both frames.
scaler = StandardScaler()
scaler.fit(train_data[cols_to_normalize])
train_data[cols_to_normalize] = scaler.transform(train_data[cols_to_normalize])
test_data[cols_to_normalize] = scaler.transform(test_data[cols_to_normalize])

# Design matrices for training and for the submission predictions.
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

# Model training

In [14]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Train a CatBoost classifier on the full training matrix (silent training log).
cat_model = CatBoostClassifier(verbose=0)
cat_model.fit(X, y)

# NOTE: these metrics are computed on the same rows the model was fitted on,
# so they are in-sample figures, not an estimate of held-out accuracy.
y_pred = cat_model.predict(X)
print(confusion_matrix(y, y_pred))
asc = accuracy_score(y, y_pred)
print(asc)

# Predict the test frame and build the submission table.
predictions = cat_model.predict(X_test)

# Convert only the prediction column to booleans (0 -> False, 1 -> True)
# instead of running a frame-wide replace that would also scan PassengerId.
output = pd.DataFrame({
    'PassengerId': test_data.PassengerId,
    'Transported': np.asarray(predictions).astype(bool),
})

output.to_csv('submission_catboost.csv', index=False)
print("Submission was successfully saved!")

[[3444  871]
 [ 548 3830]]
0.836765213390084
Submission was successfully saved!
