In [74]:
import pandas as pd
import numpy as np
import re

# Data Visualization and Conclusions

In [75]:
train_data = pd.read_csv("train-test-datasets/train.csv")
test_data = pd.read_csv("train-test-datasets/test.csv")

submission = pd.read_csv("train-test-datasets/sample_submission.csv")
train_data.sample(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
4751,5068_01,Earth,,E/314/P,TRAPPIST-1e,19.0,False,0.0,0.0,0.0,0.0,0.0,Nadie Polliamposs,True
20,0017_02,Earth,False,F/6/P,55 Cancri e,14.0,False,412.0,0.0,1.0,0.0,679.0,Philda Brighttt,False
2621,2806_01,Mars,False,D/95/S,TRAPPIST-1e,28.0,False,948.0,0.0,64.0,170.0,0.0,Boon Gene,False
3922,4184_01,Earth,False,F/860/P,55 Cancri e,19.0,False,558.0,0.0,353.0,0.0,8.0,Thel Wilsoney,False
7237,7741_01,Earth,False,G/1252/S,TRAPPIST-1e,3.0,False,0.0,0.0,0.0,0.0,0.0,Lilly Matts,False


In [76]:
# Use .fillna(mean or median or mode), later

train_data = train_data.dropna()
test_data = test_data.dropna()

test_data.sample(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
182,0367_01,Europa,False,D/11/S,55 Cancri e,55.0,False,0.0,538.0,0.0,6.0,5288.0,Misa Dischod
262,0562_02,Mars,True,F/104/S,TRAPPIST-1e,13.0,False,0.0,0.0,0.0,0.0,0.0,Sex Shcad
467,0977_01,Earth,True,G/147/S,PSO J318.5-22,19.0,False,0.0,0.0,0.0,0.0,0.0,Samany Alest
2554,5588_01,Mars,False,F/1066/S,TRAPPIST-1e,24.0,False,18.0,25.0,0.0,0.0,1256.0,Weres Cola
2229,4821_01,Earth,False,F/909/S,TRAPPIST-1e,14.0,False,14.0,0.0,135.0,0.0,818.0,Diandy Arneras


Splitting Cabin into three different columns

In [77]:
def cabin_split(data):
    # Compile the expected format: "Deck/Number/Side" like "B/123/P"
    pattern = re.compile(r'^([a-zA-Z]+)/(\d+)(/[a-zA-Z]+)?$')

    def split_cabin(val):
        if pd.isna(val):
            return pd.Series([pd.NA, pd.NA, pd.NA])
        if not pattern.match(val):
            print(f"Invalid format: {val}")
            return pd.Series([pd.NA, pd.NA, pd.NA])
        parts = val.split("/")
        deck = parts[0] if len(parts) > 0 else pd.NA
        number = parts[1] if len(parts) > 1 else pd.NA
        side = parts[2] if len(parts) > 2 else pd.NA
        return pd.Series([deck, number, side])

    # Apply the split_cabin function to each row
    data[["Cabin_deck", "Cabin_num", "Cabin_side"]] = data["Cabin"].apply(split_cabin)

    return data

train_data = cabin_split(train_data)
test_data = cabin_split(test_data)

test_data.sample(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Cabin_deck,Cabin_num,Cabin_side
1926,4126_01,Earth,False,F/845/P,TRAPPIST-1e,15.0,False,25.0,0.0,0.0,5.0,3295.0,Sha Gallencis,F,845,P
3489,7591_01,Earth,False,E/494/P,TRAPPIST-1e,29.0,False,150.0,1459.0,0.0,0.0,0.0,Karay Wolfernan,E,494,P
4261,9240_01,Earth,False,E/594/P,TRAPPIST-1e,15.0,False,0.0,0.0,4.0,0.0,618.0,Glendy Reenon,E,594,P
2520,5513_01,Earth,True,G/888/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Gracia Mayods,G,888,S
3643,7934_01,Mars,False,F/1519/S,TRAPPIST-1e,3.0,False,0.0,0.0,0.0,0.0,0.0,Razzle Mith,F,1519,S


One-hot encoding the column - Destination

In [78]:
print(set(test_data["Destination"]))

# data = pd.get_dummies(data, columns=['Destination'], prefix='Destin', prefix_sep='_')

# data.rename({ 
#     "Dest_55 Cancri e": "Destin_55_Cancri_e",
#     "Dest_PSO J318.5-22": "Destin_PSO_J318.5-22",
# }, inplace=True)

dest_split = {
    'TRAPPIST-1e': 1,
    '55 Cancri e': 2,
    'PSO J318.5-22': 3
}

train_data["Destination"] = train_data["Destination"].map(dest_split)
test_data["Destination"] = test_data["Destination"].map(dest_split)

{'PSO J318.5-22', 'TRAPPIST-1e', '55 Cancri e'}


In [79]:
set(test_data["Cabin_deck"]), set(test_data["Cabin_side"])

# Encodes the Deck and Side columns from Alphabets to Numeric
def cabin_deck_side_numeric(data):
    d = list(set(data["Cabin_deck"]))
    cabin_deck_split = { d[i-1]:i for i in range(1, len(d)+1) }
    
    d = list(set(data["Cabin_side"]))
    cabin_side_split = { d[i-1]:i for i in range(1, len(d)+1) }
    
    data["Cabin_deck"] = data["Cabin_deck"].map(cabin_deck_split)
    data["Cabin_side"] = data["Cabin_side"].map(cabin_side_split)

cabin_deck_side_numeric(train_data)
cabin_deck_side_numeric(test_data)

print(test_data.columns)

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Cabin_deck', 'Cabin_num', 'Cabin_side'],
      dtype='object')


In [80]:
true_false_split = {
    True: 1,
    False: 0
}
home_planet_split = {
    'Earth': 1,
    'Europa': 2,
    'Mars': 3
}

def true_false_to_numeric(data):
    data["VIP"] = data["VIP"].map(true_false_split)
    data["CryoSleep"] = data["CryoSleep"].map(true_false_split)
    data["HomePlanet"] = data["HomePlanet"].map(home_planet_split)

    if "Transported" in data.columns:
        data["Transported"] = data["Transported"].map(true_false_split)


true_false_to_numeric(train_data)
true_false_to_numeric(test_data)

In [81]:
# data[ data["VIP"] == 1 ]

train_data["Cabin_num"] = [ int(x) for x in train_data["Cabin_num"] ]
test_data["Cabin_num"] = [ int(x) for x in test_data["Cabin_num"] ]


train_data.sample(5)

train_data.Cabin_num.describe()

train_data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Cabin_deck,Cabin_num,Cabin_side
0,0001_01,2,0,B/0/P,1,39.0,0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,2,0,2
1,0002_01,1,0,F/0/S,1,24.0,0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,3,0,1
2,0003_01,2,0,A/0/S,1,58.0,1,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,7,0,1
3,0003_02,2,0,A/0/S,1,33.0,0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,7,0,1
4,0004_01,1,0,F/1/S,1,16.0,0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,3,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,2,0,A/98/P,2,41.0,1,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,0,7,98,2
8689,9278_01,1,1,G/1499/S,3,18.0,0,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,0,8,1499,1
8690,9279_01,1,0,G/1500/S,1,26.0,0,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,1,8,1500,1
8691,9280_01,2,0,E/608/S,2,32.0,0,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,0,1,608,1


# Model Training

In [82]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

'''
Random Forest models
Decision tree
XG boost
Light GBM
KNN
'''
# y_train = train_data["Transported"]
# X_train = train_data.drop(columns = ["PassengerId", "Cabin", "Name", "Transported"])

# # y_test = test_data["Transported"]
# X_test = test_data.drop(columns = ["PassengerId", "Cabin", "Name"])


y = train_data["Transported"]
X = train_data.drop(columns = ["PassengerId", "Cabin", "Name", "Transported"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [83]:
model_list = [
    SVC(), GaussianNB(), DecisionTreeClassifier(),
    GradientBoostingClassifier(), KNeighborsClassifier(),
    RandomForestClassifier(),
    # RandomForestModel()
]

best_score = 0
for model in model_list:
    model.fit(X_train, y_train)

    score = model.score(X_test, y_test)
    print(str(model), "-", score)
    
    y_pred = model.predict(X_test)
    
    print("Number of mislabeled points out of a total %d points : %d"
          % (X_test.shape[0], (y_test != y_pred).sum()))
    # print(classification_report(y_test, y_pred))
    print()

    if score > best_score:
        best_model = model
        best_score = score

print(f"Best model is {best_model} with score {best_score}")

SVC() - 0.7829046898638427
Number of mislabeled points out of a total 1322 points : 287

GaussianNB() - 0.6694402420574886
Number of mislabeled points out of a total 1322 points : 437

DecisionTreeClassifier() - 0.7549167927382754
Number of mislabeled points out of a total 1322 points : 324

GradientBoostingClassifier() - 0.8071104387291982
Number of mislabeled points out of a total 1322 points : 255

KNeighborsClassifier() - 0.7655068078668684
Number of mislabeled points out of a total 1322 points : 310

RandomForestClassifier() - 0.8093797276853253
Number of mislabeled points out of a total 1322 points : 252

Best model is RandomForestClassifier() with score 0.8093797276853253


In [84]:
model = XGBClassifier(enable_categorical=True)

model.fit(X_train, y_train)

print(str(model), "-", model.score(X_test, y_test))

y_pred = model.predict(X_test)

print("Number of mislabeled points out of a total %d points : %d"
      % (X_test.shape[0], (y_test != y_pred).sum()))
print(classification_report(y_test, y_pred))
print()

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=True, eval_metric=None, feature_types=None,
              feature_weights=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=None,
              n_jobs=None, num_parallel_tree=None, ...) - 0.8018154311649016
Number of mislabeled points out of a total 1322 points : 262
              precision    recall  f1-score   support

           0       0.81      0.81      0.81       681
           1       0.80      0.80      0.80      

## Data Cleaning