In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
import lightgbm as lgbm
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the three competition CSVs (training set, test set, sample submission)
# in one pass; the paths are relative to the notebook's working directory.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/spaceship-titanic/train.csv",
        "../input/spaceship-titanic/test.csv",
        "../input/spaceship-titanic/sample_submission.csv",
    )
)

In [3]:
# Preview the first five rows of the raw train and test frames.
train.head(), test.head()

(  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
 0     0001_01     Europa     False  B/0/P  TRAPPIST-1e  39.0  False   
 1     0002_01      Earth     False  F/0/S  TRAPPIST-1e  24.0  False   
 2     0003_01     Europa     False  A/0/S  TRAPPIST-1e  58.0   True   
 3     0003_02     Europa     False  A/0/S  TRAPPIST-1e  33.0  False   
 4     0004_01      Earth     False  F/1/S  TRAPPIST-1e  16.0  False   
 
    RoomService  FoodCourt  ShoppingMall     Spa  VRDeck               Name  \
 0          0.0        0.0           0.0     0.0     0.0    Maham Ofracculy   
 1        109.0        9.0          25.0   549.0    44.0       Juanna Vines   
 2         43.0     3576.0           0.0  6715.0    49.0      Altark Susent   
 3          0.0     1283.0         371.0  3329.0   193.0       Solam Susent   
 4        303.0       70.0         151.0   565.0     2.0  Willy Santantines   
 
    Transported  
 0        False  
 1         True  
 2        False  
 3        False  


In [4]:
# (rows, columns) of the raw train and test frames; train has one extra
# column because it carries the "Transported" target.
train.shape, test.shape

((8693, 14), (4277, 13))

In [5]:
# Column dtypes and non-null counts for both frames.
# DataFrame.info() prints its report and returns None, so each call is a
# separate statement; wrapping them in a tuple only adds a "(None, None)"
# cell output.
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  

(None, None)

In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for the training frame.
train.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [7]:
# Number of missing values per column, for train and test respectively.
train.isnull().sum(), test.isnull().sum()

(PassengerId       0
 HomePlanet      201
 CryoSleep       217
 Cabin           199
 Destination     182
 Age             179
 VIP             203
 RoomService     181
 FoodCourt       183
 ShoppingMall    208
 Spa             183
 VRDeck          188
 Name            200
 Transported       0
 dtype: int64,
 PassengerId       0
 HomePlanet       87
 CryoSleep        93
 Cabin           100
 Destination      92
 Age              91
 VIP              93
 RoomService      82
 FoodCourt       106
 ShoppingMall     98
 Spa             101
 VRDeck           80
 Name             94
 dtype: int64)

In [8]:
# Stack train on top of test (fresh 0..n-1 index) so that every preprocessing
# step below is applied to both sets at once. Test rows have no "Transported"
# column, so they get missing values there.
all_spaceship_data = pd.concat(
    objs=[train, test],
    axis=0,
    ignore_index=True,
)

In [9]:
# Derive group/deck/side features from the composite string columns.
#   PassengerId is "<group>_<number>"  -> keep the group part.
#   Cabin is "<deck>/<number>/<side>"  -> keep deck and side.
# The vectorised .str accessors propagate missing Cabin values as NaN and
# yield NaN (rather than raising) for a value with fewer parts than expected.
passenger_id_parts = all_spaceship_data["PassengerId"].str.split("_")
cabin_parts = all_spaceship_data["Cabin"].str.split("/")

all_spaceship_data["PassengerGroup"] = passenger_id_parts.str[0]
all_spaceship_data["Deck"] = cabin_parts.str[0]
all_spaceship_data["Side"] = cabin_parts.str[2]

# The raw identifier columns are not used as features once the parts are extracted.
all_spaceship_data = all_spaceship_data.drop(columns=["PassengerId", "Cabin", "Name"])

In [10]:
# Use the passenger travel group to fill in some missing values: within each
# PassengerGroup, a missing categorical value is replaced by the group's most
# frequent value.
travelGroup = ["Destination", "CryoSleep", "VIP", "Deck", "Side", "HomePlanet"]


def _fill_with_group_mode(values):
    """Fill missing entries of one group's column with that group's mode.

    The fill is applied only when the group has one or two modes (the first
    one, in sorted order, is used). Groups with no observed value, or with
    three or more tied modes, are returned unchanged so that a later step
    can fill them.
    """
    modes = values.mode()
    if len(modes) in (1, 2):
        return values.fillna(modes.iloc[0])
    return values


for column in travelGroup:
    # transform() always returns a result aligned to the original row index.
    # groupby(...).apply(...) prepends the group key to the index on
    # pandas >= 2.0, which makes the column assignment fail.
    all_spaceship_data[column] = (
        all_spaceship_data.groupby("PassengerGroup")[column].transform(_fill_with_group_mode)
    )

In [11]:
# Whatever is still missing in the categorical columns after the per-group
# fill is replaced by the column-wide most frequent value.
for column in travelGroup:
    most_common = all_spaceship_data[column].mode().iloc[0]
    all_spaceship_data[column] = all_spaceship_data[column].fillna(most_common)

In [12]:
# Columns to convert from labels to numeric codes with scikit-learn's
# OrdinalEncoder (one code per distinct value in each column).
OE = [
    "Destination",
    "CryoSleep",
    "VIP",
    "Deck",
    "Side",
    "HomePlanet",
    "PassengerGroup",
]
ordinalEncoder = OrdinalEncoder()
encoded_values = ordinalEncoder.fit(all_spaceship_data[OE]).transform(all_spaceship_data[OE])
all_spaceship_data[OE] = encoded_values

In [13]:
# Encode the target as 1/0. The result is assigned back explicitly: calling
# .replace(..., inplace=True) on a column selection is chained assignment,
# which pandas deprecates and which does not update the frame under
# Copy-on-Write. Test rows have no label and stay NaN.
all_spaceship_data["Transported"] = all_spaceship_data["Transported"].map({True: 1, False: 0})

# Feature matrix for train + test together (still containing missing numeric values).
X = all_spaceship_data.drop("Transported", axis = 1)
# Labels exist only for the training rows; cast to int so the classifier
# receives a clean integer target rather than a float/object column.
y = all_spaceship_data["Transported"].dropna().astype(int)

In [14]:
# Fill the remaining missing values in the feature matrix with a KNN imputer
# that uses the 3 nearest neighbours.
knn_imp = KNNImputer(n_neighbors=3)
imputed_values = knn_imp.fit_transform(X)
X.loc[:, :] = imputed_values

# Age bin edges: 0, 5, ..., 65 in steps of five, then one wide final bin
# ending at 80. The 14 resulting intervals are labelled 1..14.
bins = list(range(0, 70, 5)) + [80]
labels = list(range(1, len(bins)))

# Bucket Age into the groups above; include_lowest keeps age 0 in the first bin.
X["AgeGroup"] = pd.cut(X["Age"], bins=bins, labels=labels, include_lowest=True)

# Total amount billed across the five on-board amenities.
X["Total_spent"] = (
    X["RoomService"]
    + X["FoodCourt"]
    + X["ShoppingMall"]
    + X["Spa"]
    + X["VRDeck"]
)

In [15]:
# Split the combined feature matrix back into its train and test parts.
# Training rows come first in X and are exactly the rows that have a label,
# so the boundary is len(y) rather than a hard-coded row count.
n_train = len(y)
train = X.iloc[:n_train, :]
test = X.iloc[n_train:, :]

In [16]:
# Pairwise correlation between the numeric training features.
# AgeGroup is categorical; selecting numeric columns explicitly keeps it out
# of the matrix regardless of how the installed pandas version treats
# non-numeric columns in corr().
train.select_dtypes(include="number").corr()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,PassengerGroup,Deck,Side,Total_spent
HomePlanet,1.0,0.088922,0.036891,0.136235,0.124178,0.213651,0.070067,0.10434,0.055277,0.039858,-0.005318,-0.416538,0.001586,0.150705
CryoSleep,0.088922,1.0,-0.09767,-0.073326,-0.07788,-0.24542,-0.203577,-0.2098,-0.200642,-0.193781,-0.006133,0.011459,0.023201,-0.377145
Destination,0.036891,-0.09767,1.0,-0.007088,-0.045636,0.048036,-0.111404,0.025346,-0.057451,-0.074969,-0.001503,0.191398,-0.010359,-0.099948
Age,0.136235,-0.073326,-0.007088,1.0,0.0911,0.067818,0.12969,0.034948,0.124327,0.102921,-0.012076,-0.24827,0.011826,0.18863
VIP,0.124178,-0.07788,-0.045636,0.0911,1.0,0.056769,0.125811,0.020976,0.060388,0.122582,0.013608,-0.18279,-0.010288,0.163132
RoomService,0.213651,-0.24542,0.048036,0.067818,0.056769,1.0,-0.015804,0.059716,0.008869,-0.019756,0.000745,-0.031126,-0.012006,0.234759
FoodCourt,0.070067,-0.203577,-0.111404,0.12969,0.125811,-0.015804,1.0,-0.012403,0.221908,0.225444,-0.009274,-0.318222,0.018501,0.742164
ShoppingMall,0.10434,-0.2098,0.025346,0.034948,0.020976,0.059716,-0.012403,1.0,0.016045,-0.007755,0.015099,-0.032849,-0.0228,0.224009
Spa,0.055277,-0.200642,-0.057451,0.124327,0.060388,0.008869,0.221908,0.016045,1.0,0.149887,-0.004802,-0.225612,0.004139,0.593445
VRDeck,0.039858,-0.193781,-0.074969,0.102921,0.122582,-0.019756,0.225444,-0.007755,0.149887,1.0,0.015718,-0.252833,-0.00808,0.586177


In [17]:
# Re-create the train/test split right before modelling (this repeats the
# earlier split cell). The boundary is the number of labelled rows, len(y),
# instead of a hard-coded row count.
n_train = len(y)
train = X.iloc[:n_train, :]
test = X.iloc[n_train:, :]

In [18]:
# LightGBM training and prediction.
# Fit on the labelled training features, predict the test rows, and write the
# submission file.
lg = lgbm.LGBMClassifier(n_estimators = 500, max_depth = 50, learning_rate = 0.02,  random_state=1)
lg.fit(train, y)
lgResult = lg.predict(test)

# The predictions are the 1/0 class labels; store them as booleans.
# Casting the array directly avoids a chained inplace replace() on the column,
# which does not update the frame under pandas Copy-on-Write and would leave
# 1/0 in the CSV.
submission["Transported"] = lgResult.astype(bool)
submission.to_csv('submission.csv', index=False)