In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the train and test CSVs straight from their raw GitHub URLs.
df_train, df_test = (
    pd.read_csv('https://raw.githubusercontent.com/Data-Nova/spaceship_titanic_challenge/refs/heads/main/train.csv'),
    pd.read_csv('https://raw.githubusercontent.com/Data-Nova/spaceship_titanic_challenge/refs/heads/main/test.csv'),
)

# Preview the first rows of each frame (train first, then test).
print(df_train.head(), df_test.head(), sep='\n')

  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
0     0001_01     Europa     False  B/0/P  TRAPPIST-1e  39.0  False   
1     0002_01      Earth     False  F/0/S  TRAPPIST-1e  24.0  False   
2     0003_01     Europa     False  A/0/S  TRAPPIST-1e  58.0   True   
3     0003_02     Europa     False  A/0/S  TRAPPIST-1e  33.0  False   
4     0004_01      Earth     False  F/1/S  TRAPPIST-1e  16.0  False   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck               Name  \
0          0.0        0.0           0.0     0.0     0.0    Maham Ofracculy   
1        109.0        9.0          25.0   549.0    44.0       Juanna Vines   
2         43.0     3576.0           0.0  6715.0    49.0      Altark Susent   
3          0.0     1283.0         371.0  3329.0   193.0       Solam Susent   
4        303.0       70.0         151.0   565.0     2.0  Willy Santantines   

   Transported  
0        False  
1         True  
2        False  
3        False  
4         True  
  

In [3]:
# Column labels of each frame, one list per line, so the two can be compared by eye.
print(df_train.columns.tolist(), df_test.columns.tolist(), sep='\n')

['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name', 'Transported']
['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name']


In [4]:
# Report the (rows, columns) shape of each frame.
for label, frame in (("Train data shape:", df_train), ("Test data shape:", df_test)):
    print(label, frame.shape)

# Then print the per-column dtype / non-null summary of each frame.
for heading, frame in (("\nTrain data info:", df_train), ("\nTest data info:", df_test)):
    print(heading)
    frame.info()

Train data shape: (8693, 14)
Test data shape: (4277, 13)

Train data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB

Test data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 

In [5]:
# Summary statistics for each frame, train first and then test.
for heading, frame in (("\nTrain data description:", df_train), ("\nTest data description:", df_test)):
    print(heading)
    display(frame.describe())


Train data description:


Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0



Test data description:


Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,4186.0,4195.0,4171.0,4179.0,4176.0,4197.0
mean,28.658146,219.266269,439.484296,177.295525,303.052443,310.710031
std,14.179072,607.011289,1527.663045,560.821123,1117.186015,1246.994742
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,26.0,0.0,0.0,0.0,0.0,0.0
75%,37.0,53.0,78.0,33.0,50.0,36.0
max,79.0,11567.0,25273.0,8292.0,19844.0,22272.0


In [6]:
# Number of unique values per column, shown for the training data and then
# for the test data.
for heading, frame in (("Unique values in train data:", df_train), ("\nUnique values in test data:", df_test)):
    print(heading)
    display(frame.nunique())

Unique values in train data:


Unnamed: 0,0
PassengerId,8693
HomePlanet,3
CryoSleep,2
Cabin,6560
Destination,3
Age,80
VIP,2
RoomService,1273
FoodCourt,1507
ShoppingMall,1115



Unique values in test data:


Unnamed: 0,0
PassengerId,4277
HomePlanet,3
CryoSleep,2
Cabin,3265
Destination,3
Age,79
VIP,2
RoomService,842
FoodCourt,902
ShoppingMall,715


In [7]:
# Extract information from Cabin column
def extract_cabin_info(df):
    """Split the ``Cabin`` column into deck, number and side columns.

    ``Cabin`` values are '/'-separated strings such as ``'B/0/P'``. The first
    three parts are written to new columns:

    * ``Cabin_Deck`` - first part, kept as text.
    * ``Cabin_Num``  - second part converted to a number; anything that cannot
      be parsed as a number becomes NaN.
    * ``Cabin_Side`` - third part, kept as text.

    Rows whose ``Cabin`` is missing, or has fewer than three parts, get NaN in
    the columns that cannot be filled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Cabin`` column of strings. It is modified in
        place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the three columns added (or overwritten).
    """
    # The split only produces as many columns as the longest value has parts,
    # so a frame without any complete cabin string would be missing column 1
    # and/or 2. Reindexing guarantees that columns 0, 1 and 2 always exist;
    # absent ones are filled with NaN instead of raising KeyError below.
    cabin_info = df['Cabin'].str.split('/', expand=True).reindex(columns=[0, 1, 2])
    df['Cabin_Deck'] = cabin_info[0]
    df['Cabin_Num'] = pd.to_numeric(cabin_info[1], errors='coerce')  # Convert to numeric, coercing errors to NaN
    df['Cabin_Side'] = cabin_info[2]
    return df

# Add the Cabin_Deck / Cabin_Num / Cabin_Side columns to both frames.
df_train, df_test = extract_cabin_info(df_train), extract_cabin_info(df_test)

# Show the original Cabin value next to its extracted parts for the first rows.
print(
    df_train.head()[['Cabin', 'Cabin_Deck', 'Cabin_Num', 'Cabin_Side']],
    df_test.head()[['Cabin', 'Cabin_Deck', 'Cabin_Num', 'Cabin_Side']],
    sep='\n',
)


   Cabin Cabin_Deck  Cabin_Num Cabin_Side
0  B/0/P          B        0.0          P
1  F/0/S          F        0.0          S
2  A/0/S          A        0.0          S
3  A/0/S          A        0.0          S
4  F/1/S          F        1.0          S
   Cabin Cabin_Deck  Cabin_Num Cabin_Side
0  G/3/S          G        3.0          S
1  F/4/S          F        4.0          S
2  C/0/S          C        0.0          S
3  C/1/S          C        1.0          S
4  F/5/S          F        5.0          S


In [8]:
# After splitting the cabin column: number of unique values per column,
# shown for the training data and then for the test data.
for heading, frame in (("Unique values in train data:", df_train), ("\nUnique values in test data:", df_test)):
    print(heading)
    display(frame.nunique())

Unique values in train data:


Unnamed: 0,0
PassengerId,8693
HomePlanet,3
CryoSleep,2
Cabin,6560
Destination,3
Age,80
VIP,2
RoomService,1273
FoodCourt,1507
ShoppingMall,1115



Unique values in test data:


Unnamed: 0,0
PassengerId,4277
HomePlanet,3
CryoSleep,2
Cabin,3265
Destination,3
Age,79
VIP,2
RoomService,842
FoodCourt,902
ShoppingMall,715


In [9]:
# Count missing entries per column in the training data and show only the
# columns that have at least one.
print("Missing values in train data:")
train_missing = df_train.isna().sum()
train_missing_info = train_missing.to_frame('Missing Count')
display(train_missing_info.loc[train_missing_info['Missing Count'].gt(0)])

# Same report for the test data.
print("\nMissing values in test data:")
test_missing = df_test.isna().sum()
test_missing_info = test_missing.to_frame('Missing Count')
display(test_missing_info.loc[test_missing_info['Missing Count'].gt(0)])

Missing values in train data:


Unnamed: 0,Missing Count
HomePlanet,201
CryoSleep,217
Cabin,199
Destination,182
Age,179
VIP,203
RoomService,181
FoodCourt,183
ShoppingMall,208
Spa,183



Missing values in test data:


Unnamed: 0,Missing Count
HomePlanet,87
CryoSleep,93
Cabin,100
Destination,92
Age,91
VIP,93
RoomService,82
FoodCourt,106
ShoppingMall,98
Spa,101
