<a href="https://www.kaggle.com/code/carlosmatos97/spaceship-titanic?scriptVersionId=213271536" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

## This notebook is divided into 4 parts
### 1 - Data import
### 2 - Data treatment 
### 3 - Data load
### 4 - Predictions

In [1]:
# Adapted from https://www.youtube.com/watch?v=_55G24aghPY&list=PL98nY_tJQXZnP-k3qCDd1hljVSciDV9_N
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score


In [2]:
# Read the labelled training split (it includes the `Transported` target) and preview it.
train_data = pd.read_csv(filepath_or_buffer="/kaggle/input/spaceship-titanic/train.csv")
train_data.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
# Read the unlabelled test split (same columns as train, minus `Transported`) and preview it.
test_data = pd.read_csv(filepath_or_buffer="/kaggle/input/spaceship-titanic/test.csv")
test_data.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


### 2 - Data Treatment

In [4]:
# Show the first five raw training rows again as a reference point for the clean-up below.
train_data.head(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [5]:
# Transport outcome per home planet. Rows whose HomePlanet is missing match none
# of the three labels and are therefore left out of these figures.
planets = ['Europa', 'Earth', 'Mars']
transported_by_planet = {
    planet: train_data.loc[train_data.HomePlanet == planet, 'Transported']
    for planet in planets
}

# Fraction of each planet's own passengers that were transported.
for planet, flags in transported_by_planet.items():
    print(planet + " transported: " + str(sum(flags) / len(flags)))
print()

# Each planet's share of all transported passengers (over the three known planets).
total_transported = sum(sum(flags) for flags in transported_by_planet.values())
for planet, flags in transported_by_planet.items():
    print("Rate " + planet + " transported: " + str(sum(flags) / total_transported))

Europa transported: 0.65884561238855
Earth transported: 0.42394611038678837
Mars transported: 0.5230244457077885

Rate Europa transported: 0.32842105263157895
Rate Earth transported: 0.45637426900584793
Rate Mars transported: 0.2152046783625731


In [6]:
# Impute missing values.
#
# * HomePlanet -> 'Earth': Earth accounts for the largest share of transported
#   passengers in the breakdown printed above, so it is used as the default.
# * Cabin      -> 'NoData/NoData/NoData': keeps the deck/num/side layout so the
#   later split on '/' still yields three parts.
# * Age        -> mean of the training ages.
# * Spend columns -> median of the corresponding training column.
#
# All statistics come from the training split only and are applied to both
# splits, so train and test are imputed with exactly the same values.
#
# The frames are reassigned instead of calling
# `frame[col].fillna(..., inplace=True)`: that form mutates a temporary Series,
# emits a FutureWarning on pandas 2.2 and silently does nothing once
# Copy-on-Write is enabled.
SPEND_COLUMNS = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

fill_values = {
    'HomePlanet': 'Earth',
    'Cabin': 'NoData/NoData/NoData',
    'Age': train_data['Age'].mean(),
}
fill_values.update(train_data[SPEND_COLUMNS].median().to_dict())

train_data = train_data.fillna(value=fill_values)
test_data = test_data.fillna(value=fill_values)

In [7]:
# CryoSleep -> 0/1 flag. Missing values compare unequal to True and therefore
# become 0, which is what the previous get_dummies(...)[1] lookup produced.
# That lookup only worked because the integer key 1 happened to match the
# `True` label of an object-dtype column index.
train_data['CryoSleep'] = train_data['CryoSleep'].eq(True).astype(np.uint8)
test_data['CryoSleep'] = test_data['CryoSleep'].eq(True).astype(np.uint8)

# Cabin is "deck/num/side"; only the deck (first part) is kept, one-hot encoded.
train_deck_dummies = pd.get_dummies(train_data['Cabin'].str.split('/').str[0])
# Align the test dummies to the training deck columns so both frames expose the
# same features in the same order even if a deck is absent from one split.
test_deck_dummies = (
    pd.get_dummies(test_data['Cabin'].str.split('/').str[0])
    .reindex(columns=train_deck_dummies.columns, fill_value=False)
)

# Append the deck indicators, then drop the raw Cabin text and the Name column.
train_data = pd.concat([train_data, train_deck_dummies], axis=1).drop(columns=['Cabin', 'Name'])
test_data = pd.concat([test_data, test_deck_dummies], axis=1).drop(columns=['Cabin', 'Name'])


In [8]:
print(train_data["Destination"].unique())

DESTINATIONS = ['TRAPPIST-1e', 'PSO J318.5-22', '55 Cancri e']


def _destination_counts(frame):
    """Return the number of rows of `frame` per destination, in DESTINATIONS order."""
    return [int((frame['Destination'] == name).sum()) for name in DESTINATIONS]


# Share of each destination among the rows whose destination is known.
total_destination = sum(_destination_counts(train_data))
print('Percent of each Destiny of total destination')
for count in _destination_counts(train_data):
    print(count / total_destination)

# Missing destinations default to 'TRAPPIST-1e'. The columns are reassigned
# instead of using `frame[col].fillna(..., inplace=True)`, which acts on a
# temporary Series and is a silent no-op under pandas Copy-on-Write.
train_data['Destination'] = train_data['Destination'].fillna('TRAPPIST-1e')
test_data['Destination'] = test_data['Destination'].fillna('TRAPPIST-1e')
print()
# NOTE(review): these shares still divide by the pre-fill total, so they add up
# to slightly more than 1 (the excess is the share of imputed rows). Kept as-is
# so the printed figures stay comparable with the first set.
for count in _destination_counts(train_data):
    print(count / total_destination)

# Passengers with an unknown VIP status are treated as non-VIP; the explicit
# cast makes the resulting dtype independent of pandas' downcasting rules.
train_data['VIP'] = train_data['VIP'].fillna(False).astype(bool)
test_data['VIP'] = test_data['VIP'].fillna(False).astype(bool)


['TRAPPIST-1e' 'PSO J318.5-22' '55 Cancri e' nan]
Percent of each Destiny of total destination
0.6949829632240629
0.09352602514393138
0.21149101163200565

0.716367054400188
0.09352602514393138
0.21149101163200565


In [9]:
# Explicit raw-label -> column-name maps. The previous code overwrote
# `dummies.columns` with a positional list, which silently mislabels (or
# crashes on) a split in which one category is missing.
HOME_PLANET_COLUMNS = {
    'Earth': 'Home_Earth',
    'Europa': 'Home_Europa',
    'Mars': 'Home_Mars',
}
# NOTE: the Dest_* names do not describe the destinations they encode; they
# reproduce the positional names assigned previously (get_dummies orders the
# labels alphabetically). They are kept because later cells refer to these
# columns by name.
DESTINATION_COLUMNS = {
    '55 Cancri e': 'Dest_Earth',
    'PSO J318.5-22': 'Dest_Europa',
    'TRAPPIST-1e': 'Dest_Mars',
}


def _one_hot(frame, column, label_to_name):
    """Replace `column` of `frame` by one indicator column per known label.

    Args:
        frame: DataFrame containing `column`.
        column: Name of the categorical column to encode.
        label_to_name: Mapping of raw label -> output column name; its order
            defines the order of the appended indicator columns.

    Returns:
        A new DataFrame without `column` and with the indicator columns
        appended on the right.

    Raises:
        ValueError: If `column` contains a label missing from `label_to_name`.
    """
    labels = frame[column]
    unexpected = set(labels.dropna().unique()) - set(label_to_name)
    if unexpected:
        raise ValueError("Unexpected %s values: %s" % (column, sorted(unexpected)))
    dummies = (
        pd.get_dummies(labels)
        # Guarantees every expected indicator exists, even if a label is absent.
        .reindex(columns=list(label_to_name), fill_value=False)
        .rename(columns=label_to_name)
    )
    return pd.concat([frame.drop(columns=[column]), dummies], axis=1)


def _vip_as_flag(frame):
    """Return `frame` with VIP converted to a 0/1 uint8 column placed last."""
    vip_flag = frame['VIP'].eq(True).astype(np.uint8)
    return frame.drop(columns=['VIP']).assign(VIP=vip_flag)


train_data = _one_hot(train_data, 'HomePlanet', HOME_PLANET_COLUMNS)
test_data = _one_hot(test_data, 'HomePlanet', HOME_PLANET_COLUMNS)

train_data = _one_hot(train_data, 'Destination', DESTINATION_COLUMNS)
test_data = _one_hot(test_data, 'Destination', DESTINATION_COLUMNS)

train_data = _vip_as_flag(train_data)
test_data = _vip_as_flag(test_data)

In [10]:
# Display the dtype of every column currently in the training frame.
train_data.dtypes

PassengerId      object
CryoSleep         uint8
Age             float64
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Transported        bool
A                  bool
B                  bool
C                  bool
D                  bool
E                  bool
F                  bool
G                  bool
NoData             bool
T                  bool
Home_Earth         bool
Home_Europa        bool
Home_Mars          bool
Dest_Earth         bool
Dest_Europa        bool
Dest_Mars          bool
VIP               uint8
dtype: object

In [11]:
# Cast every indicator column to a 0/1 uint8 column.
#
# A direct `astype` replaces the former per-column
# `pd.get_dummies(col)[True]` round-trip, which raised KeyError whenever a
# column contained no True value at all (e.g. a deck absent from one split).
FLAG_COLUMNS = [
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'NoData', 'T',
    'Home_Earth', 'Home_Europa', 'Home_Mars',
    'Dest_Earth', 'Dest_Europa', 'Dest_Mars',
]

# The target only exists in the training split, so it is cast there only.
train_data = train_data.astype({name: np.uint8 for name in FLAG_COLUMNS + ['Transported']})
test_data = test_data.astype({name: np.uint8 for name in FLAG_COLUMNS})

In [12]:
# Display the dtype of every column currently in the test frame.
test_data.dtypes

PassengerId      object
CryoSleep         uint8
Age             float64
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
A                 uint8
B                 uint8
C                 uint8
D                 uint8
E                 uint8
F                 uint8
G                 uint8
NoData            uint8
T                 uint8
Home_Earth        uint8
Home_Europa       uint8
Home_Mars         uint8
Dest_Earth        uint8
Dest_Europa       uint8
Dest_Mars         uint8
VIP               uint8
dtype: object

In [13]:
# Scale the numeric features by dividing each one by its maximum.
#
# The divisors are taken from the training split and applied to BOTH splits.
# Scaling the test set by its own maxima (as was done before) maps the same raw
# value to different feature values in train and test, so the thresholds the
# model learns on train would not line up with the test data.
SCALED_COLUMNS = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Computed once, before the training columns are overwritten.
# A zero maximum is replaced by 1 so an all-zero column stays zero instead of
# turning into NaN.
train_max = train_data[SCALED_COLUMNS].max().replace(0, 1)

train_data[SCALED_COLUMNS] = train_data[SCALED_COLUMNS] / train_max
test_data[SCALED_COLUMNS] = test_data[SCALED_COLUMNS] / train_max

In [14]:
# Correlation of every column with the target, listed from the most positive
# to the most negative.
target_corr = train_data.corr()['Transported']
series = target_corr.rename("Transported")
series.sort_values(ascending=False)

Transported     1.000000
CryoSleep       0.460132
Home_Europa     0.176916
B               0.144733
Dest_Earth      0.108722
C               0.108193
FoodCourt       0.045583
PassengerId     0.021491
Home_Mars       0.019544
G               0.016269
ShoppingMall    0.009391
Dest_Europa     0.000092
NoData         -0.000340
A              -0.002623
T              -0.014568
D              -0.034046
VIP            -0.037261
Age            -0.074249
F              -0.087753
Dest_Mars      -0.096319
E              -0.097965
Home_Earth     -0.168845
VRDeck         -0.204874
Spa            -0.218545
RoomService    -0.241124
Name: Transported, dtype: float64

In [15]:
#train_data = train_data.drop(series[series < -0.20].sort_values(ascending=False).index.tolist(), axis = 1)
#test_data = test_data.drop(series[series < -0.20].sort_values(ascending=False).index.tolist(), axis = 1)

In [16]:
# Display the processed training frame (the notebook renders a truncated preview).
train_data

Unnamed: 0,PassengerId,CryoSleep,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,A,...,G,NoData,T,Home_Earth,Home_Europa,Home_Mars,Dest_Earth,Dest_Europa,Dest_Mars,VIP
0,0001_01,0,0.493671,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0002_01,0,0.303797,0.007608,0.000302,0.001064,0.024500,0.001823,1,0,...,0,0,0,1,0,0,0,0,1,0
2,0003_01,0,0.734177,0.003001,0.119948,0.000000,0.299670,0.002030,0,1,...,0,0,0,0,1,0,0,0,1,1
3,0003_02,0,0.417722,0.000000,0.043035,0.015793,0.148563,0.007997,0,1,...,0,0,0,0,1,0,0,0,1,0
4,0004_01,0,0.202532,0.021149,0.002348,0.006428,0.025214,0.000083,1,0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,0,0.518987,0.000000,0.228726,0.000000,0.073322,0.003066,0,1,...,0,0,0,0,1,0,1,0,0,1
8689,9278_01,1,0.227848,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,...,1,0,0,1,0,0,0,1,0,0
8690,9279_01,0,0.329114,0.000000,0.000000,0.079687,0.000045,0.000000,1,0,...,1,0,0,1,0,0,0,0,1,0
8691,9280_01,0,0.405063,0.000000,0.035186,0.000000,0.015753,0.134049,0,0,...,0,0,0,0,1,0,1,0,0,0


In [17]:
# Separate the target from the features.
# `np.bool8` was deprecated in NumPy 1.24 and removed in NumPy 2.0; the builtin
# `bool` yields the same boolean array on every NumPy version.
y = train_data['Transported'].to_numpy(dtype=bool)
# Reassign rather than drop in place so the frame is not mutated behind the
# back of earlier cells that displayed it.
train_data = train_data.drop(columns=['Transported'])

In [18]:
# PassengerId is an identifier, not a feature: it is left out of the model
# input and only used to label the submission rows. Selecting the test columns
# through the training column list also guarantees that both frames present
# the same features in the same order.
feature_columns = [name for name in train_data.columns if name != 'PassengerId']

model = RandomForestClassifier(n_estimators=1000, max_depth=10, min_samples_leaf=2, random_state=1)
# Optional sanity check (slow with 1000 trees):
# scores = cross_val_score(model, train_data[feature_columns], y, cv=5)
# print("Mean Accuracy was: ", sum(scores) / len(scores))
# print(scores)
model.fit(train_data[feature_columns], y)
predictions = model.predict(test_data[feature_columns])

# `y` is boolean, so `predictions` holds True/False values for the submission file.
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Transported': predictions})
output.to_csv('Titanic_sub1.csv', index=False)
print("Success")

Success
