In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
import seaborn as sns

In [3]:
import os
from zipfile import ZipFile

In [4]:
file = ZipFile("./spaceship-titanic.zip",mode='r')

In [5]:
file.extractall()

### Data Preprocessing

In [6]:
train = pd.read_csv("train.csv")

In [7]:
test = pd.read_csv("test.csv")

In [8]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [9]:
test.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [10]:
X = train.iloc[:, :-1]
y = train.iloc[:,-1]

In [11]:
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [12]:
X.shape

(8693, 13)

In [13]:
mean_imputer = SimpleImputer(strategy="mean",missing_values=np.nan)         # For numerical values
frequent_imputer = SimpleImputer(strategy="most_frequent",missing_values=np.nan)           # For categorical values

In [14]:
X.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name'],
      dtype='object')

In [15]:
X.drop("PassengerId",axis=1,inplace=True)
X.drop("Name",axis=1,inplace=True)               # Dropping not required columns

#### Handling Missing Values

In [16]:
# Handling missing values
 
def preprocess(X):
    mean_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    freq_cols = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination','VIP']
    
    for cols in mean_cols:
        X[cols] = mean_imputer.fit_transform(X[cols].values.reshape(-1,1))
    
    for cols in freq_cols:
        X[cols] = frequent_imputer.fit_transform(X[cols].values.reshape(-1,1))
        
    return X
    





In [17]:
X.isnull().sum()

HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
dtype: int64

In [18]:
X = preprocess(X)

In [19]:
test.drop("PassengerId",axis=1,inplace=True)
test.drop("Name",axis=1,inplace=True)               # Dropping not required columns

In [20]:
X.shape

(8693, 11)

In [21]:
test.isnull().sum()

HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
dtype: int64

In [22]:
test = preprocess(test)

In [23]:
test

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,Earth,True,G/3/S,TRAPPIST-1e,27.000000,False,0.0,0.0,0.0,0.0,0.0
1,Earth,False,F/4/S,TRAPPIST-1e,19.000000,False,0.0,9.0,0.0,2823.0,0.0
2,Europa,True,C/0/S,55 Cancri e,31.000000,False,0.0,0.0,0.0,0.0,0.0
3,Europa,False,C/1/S,TRAPPIST-1e,38.000000,False,0.0,6652.0,0.0,181.0,585.0
4,Earth,False,F/5/S,TRAPPIST-1e,20.000000,False,10.0,0.0,635.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
4272,Earth,True,G/1496/S,TRAPPIST-1e,34.000000,False,0.0,0.0,0.0,0.0,0.0
4273,Earth,False,G/160/P,TRAPPIST-1e,42.000000,False,0.0,847.0,17.0,10.0,144.0
4274,Mars,True,D/296/P,55 Cancri e,28.658146,False,0.0,0.0,0.0,0.0,0.0
4275,Europa,False,D/297/P,TRAPPIST-1e,28.658146,False,0.0,2680.0,0.0,0.0,523.0


In [24]:
X.isnull().sum()

HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
dtype: int64

In [25]:
test.isnull().sum()

HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
dtype: int64

#### Handling Categorical Values

In [26]:
X.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0
1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0
2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0
3,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0
4,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0


In [27]:
# dropping cabin as multiple categorical values
X.drop("Cabin",inplace=True,axis=1)
test.drop("Cabin",inplace=True,axis=1)

In [28]:
X = pd.get_dummies(X,drop_first=True)

In [29]:
X.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_True,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_True
0,39.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,1,0
1,24.0,109.0,9.0,25.0,549.0,44.0,0,0,0,0,1,0
2,58.0,43.0,3576.0,0.0,6715.0,49.0,1,0,0,0,1,1
3,33.0,0.0,1283.0,371.0,3329.0,193.0,1,0,0,0,1,0
4,16.0,303.0,70.0,151.0,565.0,2.0,0,0,0,0,1,0


In [30]:
test = pd.get_dummies(test,drop_first=True)

In [31]:
test

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_True,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_True
0,27.000000,0.0,0.0,0.0,0.0,0.0,0,0,1,0,1,0
1,19.000000,0.0,9.0,0.0,2823.0,0.0,0,0,0,0,1,0
2,31.000000,0.0,0.0,0.0,0.0,0.0,1,0,1,0,0,0
3,38.000000,0.0,6652.0,0.0,181.0,585.0,1,0,0,0,1,0
4,20.000000,10.0,0.0,635.0,0.0,0.0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
4272,34.000000,0.0,0.0,0.0,0.0,0.0,0,0,1,0,1,0
4273,42.000000,0.0,847.0,17.0,10.0,144.0,0,0,0,0,1,0
4274,28.658146,0.0,0.0,0.0,0.0,0.0,0,1,1,0,0,0
4275,28.658146,0.0,2680.0,0.0,0.0,523.0,1,0,0,0,1,0


#### Scaling

In [32]:
from sklearn.preprocessing import StandardScaler

In [33]:
scaler = StandardScaler()

In [34]:
X = scaler.fit_transform(X)

In [35]:
X.shape

(8693, 12)

In [36]:
test = scaler.fit_transform(test)

In [37]:
test.shape

(4277, 12)

#### Train test split

In [38]:
from sklearn.model_selection import train_test_split

In [39]:
print("X shape : {}\ny Shape: {}".format(X.shape,y.shape))                          # checking shape

X shape : (8693, 12)
y Shape: (8693,)


In [40]:
X_train, X_test, Y_train, Y_test = train_test_split(X,y,shuffle=True,test_size=0.2)

In [41]:
from sklearn.preprocessing import StandardScaler

In [42]:
scaler = StandardScaler()

In [43]:
scaler

StandardScaler()

### Building and Running Model

In [45]:
from sklearn.tree import DecisionTreeClassifier

In [46]:
dtc = DecisionTreeClassifier()

In [47]:
dtc.fit(X_train,Y_train)

DecisionTreeClassifier()

In [48]:
predictions = dtc.predict(X_test)

In [49]:
predictions

array([ True, False, False, ...,  True, False,  True])

In [50]:
from sklearn.metrics import classification_report

In [51]:
print(classification_report(Y_test,predictions)) 

              precision    recall  f1-score   support

       False       0.73      0.71      0.72       852
        True       0.73      0.74      0.74       887

    accuracy                           0.73      1739
   macro avg       0.73      0.73      0.73      1739
weighted avg       0.73      0.73      0.73      1739



Acquired 74 % Accuracy

In [103]:
preds = dtc.predict(test)

In [104]:
submission = pd.read_csv("./test.csv")

In [105]:
submission.isnull().sum()

PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64

In [106]:
preds = pd.DataFrame(preds)

In [107]:
preds.rename(columns={0:"Transported"},inplace=True)

In [108]:
preds

Unnamed: 0,Transported
0,True
1,False
2,True
3,True
4,True
...,...
4272,True
4273,False
4274,True
4275,True


In [109]:
preds.index = submission.index

In [110]:
submission = pd.concat([submission,preds],axis=1)

In [111]:
submission.isnull().sum()

PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
Transported       0
dtype: int64

In [114]:
submission1 = pd.DataFrame(pd.concat([submission["PassengerId"],submission["Transported"]],axis=1))

In [118]:
submission1.to_csv("submission1.csv",index=False)

In [119]:
from sklearn.ensemble import RandomForestClassifier

In [147]:
rfc = RandomForestClassifier(n_estimators=200)

In [148]:
rfc.fit(X_train,Y_train)

RandomForestClassifier(n_estimators=200)

In [149]:
preds = rfc.predict(X_test)

In [150]:
print(classification_report(Y_test,preds))

              precision    recall  f1-score   support

       False       0.77      0.79      0.78       852
        True       0.79      0.77      0.78       887

    accuracy                           0.78      1739
   macro avg       0.78      0.78      0.78      1739
weighted avg       0.78      0.78      0.78      1739



Aquired 78 % Accuracy

In [124]:
preds = rfc.predict(test)

In [125]:
submission = pd.read_csv("./test.csv")

In [126]:
submission.isnull().sum()

PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64

In [127]:
preds = pd.DataFrame(preds)

In [128]:
preds.rename(columns={0:"Transported"},inplace=True)

In [129]:
preds

Unnamed: 0,Transported
0,True
1,False
2,True
3,True
4,True
...,...
4272,True
4273,False
4274,True
4275,True


In [130]:
preds.index = submission.index

In [131]:
submission = pd.concat([submission,preds],axis=1)

In [132]:
submission.isnull().sum()

PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
Transported       0
dtype: int64

In [133]:
submission2 = pd.DataFrame(pd.concat([submission["PassengerId"],submission["Transported"]],axis=1))

In [134]:
submission2.to_csv("submission2.csv",index=False)