# Kaggle [Spaceship Titanic](https://www.kaggle.com/competitions/spaceship-titanic/overview) contest

The objective of this contest is to evaluate if certain passengers were transported or not.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import os

for dirname, _, filenames in os.walk('./'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

train_data = pd.read_csv("./train.csv")
train_data.head()

test_data = pd.read_csv("./test.csv")
numberOfTrees=200
maximumDepth=5
minimumLeaves=10
minimumSplit=20

In [None]:
test_data.head()

In [None]:
# Test Data parsing
Seperate_Passenger_ID = test_data["PassengerId"].str.split("_", n=1, expand=True)
test_data["PassengerGroup"]=Seperate_Passenger_ID[0]
test_data["PassengerNumber"]=Seperate_Passenger_ID[1]

Seperate_Cabin=test_data["Cabin"].str.split("/", n=2, expand=True)
test_data["Deck"]=Seperate_Cabin[0]
test_data["DeckNumber"]=Seperate_Cabin[1]
test_data["DeckSide"]=Seperate_Cabin[2]
test_data.drop(columns=["Cabin"], inplace=True)


In [None]:
# Train Data parsing

Seperate_Passenger_ID = train_data["PassengerId"].str.split("_", n=1, expand=True)
train_data["PassengerGroup"]=Seperate_Passenger_ID[0]
train_data["PassengerNumber"]=Seperate_Passenger_ID[1]

Seperate_Cabin=train_data["Cabin"].str.split("/", n=2, expand=True)
train_data["Deck"]=Seperate_Cabin[0]
train_data["DeckNumber"]=Seperate_Cabin[1]
train_data["DeckSide"]=Seperate_Cabin[2]
train_data.drop(columns=["Cabin"], inplace=True)

In [None]:
train_data['DeckNumber'] = train_data['DeckNumber'].astype('Int64')
train_data['PassengerGroup'] = train_data['PassengerGroup'].astype('Int64')
train_data['PassengerNumber'] = train_data['PassengerNumber'].astype('Int64')
# train_data['CryoSleep'] = train_data['CryoSleep'].map({'True': True, 'False': False})
train_data['HomePlanet'] = train_data['HomePlanet'].map({'Earth': 0, 'Europa': 1, 'Mars': 2})
train_data['Destination'] = train_data['Destination'].map({'55 Cancri e': 0, 'PSO J318.5-22': 1,'TRAPPIST-1e': 2})
train_data['Deck'] = train_data['Deck'].map({'A': 0,'B': 1,'C': 2,'D': 3,'E': 4,'F': 5,'G': 6,'H': 7,'I': 8,'J': 9,'K': 10,'L': 11,'M': 12,'N': 13,'O': 14,'P': 15,'Q': 16,'R': 17,'S': 18,'T': 19,'U': 20,'V': 21,'W': 22,'X': 23,'Y': 24,'Z': 25})
train_data['DeckSide']=train_data['DeckSide'].map({'S': 0, 'P': 1})

test_data['DeckNumber'] = test_data['DeckNumber'].astype('Int64')
test_data['PassengerGroup'] = test_data['PassengerGroup'].astype('Int64')
test_data['PassengerNumber'] = test_data['PassengerNumber'].astype('Int64')
# test_data['CryoSleep'] = test_data['CryoSleep'].map({'True': True, 'False': False})
test_data['HomePlanet'] = test_data['HomePlanet'].map({'Earth': 0, 'Europa': 1, 'Mars': 2})
test_data['Destination'] = test_data['Destination'].map({'55 Cancri e': 0, 'PSO J318.5-22': 1,'TRAPPIST-1e': 2})
test_data['Deck'] = test_data['Deck'].map({'A': 0,'B': 1,'C': 2,'D': 3,'E': 4,'F': 5,'G': 6,'H': 7,'I': 8,'J': 9,'K': 10,'L': 11,'M': 12,'N': 13,'O': 14,'P': 15,'Q': 16,'R': 17,'S': 18,'T': 19,'U': 20,'V': 21,'W': 22,'X': 23,'Y': 24,'Z': 25})
test_data['DeckSide']=test_data['DeckSide'].map({'S': 0, 'P': 1})

print(train_data.dtypes)

# train_data = train_data.astype({'PassengerGroup': int, 'PassengerNumber': int, 'DeckNumber': int})
# test_data = test_data.astype({'PassengerGroup': int, 'PassengerNumber': int, 'DeckNumber': int})


In [None]:
# Checking data

fig , ax = plt.subplots(figsize=(6,4))
sns.countplot(x='Transported', data=train_data)
plt.title("Count of Survival")
plt.show()

n=len(train_data)
surv_0=len(train_data[train_data['Transported']==True])
surv_1=len(train_data[train_data['Transported']==False])

print("% of passanger survived in train dataset: ",surv_1*100/n)
print("% of passanger not survived in train dataset: ",surv_0*100/n)

In [None]:
correlation=train_data[['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService','FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck','PassengerGroup', 'PassengerNumber', 'Deck', 'DeckNumber', 'DeckSide']].corr()

fig, ax =plt.subplots(figsize=(8, 6))
plt.title("Correlation Plot")
sns.heatmap(correlation, mask=np.zeros_like(correlation, dtype=bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)
plt.show()


In [38]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
import warnings

warnings.filterwarnings('ignore')

X = train_data.drop(['Name', 'Transported'], axis = 1)

y = train_data['Transported']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

rfc = RandomForestClassifier()

forest_params = [{'max_depth': list(range(1, 10)), 'max_features': list(range(1,8)), "min_samples_leaf": list(range(1,9))}]

clf = GridSearchCV(rfc, forest_params, cv = 10, scoring='accuracy')

clf.fit(X_train, y_train)

print(clf.best_params_)

print(clf.best_score_)

{'max_depth': 9, 'max_features': 4, 'min_samples_leaf': 2}
0.80493529081324


In [None]:
from sklearn.ensemble import RandomForestClassifier

y = train_data["Transported"]

features = ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService','FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck','PassengerGroup', 'PassengerNumber', 'Deck', 'DeckNumber', 'DeckSide']
X = pd.get_dummies(train_data[features])
print(X)
X_test = pd.get_dummies(test_data[features])
print(X_test)

model = RandomForestClassifier(n_estimators=numberOfTrees, max_depth=maximumDepth, min_samples_leaf=minimumLeaves,min_samples_split=minimumSplit,random_state=1)
model.fit(X, y)

In [None]:
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.loc[:,"PassengerId"], 'Transported': predictions})
output.to_csv('submissionTestingSettings.csv', index=False)
print("Your submission was successfully saved!")

In [None]:
train_data_answers=train_data["Transported"]

train_data_predictions=model.predict(X)
train_data_predictions_df = pd.DataFrame({'PassengerId': train_data.loc[:,"PassengerId"], 'Transported': train_data_predictions})


total=0
correct=0

for i in range(len(train_data_predictions_df)):
    predictedRow=train_data_predictions_df.iloc[i]
    predictedId=predictedRow["PassengerId"]
    predictedDiagnosis=predictedRow["Transported"]

    trainRow=train_data.iloc[i]
    trainId=trainRow["PassengerId"]
    trainDiagnosis=trainRow["Transported"]

    if trainId!=predictedId:
        print("not same id at row "+str(i))
    else:
        total=total+1
        if predictedDiagnosis==trainDiagnosis:
            correct=correct+1

print("Total rows: "+str(total)+"\nCorrect diagnosis: "+str(correct)+"\nPercentage correct: "+str(correct/total))


In [None]:
from sklearn.ensemble import RandomForestClassifier

training_data=train_data.iloc[:80*len(train_data)//100]
validation_data=train_data.iloc[80*len(train_data)//100:]

y_training = training_data["Transported"]

features = ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService','FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck','PassengerGroup', 'PassengerNumber', 'Deck', 'DeckNumber', 'DeckSide']

X_Training = pd.get_dummies(training_data[features])
X_Validation = pd.get_dummies(validation_data[features])
validation_data_answers=validation_data[["PassengerId","Transported"]]


bestAccuracy=0
bestNumberOfTrees=0
bestDepth=0

for trees in range(50,550,50):
    for depth in range(1,16):
        model = RandomForestClassifier(n_estimators=trees, max_depth=depth, random_state=1)
        model.fit(X_Training, y_training)

        validation_data_predictions=model.predict(X_Validation)
        validation_data_predictions_df = pd.DataFrame({'PassengerId': validation_data.loc[:,"PassengerId"], 'Transported': validation_data_predictions})


        total=0
        correct=0

        for i in range(len(validation_data_predictions_df)):
            predictedRow=validation_data_predictions_df.iloc[i]
            predictedId=predictedRow["PassengerId"]
            predictedDiagnosis=predictedRow["Transported"]

            trainRow=validation_data_answers.iloc[i]
            trainId=trainRow["PassengerId"]
            trainDiagnosis=trainRow["Transported"]

            if trainId!=predictedId:
                print("not same id at row "+str(i))
            else:
                total=total+1
                if predictedDiagnosis==trainDiagnosis:
                    correct=correct+1

        print("Number of trees: "+str(trees))
        print("Maximum depth of trees: "+str(depth))
        print("Percentage correct: "+str(correct/total)+"\n")

        if correct/total>bestAccuracy:
            bestAccuracy=correct/total
            bestNumberOfTrees=trees
            bestDepth=depth


print("Top accuracy: "+str(bestAccuracy))
print("Number of trees: "+str(bestNumberOfTrees))
print("Depth: "+ str(bestDepth))
