In [99]:
import pandas as pd
import numpy as np

original_data = pd.read_csv("data/train.csv")
original_data = original_data.sample(frac=1).reset_index(drop=True)
total = len(original_data)
train_size = int(total * 0.80)

data = original_data[:train_size]
test_data = original_data[train_size:]

In [100]:
def step_1(data, test=False):
    data = data.drop(["PassengerId", "Name", "Cabin"], axis=1)
    data["CryoSleep"] = data["CryoSleep"].factorize()[0]
    data["VIP"] = data["VIP"].factorize()[0]

    if not test:
        data["Transported"] = data["Transported"].factorize()[0]
        
    return data

### Remove NaN

In [101]:
def remove_nan(data):
    for column in data.columns:
        
        if data[column].dtype == "float64":
            average = data[column].mean()
            data[column] = data[column].fillna(average)

        else:
            data[column] = data[column].fillna(f"{column}_None")

    return data

In [102]:
def one_hot_encode(data):
    planet_dummies = pd.get_dummies(data["HomePlanet"])
    destination_dummies = pd.get_dummies(data["Destination"])

    for column in planet_dummies.columns:
        planet_dummies[column] = planet_dummies[column].factorize()[0]

    for column in destination_dummies.columns:
        destination_dummies[column] = destination_dummies[column].factorize()[0]

    data = data.join(planet_dummies)
    data = data.join(destination_dummies)
    data = data.drop(["HomePlanet", "Destination"], axis=1)
    return data

In [103]:
data = step_1(data)
data = remove_nan(data)
data = one_hot_encode(data)

test_data = step_1(test_data)
test_data = remove_nan(test_data)
test_data = one_hot_encode(test_data)

data.head()

HomePlanet: has: 0 / 6954 missing values
CryoSleep: has: 0 / 6954 missing values
Destination: has: 0 / 6954 missing values
Age: has: 0 / 6954 missing values
VIP: has: 0 / 6954 missing values
RoomService: has: 0 / 6954 missing values
FoodCourt: has: 0 / 6954 missing values
ShoppingMall: has: 0 / 6954 missing values
Spa: has: 0 / 6954 missing values
VRDeck: has: 0 / 6954 missing values
Transported: has: 0 / 6954 missing values
HomePlanet: has: 0 / 1739 missing values
CryoSleep: has: 0 / 1739 missing values
Destination: has: 0 / 1739 missing values
Age: has: 0 / 1739 missing values
VIP: has: 0 / 1739 missing values
RoomService: has: 0 / 1739 missing values
FoodCourt: has: 0 / 1739 missing values
ShoppingMall: has: 0 / 1739 missing values
Spa: has: 0 / 1739 missing values
VRDeck: has: 0 / 1739 missing values
Transported: has: 0 / 1739 missing values


Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Earth,Europa,HomePlanet_None,Mars,55 Cancri e,Destination_None,PSO J318.5-22,TRAPPIST-1e
0,0,40.0,0,0.0,0.0,2414.0,0.0,0.0,0,0,0,0,0,0,0,0,0
1,1,26.0,0,0.0,0.0,0.0,0.0,0.0,0,1,1,0,0,1,0,0,1
2,0,19.0,0,0.0,162.0,540.0,0.0,0.0,1,0,0,0,0,0,0,0,0
3,0,34.0,0,0.0,3126.0,0.0,112.0,15.0,0,1,1,0,0,1,0,0,1
4,0,26.0,0,2042.0,0.0,2387.0,10.0,0.0,0,1,0,0,1,0,0,1,1


In [104]:
test_data.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Earth,Europa,HomePlanet_None,Mars,55 Cancri e,Destination_None,PSO J318.5-22,TRAPPIST-1e
6954,0,38.0,0,1517.0,0.0,0.0,27.0,0.0,0,0,0,0,0,0,0,0,0
6955,0,35.0,0,1248.0,0.0,55.0,12.0,0.0,1,1,0,0,1,0,0,1,1
6956,1,20.0,0,0.0,0.0,0.0,0.0,0.0,1,1,0,0,1,0,0,0,0
6957,0,42.0,0,0.0,0.0,1677.0,65.0,62.0,1,0,0,0,0,0,0,0,0
6958,0,17.0,0,372.0,0.0,477.0,0.0,0.0,0,0,0,0,0,0,0,0,0


In [105]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error
import numpy as np

y = data["Transported"]
x = data.drop(["Transported"], axis=1)

test_y = test_data["Transported"]
test_x = test_data.drop(["Transported"], axis=1)


model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(x, y)
predictions = model.predict(test_x)

test_y = test_y.reset_index(drop=True)

total_correct = 0
for index, prediction in enumerate(test_y):

    if prediction == test_y[index]:
        total_correct += 1

print("R^2 : ", r2_score(test_y, predictions))
print("MAE :", mean_absolute_error(test_y, predictions))
print("RMSE:",np.sqrt(mean_squared_error(test_y, predictions)))
print(f"Accuracy: {(total_correct / len(test_y)) * 100}")


R^2 :  -2.066264246852785
MAE : 0.7659574468085106
RMSE: 0.8751899489873673
Accuracy: 100.0


In [114]:
loaded_test = pd.read_csv("data/test.csv")
final_test = step_1(loaded_test, test=True)
final_test = remove_nan(final_test)
final_test = one_hot_encode(final_test)

final_predictions = model.predict(final_test)



HomePlanet: has: 0 / 4277 missing values
CryoSleep: has: 0 / 4277 missing values
Destination: has: 0 / 4277 missing values
Age: has: 0 / 4277 missing values
VIP: has: 0 / 4277 missing values
RoomService: has: 0 / 4277 missing values
FoodCourt: has: 0 / 4277 missing values
ShoppingMall: has: 0 / 4277 missing values
Spa: has: 0 / 4277 missing values
VRDeck: has: 0 / 4277 missing values
Your submission was successfully saved!


In [117]:
actual_submission = []
for prediction in final_predictions:
    if prediction ==  1:
        actual_submission.append(True)
    else:
        actual_submission.append(False)

output = pd.DataFrame({'PassengerId': loaded_test.PassengerId, 'Transported': actual_submission})
output.to_csv('data/submission.csv', index=False)
print("Your submission was successfully saved!")


Your submission was successfully saved!
