In [1]:
# Importing used libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb
import tensorflow as tf
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from tensorflow import keras




In [2]:
# Reading train and test data
raw_df = pd.read_csv('spaceship-titanic.csv')
raw_test = pd.read_csv('test.csv')
df = raw_df.copy()
test = raw_test.copy()

In [3]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [4]:
# Filling nan values of some numerical columns with most frequent which is 0
paid_cols = [
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
]
df[paid_cols] = df[paid_cols].fillna(0)
test[paid_cols] = test[paid_cols].fillna(0)

# Creating additive feature
df['Paid'] = df['RoomService'] + df['FoodCourt'] + df['ShoppingMall'] + df['Spa'] + df['VRDeck']
test['Paid'] = test['RoomService'] + test['FoodCourt'] + test['ShoppingMall'] + test['Spa'] + test['VRDeck']

In [5]:
# Imputing test (and train) data nan values with most frequency strategy
imputer = SimpleImputer(strategy='most_frequent')
#df_filled = imputer.fit_transform(df)
test_filled = imputer.fit_transform(test)
#df = pd.DataFrame(df_filled, columns=df.columns)
test = pd.DataFrame(test_filled, columns=test.columns)

In [6]:
# Splitting the Cabin column and creating 3 new columns that mat be meaningful
df[['deck', 'num', 'side']] = df['Cabin'].str.split('/', expand=True)
test[['deck', 'num', 'side']] = test['Cabin'].str.split('/', expand=True)


In [7]:
# Label encoding the new side column as it has only 2 unique values
le = LabelEncoder()
df['side'] = le.fit_transform(df['side'])
test['side'] = le.fit_transform(test['side'])

In [8]:
# Droping unimportant columns
redun = {
    'Name',
    'PassengerId',
    'Cabin',
    'num'
}
df.drop(redun, axis = 1, inplace = True)
test.drop(redun, axis = 1, inplace = True)

In [9]:
df.columns

Index(['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'RoomService',
       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Transported', 'Paid',
       'deck', 'side'],
      dtype='object')

In [10]:
# Imputing Age column with Random Forest model
target_variable = 'Age'
df['Age'] = df['Age'].replace(0, np.nan)

predictor_variables = ['CryoSleep', 'VIP', 'RoomService',
       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

imputer = SimpleImputer(strategy='mean')
predictor_df = df.copy()
predictor_df[predictor_variables] = imputer.fit_transform(predictor_df[predictor_variables])

train_data = predictor_df[~predictor_df[target_variable].isna()]
test_data = predictor_df[predictor_df[target_variable].isna()]

model = RandomForestClassifier()

model.fit(train_data[predictor_variables], train_data[target_variable])

missing_values = model.predict(test_data[predictor_variables])

df.loc[df[target_variable].isna(), target_variable] = missing_values

In [11]:
# Descretizing Age column

age_bins = [0, 10, 20, 30, 40, 50, 60, 70, 80]
age_labels = ['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79']

df['AgeRange'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels)
test['AgeRange'] = pd.cut(test['Age'], bins=age_bins, labels=age_labels)

df.drop('Age', axis = 1, inplace = True)
test.drop('Age', axis = 1, inplace = True)


In [12]:
# Droping other rows with nan values

df = df.dropna()

In [13]:
# Casting boolean columns as integers
cols = ['CryoSleep', 'VIP']
df['Transported'] = df['Transported'].astype(int)

for col in cols:
    df[col] = df[col].astype(int)
    test[col] = test[col].astype(int)

In [14]:
# One hot encoding other categorical values
cols = ['Destination', 'HomePlanet', 'deck', 'AgeRange']
#cols = ['Destination', 'HomePlanet', 'deck']
data = df
for col1 in cols:
    dummies = pd.get_dummies(df[col1], dtype=int)
    data = pd.concat([data, dummies], axis=1)

datat = test
for col2 in cols:
    dummies = pd.get_dummies(test[col2], dtype=int)
    datat = pd.concat([datat, dummies], axis=1)
data.info()


<class 'pandas.core.frame.DataFrame'>
Index: 7736 entries, 0 to 8692
Data columns (total 36 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   HomePlanet     7736 non-null   object  
 1   CryoSleep      7736 non-null   int32   
 2   Destination    7736 non-null   object  
 3   VIP            7736 non-null   int32   
 4   RoomService    7736 non-null   float64 
 5   FoodCourt      7736 non-null   float64 
 6   ShoppingMall   7736 non-null   float64 
 7   Spa            7736 non-null   float64 
 8   VRDeck         7736 non-null   float64 
 9   Transported    7736 non-null   int32   
 10  Paid           7736 non-null   float64 
 11  deck           7736 non-null   object  
 12  side           7736 non-null   int32   
 13  AgeRange       7736 non-null   category
 14  55 Cancri e    7736 non-null   int32   
 15  PSO J318.5-22  7736 non-null   int32   
 16  TRAPPIST-1e    7736 non-null   int32   
 17  Earth          7736 non-null   int32  

In [15]:
# Dropping unimportant columns
data.drop(data[cols], axis=1, inplace=True)
datat.drop(datat[cols], axis=1, inplace=True)


In [16]:
# Train test split (+Normalization)
X, y = data.drop('Transported', axis=1), data['Transported']

# scaler = MinMaxScaler()
# X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=60)

# X_train = scaler.fit_transform(X_train)
# X_test = scaler.fit_transform(X_test)


In [17]:
# Training Random Forest model
model = RandomForestClassifier()
model.fit(X_train, y_train)
preds = model.predict(X_test)
print('train accuracy = ' , accuracy_score(y_train, model.predict(X_train)))
print('test accuracy  = ' , accuracy_score(y_test, preds))


train accuracy =  0.9238724504452743
test accuracy  =  0.813953488372093


In [24]:
# Training XGBoost model
model = xgb.XGBClassifier()
model.fit(X_train, y_train)
preds = model.predict(X_test)
print('train accuracy = ' , accuracy_score(y_train, model.predict(X_train)))
print('test accuracy  = ' , accuracy_score(y_test, preds))

train accuracy =  0.8987359954036197
test accuracy  =  0.8242894056847545


In [19]:
# # Create and train SVM 
# svm_classifier = SVC(kernel='linear', C=1.0)
# svm_classifier.fit(X_train, y_train)

# # Make predictions on the test set
# y_pred = svm_classifier.predict(X_test)

# # Calculate the accuracy of the classifier
# accuracy = accuracy_score(y_test, y_pred)
# print(f"Accuracy: {accuracy}")


In [20]:
# # Training a neural network
# model = keras.Sequential([
#     keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
#     keras.layers.Dense(64, activation='relu'),
#     keras.layers.Dense(1, activation='sigmoid')
# ])

# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# model.fit(X_train, y_train, epochs=50, batch_size=32)

# train_probs = model.predict(X_train)
# train_preds = (train_probs > 0.5).astype(int).flatten()

# test_probs = model.predict(X_test)
# test_preds = (test_probs > 0.5).astype(int).flatten()

# train_accuracy = accuracy_score(y_train, train_preds)
# print(f"Training Accuracy: {train_accuracy:.2f}")

# test_accuracy = accuracy_score(y_test, test_preds)
# print(f"Test Accuracy: {test_accuracy:.2f}")


In [21]:
# # Predicting test data with a neural network

# model = keras.Sequential([
#     keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
#     keras.layers.Dense(64, activation='relu'),
#     keras.layers.Dense(1, activation='sigmoid')
# ])

# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# model.fit(X, y, epochs=50, batch_size=32)

# predictions = model.predict(datat)
# predictions = (predictions > 0.5).astype(int).flatten()

# predicted_labels = np.argmax(predictions, axis=1)



In [22]:
# # Submitting for the neural network (for Kaggle)
# submission_df = pd.DataFrame({
#     'PassengerId': raw_test['PassengerId'],
#     'Transported': predicted_labels.astype(bool)
# })
# submission_df.to_csv('submit.csv', index = False)

In [25]:
# Submitting for other models (for Kaggle)
model.fit(X, y)
datat = datat.astype(int)
predictions = model.predict(datat)
submission_df = pd.DataFrame({
    'PassengerId': raw_test['PassengerId'],
    'Transported': predictions.astype(bool)
})
submission_df.to_csv('submit.csv', index = False)