## Import relevant packages

In [None]:
# Import relevant packages 
import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC 

#from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
#from sklearn.linear_model import LogisticRegression
#from sklearn.neighbors import KNeighborsClassifier
#from sklearn.tree import DecisionTreeRegressor
#from sklearn.naive_bayes import GaussianNB

## Load the data

In [None]:
# Read the training split into a DataFrame and report its (rows, columns) shape.
train_csv_path = '/kaggle/input/spaceship-titanic-eik-lab/train.csv'
data = pd.read_csv(train_csv_path)
print("Full train dataset shape is {}".format(data.shape))

## Explore the data

In [None]:
# Preview the first five rows to sanity-check column names and value formats.
data.head(5)

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) as produced by
# pandas' default `describe` output.
data.describe()

In [None]:
# Print each column's dtype and non-null count; useful for spotting
# columns with missing values before cleaning.
data.info()

In [None]:
# Class balance of the target: number of rows per `Transported` value.
plot_df = data["Transported"].value_counts()
plot_df.plot.bar()

In [None]:
# One 50-bin histogram per selected numeric feature, stacked vertically.
hist_columns = ['Age', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

fig, ax = plt.subplots(len(hist_columns), 1, figsize=(10, 10))
plt.subplots_adjust(top=2)  # stretch the figure so the stacked axes do not overlap

for axis, column in zip(ax, hist_columns):
    sns.histplot(data[column], color='b', bins=50, ax=axis)

In [None]:
# Box plot of the raw columns to compare their value ranges and spot outliers.
data.boxplot()
# Rotate the column names so the tick labels do not overlap.
plt.xticks(rotation=45, ha='right')
plt.show()

## Clean the dataset

In [None]:
# Separate the target from the features.
# Columns are selected by name rather than by position (`iloc[:, 0:13]`), so
# the split no longer depends on how many columns the CSV has or on where
# `Transported` sits. Every column except the target becomes a feature, which
# matches how the test file (no `Transported` column) is processed later.
y = data["Transported"]
X = data.drop(columns=["Transported"])

In [None]:
# Identifier-like columns (PassengerId, Name) are excluded from the features.
X = X.drop(columns=['PassengerId', 'Name'])

In [None]:
# Break the "/"-separated Cabin string into its three parts and store each
# part as its own column; the combined column is then no longer needed.
X[["Deck", "Cabin_num", "Side"]] = X["Cabin"].str.split(pat="/", expand=True)
X = X.drop(columns=["Cabin"])

In [None]:
# Replace every categorical column by integer codes (one fresh LabelEncoder
# per column), then encode the target the same way.
#
# NOTE(review): `Cabin_num` holds strings produced by `str.split`, so its codes
# follow string sort order, not numeric order - confirm this is intended.
# NOTE(review): missing values reach the encoder here, before imputation;
# how they are encoded depends on the installed scikit-learn version - verify.

class_labels = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Cabin_num', 'Side']

for cl in class_labels:
    class_le = LabelEncoder()
    X[cl] = class_le.fit_transform(X[cl].values)

y = LabelEncoder().fit_transform(y.values)

In [None]:
# Report, per feature column, how many entries are missing.
print('Column number of missing values')
for column_name, missing_count in X.isnull().sum().items():
    print(f'{column_name:32} {missing_count}')

In [None]:
# Several columns still contain NaN values; replace each one with the mean
# of its column.

imr = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed_data = imr.fit_transform(X.values)

# The imputer returns a plain array, so rebuild the DataFrame with the
# original column names.
X = pd.DataFrame(imputed_data, columns=X.columns)


In [None]:
# Scale the entire dataset: standardise every column to zero mean and unit
# variance.
# The reductions are written with an explicit axis=0. `np.std(X)` forwards
# axis=None to `DataFrame.std`, a form recent pandas versions deprecate
# (it is slated to reduce over both axes and return a single scalar).
# ddof=0 keeps the population standard deviation that `np.std` computes.
X_scaled = (X - X.mean(axis=0)) / X.std(axis=0, ddof=0)

In [None]:
# Box plot of the standardised features; after scaling all columns share a
# comparable range, which makes outliers easier to compare across features.
X_scaled.boxplot()
# Rotate the column names so the tick labels do not overlap.
plt.xticks(rotation=45, ha='right')
plt.show()

## Prepare the dataset for training

In [None]:
# Hold out 30% of the rows for evaluation. `stratify=y` keeps the class
# proportions equal in both parts and the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=6,
    stratify=y,
)

In [None]:
# Scale the data: standardise both splits with statistics learned from the
# training split only, so no information from the held-out rows leaks in.
# The statistics are computed once, with an explicit axis=0 (`np.std(X_train)`
# forwards axis=None to `DataFrame.std`, which recent pandas versions
# deprecate) and ddof=0 to keep the population std that `np.std` uses.
train_mean = X_train.mean(axis=0)
train_std = X_train.std(axis=0, ddof=0)

X_train_scale = (X_train - train_mean) / train_std
X_test_scale = (X_test - train_mean) / train_std

## Train the model

#### List of classifiers 
* Perceptron
* Adaline 
* Logistic Regression
* Support vector machine
* K-nearest Neighbour 
* Decision trees
* Random forest 
* Naive Bayes 
* Gradient Boosting 

In [None]:
# Start the classifier
# An SVC is sensitive to feature scale, yet the fit / predict cells pass the
# unscaled `X_train` / `X_test`. Bundling a StandardScaler with the SVC makes
# those calls standardise the inputs automatically: the scaler is fitted on
# whatever data `fit` receives and re-applied unchanged inside `predict`.
classifier = make_pipeline(StandardScaler(), SVC())

In [None]:
# Train the classifier on the training split produced by `train_test_split`.
classifier.fit(X_train, y_train)

In [None]:
# Score the held-out split: predict its labels and report the fraction that
# match the true labels.
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy: ', accuracy)

## Load the test dataset

In [None]:
# Load the competition test set and push it through the same preprocessing
# as the training features. Every transformation below reuses what was
# learned from the TRAINING data (category codes, imputation values, scaling
# statistics); re-fitting on the test set would give the model inputs on a
# different encoding/scale than the one it was trained on.
test_data = pd.read_csv('/kaggle/input/spaceship-titanic-eik-lab/test.csv')

# Keep the ids for the submission file before they are dropped.
submission_id = test_data.PassengerId

# Remove Name and PassengerId
test_data = test_data.drop(['PassengerId', 'Name'], axis=1)

# Split cabin into three variables
test_data[["Deck", "Cabin_num", "Side"]] = test_data["Cabin"].str.split("/", expand=True)
test_data = test_data.drop('Cabin', axis=1)

# Change the categorical values to numerical values
class_labels = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Cabin_num', 'Side']

# Rebuild the raw training categories from `data` so each encoder can be
# fitted on exactly the values the training encoders saw, which reproduces
# the training label -> code mapping.
train_categories = data[['HomePlanet', 'CryoSleep', 'Destination', 'VIP']].copy()
train_categories[["Deck", "Cabin_num", "Side"]] = data["Cabin"].str.split("/", expand=True)

for cl in class_labels:
    class_le = LabelEncoder().fit(train_categories[cl].values)

    # Split the learned classes into a plain label -> code lookup plus the
    # code (if any) that the encoder assigned to missing values.
    code_of = {}
    missing_code = np.nan
    for code, label in enumerate(class_le.classes_):
        if pd.isna(label):
            missing_code = code
        else:
            code_of[label] = code

    raw_column = test_data[cl]
    # Labels never seen during training have no code; they become NaN and
    # are filled by the imputer below instead of raising in `transform`.
    encoded = pd.Series(
        [code_of.get(value, np.nan) for value in raw_column],
        index=raw_column.index,
        dtype=float,
    )
    # Missing test values get the same code missing training values received.
    encoded[raw_column.isna()] = missing_code
    test_data[cl] = encoded

# Same column order as the training features, so the positional imputer and
# the fitted model see the features in the order they were trained on.
test_data = test_data[X.columns]

# Replace NaN values with the column means learned from the training data:
# `imr` is the imputer fitted on the training features in the cleaning step.
imputed_data = imr.transform(test_data.values)

test_data = pd.DataFrame(imputed_data, columns=test_data.columns)

# Scale the test data with the training mean / std (explicit axis=0 and
# ddof=0, i.e. the same column-wise population statistics used for X_scaled).
test_data_scaled = (test_data - X.mean(axis=0)) / X.std(axis=0, ddof=0)


In [None]:
# Fit a fresh SVC on the complete standardised training set and predict the
# standardised test set.
submission = SVC()
predictions = submission.fit(X_scaled, y).predict(test_data_scaled)

# `y` was label-encoded to integers earlier; threshold the predicted codes
# to get back boolean values for the submission file.
bool_predictions = (predictions > 0.5).astype(bool)

output = pd.DataFrame({
    'PassengerId': submission_id,
    'Transported': bool_predictions,
})

output.head()

In [None]:
# Write the submission file using the layout of the provided sample file.
sample_submission_df = pd.read_csv('/kaggle/input/spaceship-titanic-eik-lab/sample_submission.csv')

# Attach predictions by PassengerId rather than by row position, so a sample
# file whose row order differs from test.csv cannot silently mislabel
# passengers.
# NOTE(review): assumes the sample file has a `PassengerId` column - confirm.
predicted_by_id = output.set_index('PassengerId')['Transported']
aligned_predictions = sample_submission_df['PassengerId'].map(predicted_by_id)
if aligned_predictions.isna().any():
    # Fail loudly instead of writing a submission with missing predictions.
    raise ValueError('sample_submission.csv contains PassengerIds without a prediction')

sample_submission_df['Transported'] = aligned_predictions.astype(bool)
sample_submission_df.to_csv('/kaggle/working/submission.csv', index=False)
sample_submission_df.head()