# Spaceship Titanic

## Import

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn import neighbors
import torch
from torch import nn
import torch.nn.functional as functional
import copy

# from gudkit.ml import CustomDataset, preprocessing, see_correlation, knn_param_search, MLP, traning_mlp
import gudkit

## Analysis of the dataframe

In [None]:
# List every file below the data directory (recursively) so the available
# inputs and outputs are visible before anything is loaded.
for folder, _, names in os.walk('/home/gu/ros_ws/src/kaggle-competitions/datas'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

In [None]:
# Load the raw training and test CSV files, in that order, into
# `og_df` and `og_test_df`.
og_df, og_test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/home/gu/ros_ws/src/kaggle-competitions/datas/in/spaceship_titanic_train.csv",
        "/home/gu/ros_ws/src/kaggle-competitions/datas/in/spaceship_titanic_test.csv",
    )
)

In [None]:
# Show summary statistics of the raw training frame as the cell output.
og_df.describe()

In [None]:
# Print the column list with non-null counts and dtypes of the raw training
# frame; useful for spotting columns with missing values.
og_df.info()

In [None]:
# Draw one 50-bin histogram per selected column of the raw training frame,
# stacked vertically in a single figure.
columns = ['Age', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
fig, axs = plt.subplots(nrows=len(columns), ncols=1, figsize=(10, 10))
# top=2 places the top edge of the subplot area at twice the figure height,
# which gives every panel more vertical room.
plt.subplots_adjust(top=2)
for ax, column in zip(axs, columns):
    ax.hist(og_df[column], bins=50)
    ax.set_title(column)

In [None]:
# Preview the first rows of the raw training frame.
og_df.head()

## Preprocessing

In [None]:
# Run the gudkit preprocessing on both the raw training and the raw test frame.
# The helper is reached through `gudkit.ml` because only `import gudkit` is
# active in the import cell (the `from gudkit.ml import ...` line is commented
# out), so the bare name `preprocessing` would raise a NameError.
df = gudkit.ml.preprocessing(og_df)
test_df = gudkit.ml.preprocessing(og_test_df)
# Preview the preprocessed training frame as the cell output.
df.head()

In [None]:
# Inspect how the preprocessed features relate to the 'Transported' target.
# Called through `gudkit.ml` because the bare name `see_correlation` is not
# imported (only `import gudkit` is active) and would raise a NameError.
gudkit.ml.see_correlation(df, 'Transported')

## Preparing the data for the models

In [None]:
# Split the preprocessed frame 80/20 into training and validation rows.
df_train, df_valid = torch.utils.data.random_split(df, [0.8, 0.2])
# random_split returns Subset objects. Their `.dataset` attribute is the WHOLE
# original frame, so using it would put every row in both datasets and leak
# training data into validation. The rows belonging to each split must be
# selected through `.indices` instead.
# The index is reset so positional and label-based lookups agree inside
# CustomDataset (its indexing scheme is not visible here -- TODO confirm).
dt_train = gudkit.ml.CustomDataset(
    df.iloc[df_train.indices].reset_index(drop=True), 'Transported'
)
dt_valid = gudkit.ml.CustomDataset(
    df.iloc[df_valid.indices].reset_index(drop=True), 'Transported'
)
# Shared DataLoader settings: batches of 100 rows, 6 worker processes.
kwargs = {'batch_size': 100, 'num_workers': 6}
loader_train = torch.utils.data.DataLoader(dt_train, **kwargs, shuffle=True)
loader_valid = torch.utils.data.DataLoader(dt_valid, **kwargs, shuffle=True)

### KNN

#### Training

In [None]:
# Run gudkit's KNN parameter search on the training and validation feature /
# label arrays. `classifier` is used for the test predictions afterwards;
# `data` is presumably the search results -- confirm in gudkit.ml.
classifier, data = gudkit.ml.knn_param_search(dt_train.xs.values, dt_train.ys.values, dt_valid.xs.values, dt_valid.ys.values)

#### Predictions

In [None]:
# Predict the test set with the KNN classifier and write the submission file.
X_test = test_df.values
predictions = classifier.predict(X_test)
# Build the submission from the first column of the raw test frame (presumably
# the passenger id -- confirm) plus the boolean predictions. The raw frame is
# deliberately NOT mutated: adding a 'Transported' column to `og_test_df`
# would leak into `preprocessing(og_test_df)` whenever that cell is re-run,
# and selecting the prediction by position (-1) is fragile.
result = og_test_df.iloc[:, [0]].assign(Transported=predictions.astype(bool))
result.to_csv('/home/gu/ros_ws/src/kaggle-competitions/datas/out/knn_results.csv', index=False)

### MLP

In [None]:
# Train the MLP with the training and validation loaders. `traning_mlp` is the
# helper's actual (misspelled) name in gudkit.ml. The result is indexable;
# element 0 is used as the model afterwards.
best_model = gudkit.ml.traning_mlp(loader_train, loader_valid)

In [None]:
# Predict the test set with the trained MLP and write the submission file.
model = best_model[0]
# Build the submission from the first column of the raw test frame (presumably
# the passenger id -- confirm) plus the predictions, without mutating
# `og_test_df` and without relying on the prediction being the last column.
# NOTE(review): unlike the KNN cell, the output of `model.predict` is written
# as-is, without a cast to bool -- confirm it already yields True/False.
result = og_test_df.iloc[:, [0]].assign(Transported=model.predict(test_df.values))
result.to_csv('/home/gu/ros_ws/src/kaggle-competitions/datas/out/mlp_results.csv', index=False)