In [None]:
import sklearn
import torch.optim.optimizer
from sklearn import tree
import os
import numpy as np
import pandas as pd
import kcu as utils
import matplotlib.pyplot as plt
import seaborn as sns
import torch.optim as optim
import pyspark

from kcu import dataset

# Fetch the Titanic data set through the project helper.
# NOTE(review): presumably this downloads/prepares the CSV files read below —
# confirm against kcu.dataset.TitanicDataset.
dataset.TitanicDataset().get_dataset()

# Locate the CSV files relative to the current working directory.
# os.path.join is used instead of string concatenation so the path
# separators stay correct on every platform.
cwdir = os.getcwd()
titanic_dir = os.path.join(cwdir, "..", "..", "data", "titanic")
trainfile = os.path.join(titanic_dir, "train.csv")
testfile = os.path.join(titanic_dir, "test.csv")
train_data_pd = pd.read_csv(trainfile)

# Show the first few samples.
print(train_data_pd.head())

# Summary statistics of the numeric columns. Printed explicitly: a bare
# expression is only rendered when it is the last statement of a cell, so
# without the print this result would be silently discarded.
print(train_data_pd.describe())

# Build the data-frame report with the project helper.
# NOTE(review): what the report contains / how it is rendered is defined in
# kcu.utils.report_dataframe — not visible from this notebook.
profile = utils.utils.report_dataframe(train_data_pd)

In [None]:
from pyspark.sql import SparkSession

# Create (or reuse) a Spark session against the "local[1]" master.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()
)

# Load the training CSV with the "header" option enabled.
df = spark.read.option("header", "True").csv(trainfile)

In [None]:
# Echo the Spark DataFrame read from the training CSV; as the only expression
# in the cell, its repr becomes the cell output.
df

In [None]:
# Takeaways from the data-frame report:
# * "Name" takes many distinct values
# * Several strong/significant correlations exist
# * IDs are unique, so they carry no signal as features -> feature selection

features = ['Pclass', 'Sex', 'SibSp', 'Parch']

# One-hot encode the selected columns, then discard "Sex_male": next to
# "Sex_female" it is redundant and yields no information gain.
X_train = pd.get_dummies(train_data_pd[features]).drop(columns="Sex_male")
Y_train = train_data_pd['Survived']

# Disabled experiment, kept for reference:
#multi_corr = utils.utils.multiple_correlation(train_data_pd[['Pclass', 'SibSp', 'Parch', "Survived"]], "Survived")
#plt.show()

# Correlation matrix of the encoded feature columns (project helper).
utils.utils.correlation_matrix(pd.DataFrame(X_train))

In [None]:
# Imported explicitly: a bare `import sklearn` does not guarantee that the
# `model_selection` submodule is available as an attribute of the package.
from sklearn.model_selection import train_test_split

#utils.boilerplates.determine_durations(len(X_train.columns), 10000, sklearn.svm.SVC())

# Fixed seed so the hold-out split is identical on every Restart & Run All.
SPLIT_SEED = 42

# Split row positions 80/20, then slice features and labels with the same
# positions so both stay aligned.
train_inds, val_inds = train_test_split(
    np.arange(len(Y_train)), test_size=0.2, random_state=SPLIT_SEED
)
train_X, val_X = X_train.iloc[train_inds], X_train.iloc[val_inds]
train_Y, val_Y = Y_train.iloc[train_inds], Y_train.iloc[val_inds]

# Try several classifiers using k-fold cross-validation.
# NOTE(review): this runs on the full X_train/Y_train, i.e. it also includes
# the rows held out above as the validation split.
performances = utils.boilerplates.run_several_classifiers(X_train, Y_train, cv=True)

# Report their performances: one box per method, built from the "method" and
# "accuracy" columns of the result.
chart = sns.boxplot(x="method", y="accuracy", data=performances)
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment='right')
plt.title("Accuracy of various methods using 5-fold CV")
plt.tight_layout()
plt.show()

In [None]:
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tabular ResNet baseline from the project model zoo. The input width is
# derived from the feature frame instead of being hard-coded, so the model
# follows any change to the selected features (currently 4 columns).
model = utils.models.ResNet.make_baseline(
     d_in=train_X.shape[1],
     d=16,
     d_intermidiate=16,
     dropout_first=0.2,
     dropout_second=0.0,
     n_blocks=2,
     d_out=2
).to(device)
lr = 0.001
weight_decay = 0.0

optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

# Convert the pandas data to tensors with explicit dtypes. Without a dtype,
# a frame mixing integer columns with boolean dummy columns (the default
# get_dummies output in pandas >= 2.0) becomes an object array, which torch
# cannot convert. Features are float32; labels are int64 class indices as
# required by CrossEntropyLoss.
train_dset = TensorDataset(
    torch.tensor(train_X.to_numpy(dtype="float32")),
    torch.tensor(train_Y.to_numpy(dtype="int64")),
)
train_loader = DataLoader(train_dset, batch_size=128, shuffle=True)

# Validation data uses the same conversion; order is kept fixed (no shuffle).
val_dset = TensorDataset(
    torch.tensor(val_X.to_numpy(dtype="float32")),
    torch.tensor(val_Y.to_numpy(dtype="int64")),
)
val_loader = DataLoader(val_dset, batch_size=128, shuffle=False)

# Train with the project training loop.
# NOTE(review): arguments are positional — 20 is presumably the number of
# epochs and the trailing True a verbosity/plot flag; confirm against
# kcu.boilerplates.train_classifier.
utils.boilerplates.train_classifier(model, optimizer, train_loader,
                                    device, 20, nn.CrossEntropyLoss(), val_loader, True)