### Importing the libraries to be used for the project

In [None]:
#!pip install missingpy

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from scipy import stats
from sklearn.impute import KNNImputer
import numpy as np
from sklearn.ensemble import RandomForestClassifier
#from missingpy import MissForest

## Data Exploration and Preprocessing

In [2]:
# Read the three CSV files (expected in the working directory) into DataFrames.
df_gender, df_train, df_test = (
    pd.read_csv(csv_name)
    for csv_name in ('gender_submission.csv', 'train.csv', 'test.csv')
)

In [None]:
# Share of each Survived value among all training rows (class-balance check).
df_train['Survived'].value_counts().div(len(df_train))

In [None]:
def nulls_summary_table(df):
    """
    Summarise the missing values of every column in a DataFrame.

    Parameters:
    df (DataFrame): Dataframe to inspect

    Returns:
    DataFrame: one row per column of `df`, with `null_count` (number of
    null entries) and `null_pct` (null entries divided by the number of
    rows, i.e. a fraction between 0 and 1, not a 0-100 percentage)
    """
    missing_per_column = df.isnull().sum()
    return pd.DataFrame({
        'null_count': missing_per_column,
        'null_pct': missing_per_column / len(df),
    })

# Show the per-column null counts and null fractions of the training data.
nulls_summary_table(df_train)

The features below hold very little predictive power and hinder the model's performance. Fare is highly correlated with Class, so I kept Class, because it has a lower p-value than Fare against Survived.

In [None]:
#df_test.drop(labels=['PassengerId','Cabin','Name','Ticket','SibSp','Fare','Age'], axis=1, inplace=True)

In [None]:
#df_test.shape

In [None]:
#df_gender.drop(labels=['PassengerId'], axis=1, inplace=True)

In [None]:
#df_gender.shape

The features below hold very little predictive power and hinder the model's performance. Fare is highly correlated with Class, so I kept Class, because it has a lower p-value than Fare against Survived. Note: the cell below drops only PassengerId, Cabin, Name, Ticket and Age; Fare is still kept as an input.

In [3]:
# Drop the columns that are not used as model inputs.
# The result is reassigned (instead of dropping in place) and errors='ignore'
# is passed so the cell can be re-run safely: a second execution no longer
# raises KeyError because the columns are already gone.
df_train = df_train.drop(
    columns=['PassengerId', 'Cabin', 'Name', 'Ticket', 'Age'],
    errors='ignore',
)

In [4]:
# Encode Sex as numeric codes (male -> 0, female -> 1) stored as a categorical
# column. Any value that is not a key of the mapping becomes NaN.
sex_codes = {'male': 0, 'female': 1}
df_train['Sex'] = df_train['Sex'].map(sex_codes).astype('category')

In [5]:
# Encode the embarkation port as numeric codes (S -> 1, C -> 2, Q -> 3) stored
# as a categorical column. Any value that is not a key of the mapping
# (including missing values) becomes NaN.
embarked_codes = {'S': 1, 'C': 2, 'Q': 3}
df_train['Embarked'] = df_train['Embarked'].map(embarked_codes).astype('category')

In [6]:
# Discard every row that still contains at least one missing value.
df_train = df_train.dropna(axis='index')

In [7]:
# Check the (rows, columns) count left after dropping columns and incomplete rows.
df_train.shape

(889, 7)

In [11]:
# Cast every column (including the categorical Sex/Embarked codes) to float so
# the frame is purely numeric for the correlation test and the NumPy steps below.
df_train = df_train.astype(float)

In [None]:
# Pearson correlation coefficient and p-value of each listed feature against Survived.
df_attributes = ['Pclass','Sex','Parch','Embarked']
survived_column = df_train['Survived']
for feature_name in df_attributes:
    pearson_coef, p_value = stats.pearsonr(df_train[feature_name], survived_column)
    print(feature_name)
    print('The Pearson Correlation Coefficient for ', feature_name, ' is ', pearson_coef, 'with a P-value of P =', p_value)

In [12]:
# Take an independent copy of the cleaned frame for the modelling steps.
df = df_train.copy()

In [13]:
# Separate the label from the features by column NAME rather than by position,
# so this step no longer silently depends on 'Survived' being the first column.
# The remaining feature columns keep their original order.
targets = df['Survived']
inputs = df.drop(columns='Survived')

In [15]:
# Convert the feature frame to a plain NumPy array for the index-based steps below.
inputs = inputs.to_numpy()

In [16]:
# Convert the label series to a plain NumPy array, aligned row-for-row with `inputs`.
targets = targets.to_numpy()

## Balancing the data

In [17]:
# Count how many targets are 1 (meaning that the passenger survived)
num_one_targets = int(np.sum(targets))

# Row positions of every target that is 0 (the passenger did not survive), in row order
zero_target_indices = np.flatnonzero(targets == 0)

# Total number of targets that are 0
zero_targets_counter = int(zero_target_indices.size)

# We want a "balanced" dataset, so some input/target pairs have to be removed.
# Keep the first `num_one_targets` zeros and mark every later zero for removal.
# NOTE: this is deterministic - the zeros that come last in row order are the
# ones dropped, not a random sample. If there are not more 0s than 1s, nothing
# is removed.
indices_to_remove = zero_target_indices[num_one_targets:].tolist()

# Create two new variables, one for the balanced inputs and one for the balanced targets,
# by deleting every row marked "to remove" above.
unscaled_inputs_equal_priors = np.delete(inputs, indices_to_remove, axis=0)
targets_equal_priors = np.delete(targets, indices_to_remove, axis=0)

In [18]:
# Standardize every feature column of the balanced inputs with StandardScaler.
# NOTE(review): the scaler is fit on ALL balanced rows, before the
# train/validation/test split further down, so validation and test rows
# influence the scaling applied to the training data. Consider fitting the
# scaler on the training rows only and reusing it for the other subsets.
transform = StandardScaler()
scaled_inputs = transform.fit_transform(unscaled_inputs_equal_priors)

In [19]:
# Inspect the standardized feature matrix.
scaled_inputs

array([[ 0.90589762, -0.84430228,  0.4382279 , -0.51751726, -0.53041081,
        -0.59951774],
       [-1.42306413,  1.18440992,  0.4382279 , -0.51751726,  0.63091985,
         0.96845174],
       [ 0.90589762,  1.18440992, -0.51080067, -0.51751726, -0.51816877,
        -0.59951774],
       ...,
       [-0.25858325,  1.18440992, -0.51080067,  0.79558624, -0.19035415,
        -0.59951774],
       [-1.42306413,  1.18440992, -0.51080067, -0.51751726, -0.11780873,
        -0.59951774],
       [-1.42306413, -0.84430228, -0.51080067, -0.51751726, -0.11780873,
         0.96845174]])

### Shuffle The Data

In [20]:
# Shuffle the rows so the subsets taken below do not follow the original row order.
# A dedicated, seeded generator makes the permutation (and therefore the
# splits derived from it) identical on every run of the notebook.
shuffle_rng = np.random.default_rng(42)
shuffled_indices = shuffle_rng.permutation(scaled_inputs.shape[0])

# Apply the same permutation to inputs and targets so the pairs stay aligned.
shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_equal_priors[shuffled_indices]

## Split the dataset into train, validation and test

In [21]:
# Total number of (shuffled) samples available
samples_count = shuffled_inputs.shape[0]

# Subset sizes for an 80-10-10 train/validation/test distribution.
# int() truncates, and the test subset absorbs whatever is left over.
train_samples_count = int(0.8 * samples_count)
validation_samples_count = int(0.1 * samples_count)
test_samples_count = samples_count - train_samples_count - validation_samples_count

# Row offsets at which the training block and the validation block end
train_end = train_samples_count
validation_end = train_samples_count + validation_samples_count

# Training subset: the first "train_samples_count" observations
train_inputs, train_targets = shuffled_inputs[:train_end], shuffled_targets[:train_end]

# Validation subset: the next "validation_samples_count" observations
validation_inputs = shuffled_inputs[train_end:validation_end]
validation_targets = shuffled_targets[train_end:validation_end]

# Test subset: everything that remains
test_inputs, test_targets = shuffled_inputs[validation_end:], shuffled_targets[validation_end:]

# The full dataset was balanced 50-50 (targets 0 and 1), but the subsets are
# slices of a shuffled dataset, so each one is only approximately balanced.
# Unless the shuffle is seeded, the exact numbers change on every rerun.
# Preprocessing is normally done ONCE; rerunning the whole notebook overwrites
# the saved npz files with the newly preprocessed data.

# For each subset print: number of targets that are 1, number of samples, and their ratio.
for subset_targets, subset_size in (
    (train_targets, train_samples_count),
    (validation_targets, validation_samples_count),
    (test_targets, test_samples_count),
):
    ones_in_subset = np.sum(subset_targets)
    print(ones_in_subset, subset_size, ones_in_subset / subset_size)

276.0 544 0.5073529411764706
29.0 68 0.4264705882352941
35.0 68 0.5147058823529411


## Save the datasets in *.npz

In [22]:
# Write each subset to its own .npz archive (np.savez appends the extension),
# storing the arrays under the keys 'inputs' and 'targets'.
for npz_name, subset_inputs, subset_targets in (
    ('Titanic_data_train', train_inputs, train_targets),
    ('Titanic_data_validation', validation_inputs, validation_targets),
    ('Titanic_data_test', test_inputs, test_targets),
):
    np.savez(npz_name, inputs=subset_inputs, targets=subset_targets)