### Importing libraries and data

In [None]:
# imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

In [None]:
# Tell the reader where the data set comes from.
print("Source:", "https://www.kaggle.com/c/titanic/data")

# Load the passenger table from the CSV file in the course data folder.
titanic_csv = '../../data/day_3/titanic/titanic.csv'
data = pd.read_csv(titanic_csv)

# Last expression of the cell, so the notebook renders the DataFrame.
data

In [None]:
# Read the plain-text description of the data set (stored as UTF-8) into
# memory; the file is closed as soon as the `with` block ends.
with open('../../data/day_3/titanic/titanic_meta.txt', encoding='utf-8') as meta_file:
    inhalt = meta_file.read()

# Show the metadata text.
print(inhalt)

### Train-Test split

In [None]:
# Separate the label ("survived") from the remaining feature columns.
target_variable = data["survived"]
input_data = data.drop(columns=["survived"])

# Hold out 20 % of the rows for testing; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    input_data,
    target_variable,
    test_size=0.2,
    random_state=42,
)

# Remember the initial number of training rows so that later cells can report
# how many rows were removed during cleaning.
init_length_X_train = X_train.shape[0]
print("Train set size:", init_length_X_train)

### Data cleaning

In [None]:
# List every training column together with the dtype pandas inferred while
# reading the CSV, and collect the columns that were read as generic
# `object` so they can be inspected more closely afterwards.
print("Spalte, Eingelesener datentyp", "\n")

object_cols = []
for col, dtype in X_train.dtypes.items():
    print(f"{col}, {dtype}")
    if dtype == "object":
        object_cols.append(col)

# The umlaut used to be emitted as the mojibake sequence "Ã¶"
# (UTF-8 bytes decoded as Latin-1); print the intended character instead.
print("\n", "Möglicherweise fehlerhafte Spalten:", object_cols)

In [None]:
# For each column that was read as `object`, print its distinct values so
# that typos or mixed representations become visible.
for name in object_cols:
    distinct_values = X_train[name].unique()
    print(name, distinct_values)

In [None]:
# Count the missing entries per training column; the resulting Series
# (indexed by column name) is reused by the following cells.
missing_values = X_train.isna().sum()
print(missing_values, "\n")

In [None]:
# Split the columns that contain missing values into two groups:
#   * drop_cols     - few missing values: the affected ROWS are dropped later
#   * not_drop_cols - many missing values: the gaps are imputed later

# define the threshold and calculate the absolute threshold number
drop_below_relative = 0.01
drop_below_absolute = drop_below_relative * len(X_train)

# only columns that actually contain missing values are of interest
has_missing = missing_values > 0
below_threshold = missing_values < drop_below_absolute

# columns whose number of missing values is below the threshold
drop_cols = missing_values[has_missing & below_threshold].index.tolist()
print("The rows with missing values of the following columns will be dropped:", "\n", drop_cols, "\n\n")

# columns whose number of missing values is at or above the threshold.
# Using the exact complement of `below_threshold` guarantees that a column
# sitting exactly on the threshold is not silently left out of both groups.
not_drop_cols = missing_values[has_missing & ~below_threshold].index.tolist()
print("The rows with missing values of the following columns will be NOT dropped:", "\n", not_drop_cols)

In [None]:
# Gather the index labels of all training rows that have a missing value in
# at least one of the `drop_cols`. A set is used while collecting so that a
# row with gaps in several columns is only recorded once.
rows_to_drop = set()
for col in drop_cols:
    is_missing = X_train[col].isnull().to_numpy()
    rows_to_drop |= set(X_train.index[is_missing])

# `drop` expects a list-like of labels
rows_to_drop = list(rows_to_drop)

# Remove the rows from the features and from the target (same labels in
# both), then renumber both objects from 0 so they stay aligned.
X_train.drop(index=rows_to_drop, inplace=True)
y_train.drop(index=rows_to_drop, inplace=True)
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)

print("Dropped", init_length_X_train-len(X_train), "rows with missing values")

In [None]:
# Fill the remaining gaps in the training data.
#
# Strategy: for every column in `not_drop_cols`, replace each missing entry
# with a value drawn at random (with replacement, fixed seed) from the
# observed values of that same column.
for col in not_drop_cols:
    gaps = X_train[col].isnull()
    n_gaps = gaps.sum()

    # pool of observed (non-missing) values to draw from
    observed = X_train[col].dropna()
    replacements = observed.sample(n=n_gaps, replace=True, random_state=42).values

    # `.values` strips the sampled index so the assignment is positional
    X_train.loc[gaps, col] = replacements

    print(f"Imputed {n_gaps} values in column '{col}'", "\n")

In [None]:
# Look for redundancy in the training data.

# Rows that are exact copies of an earlier row are only counted here,
# not removed.
duplicate_flags = X_train.duplicated()
num_duplicate_rows = duplicate_flags.sum()
print(f"Number of identical rows in the dataset: {num_duplicate_rows}", "\n")

print("Are those correct or incorrect duplicates?", "\n")

# Remove the column that the message below describes as redundant.
X_train.drop("embark_town", axis=1, inplace=True)
print("Dropped 'embark_town' column as it is redundant to 'embarked' column")

### Encodings

In [None]:
# Ordinal encoding: categories that have a natural order.

# Columns to encode and, per column, the categories in the order in which
# the encoder should number them.
ordinal_cols = ['class']
ordinal_map = [['Third', 'Second', 'First']]

# Fit the encoder on the training data only, then replace the text labels
# with their numeric codes. The fitted encoder is kept for the test data.
ordinal_encoder = OrdinalEncoder(categories=ordinal_map)
ordinal_encoder.fit(X_train[ordinal_cols])
X_train[ordinal_cols] = ordinal_encoder.transform(X_train[ordinal_cols])

X_train

In [None]:
# One-hot encoding: categories without a natural order.

# columns to expand into indicator columns
nominal_cols = ['sex', 'embarked', 'who', 'deck']

# Let pandas create the indicator columns. `drop_first=True` omits the first
# category of every column, which then acts as the implicit baseline.
X_train = pd.get_dummies(data=X_train, columns=nominal_cols, drop_first=True)

X_train

### Scaling

In [None]:
# Bring the whole training set to a single numeric dtype.
X_train = X_train.astype('float64')

# Columns that are standardised.
columns_to_scale = ['age', 'fare']

# Learn the scaling parameters from the training data only; the fitted
# scaler is reused unchanged for the test data later on.
scaler = StandardScaler()
scaler.fit(X_train[columns_to_scale])
X_train[columns_to_scale] = scaler.transform(X_train[columns_to_scale])

X_train

### Feature engineering

In [None]:
# Derive two additional features from existing columns:
#   * family_size           - siblings/spouses + parents/children + the
#                             passenger; family size could be an indicator
#                             of survival
#   * age_class_interaction - product of "age" and "pclass"; could be an
#                             indicator of status and therefore survival
# NOTE(review): "age" has already been standardised in the Scaling cell, so
# the interaction is built from the scaled age - confirm this is intended.
X_train = X_train.assign(
    family_size=X_train["sibsp"] + X_train["parch"] + 1,
    age_class_interaction=X_train["age"] * X_train["pclass"],
)

X_train

### Processing the test data

We need to process the test data in the same way we processed the training data.

Otherwise the model will miss some of the artificial columns and values we created.

We have to be very careful to not leak any information from the test set into our model, or from the training set into the test set.

In [None]:
# Apply the preprocessing steps of the training data to the test data.
# Everything that was *learned* (encoder categories, scaler parameters,
# final column layout) is taken from the training objects; nothing is
# re-fitted on the test data.

# drop the redundant column
X_test.drop(columns=['embark_town'], inplace=True)

# impute missing values by random sampling from the observed values of the
# same test column
for col in X_test.columns:
    missing_mask_test = X_test[col].isnull()
    num_missing_test = missing_mask_test.sum()
    # columns without gaps need no imputation
    if num_missing_test > 0:
        sampled_values_test = X_test.loc[~missing_mask_test, col].sample(
            n=num_missing_test,
            replace=True,
            random_state=42
        ).values
        X_test.loc[missing_mask_test, col] = sampled_values_test

# ordinal encoding with the encoder fitted on the training data
X_test[ordinal_cols] = ordinal_encoder.transform(X_test[ordinal_cols])

# one-hot encoding
# All categories are expanded here (no `drop_first`): which category is
# "first" depends on the values present, so dropping it on the test data
# alone could remove a different baseline than in the training data.
X_test = pd.get_dummies(X_test, columns=nominal_cols)

# Align the test columns with the training layout: indicator columns that do
# not exist in the training data (baseline or unseen categories) are removed,
# columns missing in the test data are added as 0, and the column order is
# made identical. Engineered columns are filled with 0 here and computed below.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# convert the test data to float64 and reset the indices
X_test = X_test.astype('float64')
X_test.reset_index(drop=True, inplace=True)
# keep the target aligned with the features (as done for y_train)
y_test.reset_index(drop=True, inplace=True)

# scale the corresponding columns
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale]) # NO FIT-TRANSFORM HERE!

# add the engineered features to the test data
X_test["family_size"]  = X_test["sibsp"] + X_test["parch"] + 1
X_test["age_class_interaction"]  = X_test["age"] * X_test["pclass"]

X_test