# Titanic - A Kaggle Submission
## By Michael Neiman

## Data Import:

In [16]:
import pandas as pd

# Load both Kaggle splits; row 0 of each CSV supplies the column names.
X_train, X_test = (
    pd.read_csv(csv_path, header=0)
    for csv_path in ('titanic_train.csv', 'titanic_test.csv')
)

## 1. Data Preprocessing

In [17]:
# Preprocess on copies so the frames loaded from disk are left as they were read.
X_train_preprocessed, X_test_preprocessed = X_train.copy(), X_test.copy()

### 1.1. Parsing data columns:

The "cabin" column actually comprises two pieces of data which may be relevant: the deck and the cabin number. In some cases there seem to be multiple cabins listed for one passenger. In such cases I will average the cabin numbers, as they tend to be close to one another.

In [22]:
from sklearn.preprocessing import OrdinalEncoder
# Module-level encoder shared by encode_decks() and the 'embarked' encoding cell.
# NOTE(review): every fit/fit_transform call replaces the categories it learned before.
ordinal_encoder = OrdinalEncoder()

def mean_of_identically_named_columns(df: pd.DataFrame):
    """Collapse columns that share a name into a single column per name.

    For every distinct column name in *df*:

    * if all columns with that name are numeric, the result column is their
      row-wise mean (missing values are skipped by ``DataFrame.mean``);
    * otherwise the first column with that name is kept unchanged.

    Args:
        df: Frame that may contain several columns with the same label.

    Returns:
        A new DataFrame with one column per distinct name, in sorted name
        order and indexed like *df*.
    """
    merged = {}
    # Sorted order mirrors what a default (sort=True) groupby over the column
    # labels would produce.
    for name in sorted(df.columns.unique()):
        # A boolean mask selects every duplicate as a DataFrame, even when
        # the name occurs only once.
        same_named = df.loc[:, df.columns == name]
        # Decide by dtype rather than by inspecting the first cell: a text
        # column whose first entry is missing (NaN is a float) must not be
        # averaged, and an empty frame has no first cell to inspect.
        if all(pd.api.types.is_numeric_dtype(dtype) for dtype in same_named.dtypes):
            merged[name] = same_named.mean(axis=1)
        else:
            merged[name] = same_named.iloc[:, 0]
    return pd.DataFrame(merged, index=df.index)

def encode_decks(cabin_data: pd.DataFrame):
    """Return a copy of *cabin_data* whose 'deck' column is ordinally encoded.

    Uses the module-level ``ordinal_encoder``: the first call fits it on the
    decks it is given, later calls reuse the already-fitted categories.

    Args:
        cabin_data: Frame with a 'deck' column (any other columns are kept).

    Returns:
        A new DataFrame equal to *cabin_data* except that 'deck' holds the
        encoder's numeric codes.
    """
    # The encoder expects 2-D input (samples x features); selecting with a
    # list yields a one-column DataFrame instead of a 1-D Series.
    decks = cabin_data[['deck']]
    # `categories` is the constructor argument (default 'auto', always truthy),
    # so it cannot tell whether the encoder was fitted. The fitted attribute
    # `categories_` only exists after fit().
    if hasattr(ordinal_encoder, 'categories_'):
        encoded = ordinal_encoder.transform(decks)
    else:
        encoded = ordinal_encoder.fit_transform(decks)
    # Hand back a DataFrame (not the bare encoded array) so callers can keep
    # using the remaining columns; copy to leave the caller's frame untouched.
    cabin_data = cabin_data.copy()
    cabin_data['deck'] = encoded.ravel()
    return cabin_data

def get_deck_and_cabin_number(data: pd.DataFrame):
    """Replace the 'cabin' column of *data* with 'deck' and 'cabin_number'.

    A 'cabin' entry may list several space-separated cabins. Each one is split
    into a deck (leading letters) and a number (the rest), the deck is passed
    through ``encode_decks``, and the per-cabin values are combined with
    ``mean_of_identically_named_columns``.

    Args:
        data: Frame with a string 'cabin' column. It is modified in place.

    Returns:
        The same *data* object, with 'deck' and 'cabin_number' added and
        'cabin' removed.
    """
    # One column per cabin listed for a passenger.
    possible_cabins = data['cabin'].str.split(" ", expand=True)

    per_cabin_frames = []
    # Iterate over the column labels directly; unpacking enumerate() into a
    # variable named `data` would overwrite the function argument.
    for column in possible_cabins.columns:
        cabin_data = possible_cabins[column].str.extract(
            '([a-zA-Z]+)([^a-zA-Z]+)', expand=True
        )
        cabin_data.columns = ['deck', 'number']
        cabin_data = encode_decks(cabin_data)
        cabin_data['number'] = cabin_data['number'].astype(float)
        per_cabin_frames.append(cabin_data)

    cabins_data = mean_of_identically_named_columns(
        pd.concat(per_cabin_frames, axis=1)
    )
    # Assign by name so the result does not depend on the helper's column order.
    data['deck'] = cabins_data['deck']
    data['cabin_number'] = cabins_data['number']
    del data['cabin']
    # Callers rebind their variable to the result, so the frame must be returned.
    return data

# Split 'cabin' into deck / cabin number for both data sets.
# The training frame goes first: encode_decks() is written to fit the shared
# encoder on its first call and reuse it afterwards
# (NOTE(review): confirm this ordering is the intent).
X_train_preprocessed = get_deck_and_cabin_number(X_train_preprocessed)
X_test_preprocessed = get_deck_and_cabin_number(X_test_preprocessed)
# Last expression of the cell: shows the first rows of the processed training frame.
X_train_preprocessed.head()

KeyboardInterrupt: 

### 1.2. Encoding categorical values:

In [None]:
# Ordinal encoding for the port of embarkation data:

cols_ordinal = ['embarked']
# OrdinalEncoder works on 2-D input (samples x features), so the columns are
# selected with a list to get a DataFrame rather than a 1-D Series.
# The encoder is fitted on the training data only and then applied to the
# test data, so both use the same category-to-code mapping.
# NOTE(review): this refits the shared `ordinal_encoder`, replacing whatever
# categories it learned earlier (e.g. the decks in encode_decks).
X_train_preprocessed[cols_ordinal] = ordinal_encoder.fit_transform(X_train[cols_ordinal])
X_test_preprocessed[cols_ordinal] = ordinal_encoder.transform(X_test[cols_ordinal])