In [85]:
import pandas as pd
import bisect
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

## Load and Split Data

In [105]:
# Load the training table and drop the PassengerId and Name columns.
data = pd.read_csv('train.csv').drop(columns=['PassengerId', 'Name'])

# Fill missing values with a sentinel whose type matches the column:
# numeric columns get the number -1, every other column gets the string '-1'.
# Filling everything with the string '-1' would turn numeric columns that
# contain NaN into object columns holding a mix of floats and strings.
fill_values = {
    col: -1 if pd.api.types.is_numeric_dtype(data[col]) else '-1'
    for col in data.columns
}
data = data.fillna(fill_values)

# Separate the target from the features and hold out 33% of the rows for
# testing; random_state is fixed so the split is reproducible.
y = data['Survived']
x = data.drop(columns=['Survived'])
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

## Feature Engineering 

In [106]:
# Min-max scale Age and Fare.
# Each column gets its own scaler, fitted on the training split only and then
# applied unchanged to the test split. (Re-fitting a single scaler on Fare and
# then using it for Age would scale the test ages with the fare range.)
age_scaler = MinMaxScaler()
fare_scaler = MinMaxScaler()
for _col, _scaler in (('Age', age_scaler), ('Fare', fare_scaler)):
    # MinMaxScaler expects a 2-D input, hence reshape(-1, 1); ravel() turns the
    # (n, 1) result back into a flat column.
    X_train[_col] = _scaler.fit_transform(X_train[_col].values.reshape(-1, 1)).ravel()
    X_test[_col] = _scaler.transform(X_test[_col].values.reshape(-1, 1)).ravel()

# Derive coarse categorical features from Cabin and Ticket.
# Cabin  -> its first character.
# Ticket -> only its alphabetic characters, concatenated in order
#           (an empty string when the ticket contains no letters).
for _split in (X_train, X_test):
    _split['Cabin'] = [str(cabin[0]) for cabin in _split['Cabin']]
    _split['Ticket'] = [
        ''.join(ch for ch in ticket if ch.isalpha())
        for ticket in _split['Ticket']
    ]

# Encode non-numeric features as integer labels.
# Encoders are fitted on the training split only, so the test split may
# contain values the encoder has never seen.
sex_encoder = LabelEncoder().fit(X_train['Sex'].values)
ticket_encoder = LabelEncoder().fit(X_train['Ticket'].values)
cabin_encoder = LabelEncoder().fit(X_train['Cabin'].values)
embarked_encoder = LabelEncoder().fit(X_train['Embarked'].values)

_encoders = {
    'Sex': sex_encoder,
    'Ticket': ticket_encoder,
    'Cabin': cabin_encoder,
    'Embarked': embarked_encoder,
}

for _col, _encoder in _encoders.items():
    # Replace test values that were not seen during fitting with a placeholder.
    _seen = X_test[_col].isin(_encoder.classes_)
    X_test[_col] = X_test[_col].where(_seen, '<unknown>')

    # Register the placeholder as an extra class so transform() accepts it.
    # NOTE(review): appending leaves classes_ unsorted; this relies on
    # LabelEncoder.transform using a lookup table for object-dtype labels --
    # confirm for the installed scikit-learn version.
    _encoder.classes_ = np.append(_encoder.classes_, '<unknown>')

    X_train[_col] = _encoder.transform(X_train[_col].values)
    X_test[_col] = _encoder.transform(X_test[_col].values)

# Convert the feature frames to NumPy matrices and the targets to (n, 1) column vectors.
# Note: this rebinds X_train / X_test / y_train / y_test from pandas objects to arrays.
X_train, X_test = X_train.values, X_test.values
y_train, y_test = (target.values.reshape(-1, 1) for target in (y_train, y_test))

# Report the resulting shapes as a quick sanity check.
print('X train shape: ', X_train.shape, '-- y train shape: ', y_train.shape)
print('X test shape: ', X_test.shape, '-- y test shape: ', y_test.shape)

X train shape:  (596, 9) -- y train shape:  (596, 1)
X test shape:  (295, 9) -- y test shape:  (295, 1)
