In [10]:
import pandas as pd
import bisect
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

## Load and Split Data

In [19]:
def split_data(x, y, test_size):
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=test_size)
    return X_train, X_test, y_train, y_test

## Feature Engineering 

In [33]:
def get_features(X_train, X_test, y_train, y_test):
    # Min Max Scaler for age and flare
    age_scaler = MinMaxScaler()
    fare_scaler = MinMaxScaler()
    X_train['Age'] = age_scaler.fit_transform(X_train['Age'].values.reshape(-1,1))
    X_train['Fare'] = age_scaler.fit_transform(X_train['Fare'].values.reshape(-1,1))
    X_test['Age'] = age_scaler.transform(X_test['Age'].values.reshape(-1,1))
    X_test['Fare'] = age_scaler.transform(X_test['Fare'].values.reshape(-1,1))

    # Get features from cabin and ticket
    X_train['Cabin'] = list(map(lambda x: str([c for c in x][0]), X_train['Cabin']))
    X_train['Ticket'] = list(map(lambda x: [c for c in x if c.isalpha()], X_train['Ticket']))
    X_train['Ticket'] = list(map(lambda x: ''.join(x), X_train['Ticket']))
    X_test['Cabin'] = list(map(lambda x: str([c for c in x][0]), X_test['Cabin']))
    X_test['Ticket'] = list(map(lambda x: [c for c in x if c.isalpha()], X_test['Ticket']))
    X_test['Ticket'] = list(map(lambda x: ''.join(x), X_test['Ticket']))

    # Encode non_numeric features
    sex_encoder = LabelEncoder().fit(X_train['Sex'].values)
    ticket_encoder = LabelEncoder().fit(X_train['Ticket'].values)
    cabin_encoder = LabelEncoder().fit(X_train['Cabin'].values)
    embarked_encoder = LabelEncoder().fit(X_train['Embarked'].values)

    X_test['Sex'] = X_test['Sex'].map(lambda s: '<unknown>' if s not in sex_encoder.classes_ else s)
    X_test['Ticket'] = X_test['Ticket'].map(lambda s: '<unknown>' if s not in ticket_encoder.classes_ else s)
    X_test['Cabin'] = X_test['Cabin'].map(lambda s: '<unknown>' if s not in cabin_encoder.classes_ else s)
    X_test['Embarked'] = X_test['Embarked'].map(lambda s: '<unknown>' if s not in embarked_encoder.classes_ else s)

    sex_encoder.classes_ = np.append(sex_encoder.classes_, '<unknown>')
    ticket_encoder.classes_ = np.append(ticket_encoder.classes_, '<unknown>')
    cabin_encoder.classes_ = np.append(cabin_encoder.classes_, '<unknown>')
    embarked_encoder.classes_ = np.append(embarked_encoder.classes_, '<unknown>')

    X_train['Sex'] = sex_encoder.transform(X_train['Sex'].values)
    X_train['Ticket'] = ticket_encoder.transform(X_train['Ticket'].values)
    X_train['Cabin'] = cabin_encoder.transform(X_train['Cabin'].values)
    X_train['Embarked'] = embarked_encoder.transform(X_train['Embarked'].values)

    X_test['Sex'] = sex_encoder.transform(X_test['Sex'].values)
    X_test['Ticket'] = ticket_encoder.transform(X_test['Ticket'].values)
    X_test['Cabin'] = cabin_encoder.transform(X_test['Cabin'].values)
    X_test['Embarked'] = embarked_encoder.transform(X_test['Embarked'].values)

    #Transform to matrix and vectors
    X_test = X_test.values
    X_train = X_train.values
    y_test = y_test.values.reshape(-1,1)
    y_train = y_train.values.reshape(-1,1)

#     print('X train shape: ', X_train.shape, '-- y train shape: ', y_train.shape)
#     print('X test shape: ', X_test.shape, '-- y test shape: ', y_test.shape)
    
    encoders = {'age': age_scaler, 'fare': fare_scaler, 'sex': sex_encoder, 
                'ticket': ticket_encoder, 'cabin': cabin_encoder, 'embarked': embarked_encoder}
    
    train_data = [X_train, y_train]
    
    test_data = [X_test, y_test]
    
    return X_train, X_test, y_train, y_test, encoders

## Train Model

### Support Vector Classifier 

In [44]:
def cross_val_SVM(x, y, cross_vals = 5):
    kernels = ['linear', 'poly', 'rbf', 'sigmoid']
    results = []
    for k in kernels:
        precision = []
        recall = []
        for i in range(cross_vals):
            svc = SVC(kernel=k)
            X_train, X_test, y_train, y_test = split_data(x, y, 0.33)
            X_train, X_test, y_train, y_test, encoders = get_features(X_train, X_test, y_train, y_test)
            svc.fit(X_train,y_train)
            y_pred = svc.predict(X_test)
            report = classification_report(y_test,y_pred, output_dict=True)
            precision.append(report['macro avg']['precision'])
            recall.append(report['macro avg']['recall'])
        results.append({'kernel': k, 'avg_prec': sum(precision)/len(precision), 'avg_recall': sum(recall)/len(recall),
                       'prec': precision, 'recall': recall})
        
    return pd.DataFrame(results)        

## Test

In [46]:
data = pd.read_csv('train.csv')
data.drop(columns = ['PassengerId', 'Name'], inplace = True)
data.fillna('-1', inplace = True)
y = data['Survived']
x = data.drop(columns = ['Survived'])
svm_results = cross_val_SVM(x, y, cross_vals = 20)
display(svm_results)

Unnamed: 0,kernel,avg_prec,avg_recall,prec,recall
0,linear,0.784648,0.775815,"[0.8084893048128342, 0.7912303664921466, 0.746...","[0.8102372034956304, 0.7842751842751843, 0.733..."
1,poly,0.703168,0.634138,"[0.6983216092805133, 0.6441016624040921, 0.739...","[0.6557623340118252, 0.5436900261703984, 0.743..."
2,rbf,0.769309,0.758171,"[0.7322970639032815, 0.8066846465035639, 0.788...","[0.7273314774309008, 0.8086168459823592, 0.768..."
3,sigmoid,0.637074,0.62892,"[0.6136603375527426, 0.6802337450239869, 0.619...","[0.5976389448248087, 0.6695716892346106, 0.611..."


In [43]:
svm_df = pd.DataFrame(svm_results)
display(svm_df)

Unnamed: 0,kernel,avg_prec,avg_recall,prec,recall
0,linear,0.780856,0.772941,"[0.7641242937853108, 0.7847489395284601, 0.792...","[0.7673500048463702, 0.7780292814486611, 0.780..."
1,poly,0.714385,0.61772,"[0.7735826001955034, 0.6865997381056308, 0.755...","[0.6060835781294418, 0.6239130434782609, 0.632..."
2,rbf,0.76643,0.752712,"[0.7302847464803641, 0.7982169629470979, 0.761...","[0.7210200142233059, 0.77081104735972, 0.75051..."
3,sigmoid,0.653211,0.65189,"[0.66489703989704, 0.649221278317152, 0.666461...","[0.6798245614035088, 0.6385836385836385, 0.672..."
