# Titanic prediction - competition

In [2]:
import numpy as np
import pandas as pd

# Set seed to always be used
seed = 123


def preprocess(train, test):
    train['test_train'] = 'train'
    test['test_train'] = 'test'
    data = pd.concat([test,train], sort = True)
    data['Cabin_dep'] = [cabin_no[0] for cabin_no in data['Cabin'].astype("str") if cabin_no.lower() != "nan"]
    data['Cabin_dep'] = data['Cabin_dep'].astype('category')
    
    data = data.drop(['Cabin', 'Name',], axis = 1)
    
    ### Fix the age variable
    
    from matplotlib.pyplot import hist
    import matplotlib.pyplot as plt

    from scipy.stats import gaussian_kde
    hist(data['Age'].dropna(), density = True)

    from scipy.stats import poisson

    dens = gaussian_kde(data['Age'].dropna())
    x = np.arange(0,data['Age'].dropna().max())
    plt.plot(x, dens.evaluate(x), 'r')

    dist = dens.evaluate(x)
    # Normalize

    dist = np.divide(dist,np.sum(dist))

    # We should sample out of this distribution to compensate. 
    np.random.seed(seed)
    nan_ages = np.random.choice(x, p = dist, size = data['Age'].isnull().sum(), )

    count = 0
    for i in range(data.shape[0]):
        if np.isnan(data['Age'][i]):
            data['Age'][i] = nan_ages[count]
            count += 1
            
    data['Cabin'] = data['Cabin'].fillna("Unknown")
    
    
    # Done processing. Return it
    test, train = [x for _, x in data.groupby(data['train_test'] == "train")]
    test.drop(['train_test'],axis=1)
    train.drop(['train_test'],axis=1)
    # return data
    
    return train, test
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
data = pd.concat([test,train], sort = False)
data = data.reset_index()
#train, test = preprocess(train, test)


  return f(*args, **kwds)
  return f(*args, **kwds)


In [3]:
print(data['Cabin'].isnull().sum())

1014


In [4]:
data['Cabin_dep'] = [cabin_no[0] for cabin_no in data['Cabin'].astype("str")]
#data['Cabin_dep'] = [None for cabin_no in data['Cabin'].astype("str")]
data['Cabin_dep'] = data['Cabin_dep'].astype("category")

## Fixing age variable

As we saw, we have many ages that are NaN. To compensate for this, we simulate the distribution and obtain new samples. Make sure to use seed.  

In [5]:
from matplotlib.pyplot import hist
import matplotlib.pyplot as plt

from scipy.stats import gaussian_kde
hist(data['Age'].dropna(), density = True)

from scipy.stats import poisson

dens = gaussian_kde(data['Age'].dropna())
x = np.arange(0,data['Age'].dropna().max())
plt.plot(x, dens.evaluate(x), 'r')

dist = dens.evaluate(x)
# Normalize

dist = np.divide(dist,np.sum(dist))

# We should sample out of this distribution to compensate. 
np.random.seed(seed)
nan_ages = np.random.choice(x, p = dist, size = data['Age'].isnull().sum(), )

count = 0
#print(data)
for i in range(data.shape[0]):
    if np.isnan(data['Age'][i]):
        data['Age'][i] = nan_ages[count]
        count += 1
        
print(data['Age'].isnull().sum())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0


In [7]:
data.isnull().sum()

index             0
PassengerId       0
Pclass            0
Name              0
Sex               0
Age               0
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
Survived        418
Cabin_dep         0
dtype: int64

Okay, so we need to fix Fare as well. Just take the mean here. 

In [8]:
data['Fare'] = data['Fare'].fillna(data['Fare'].dropna().mean())

In [9]:


data['Sex'] = data['Sex'].astype('str')
data['Sex'] = data['Sex'].replace({'male':1, 'female':-1})

# Drop unnecessary frames not interesting. 

data = data.drop(['Name','Ticket','Cabin'], axis = 1)

In [11]:
train, test = [x for _, x in data.groupby(data['Survived'].isnull())]
print(train.head())
test.head()

     index  PassengerId  Pclass  Sex   Age  SibSp  Parch     Fare Embarked  \
418      0            1       3    1  22.0      1      0   7.2500        S   
419      1            2       1   -1  38.0      1      0  71.2833        C   
420      2            3       3   -1  26.0      0      0   7.9250        S   
421      3            4       1   -1  35.0      1      0  53.1000        S   
422      4            5       3    1  35.0      0      0   8.0500        S   

     Survived Cabin_dep  
418       0.0         n  
419       1.0         C  
420       1.0         n  
421       1.0         C  
422       0.0         n  


Unnamed: 0,index,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived,Cabin_dep
0,0,892,3,1,34.5,0,0,7.8292,Q,,n
1,1,893,3,-1,47.0,1,0,7.0,S,,n
2,2,894,2,1,62.0,0,0,9.6875,Q,,n
3,3,895,3,1,27.0,0,0,8.6625,S,,n
4,4,896,3,-1,22.0,1,1,12.2875,S,,n


## Data processing done - time for modelling

First off, we simply try a ridge regression with an rbf kernel and see its performance. This is kind of unchristly as it is actually regression, but it might actually perform well. 

In [12]:
# Fix features

X = train.drop(['index', 'PassengerId','Survived', 'Embarked', 'Cabin_dep'], axis = 1)
y = train['Survived']

In [10]:
from sklearn.kernel_ridge import KernelRidge

kern_ridge = KernelRidge(alpha=1.0, kernel = 'rbf')



print(kern_ridge.get_params())

params = {'alpha': [0.01,0.1, 1.0],
 'coef0': [0,0.1,1],
 'degree': [1,2,3],
 'gamma': [0.01,1,10],
 'kernel': ['rbf']
}

from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(estimator=kern_ridge, param_grid = params, n_jobs = 1, iid=False,verbose=True, scoring = 'neg_mean_squared_error', cv = 5)
grid_search.fit(X,y)
print("Best params")
print(grid_search.best_params_)


{'alpha': 1.0, 'coef0': 1, 'degree': 3, 'gamma': None, 'kernel': 'rbf', 'kernel_params': None}
Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 

Now, use the best hyperparameters obtained. 

In [None]:
from sklearn.model_selection import cross_val_score
best_model = KernelRidge(alpha=0.1, coef0=0, degree=1, gamma=0.01, kernel = 'rbf')

In [None]:
best_model.fit(X,y)

y_preds = best_model.predict(X)
preds = []
for i in range(len(y_preds)):
    if y_preds[i] < 0.5:
        preds.append(0)
    else:
        preds.append(1)

from sklearn.metrics import confusion_matrix

confusion_matrix(y,preds)



Pretty good. Now lets try logistic regression. 

In [None]:
from sklearn.linear_model import LogisticRegression

log_reg_model = LogisticRegression()

print(log_reg_model.get_params())
X = train.drop(['index', 'PassengerId','Survived', 'Embarked', 'Cabin_dep'], axis = 1)
y = y.astype("category")
log_reg_params = {'C': [0.1,1.0,10,100], 
                  'class_weight': [None], 
                  'dual': [None], 
                  'fit_intercept': [True,False], 
                  'max_iter': [10000], 
                  'multi_class': ['ovr'], 
                  'n_jobs': [1],
                  'penalty': ['l1','l2'],
                  'random_state': [123],  
                  'tol': [0.0001, 0.0005,0.001], 
                  'solver':['saga'],
                  'warm_start': [False]}

grid_search = GridSearchCV(estimator=log_reg_model, param_grid = log_reg_params, n_jobs = 1, iid=False,verbose=True, cv = 5)
grid_search.fit(X,y)


In [None]:
print(grid_search.best_params_)

In [None]:
best_log_reg = LogisticRegression(C=10, fit_intercept=False,max_iter=10000, multi_class="ovr",n_jobs=1,penalty="l1", random_state=123, solver="saga", tol=0.0005, warm_start=False)

best_log_reg.fit(X,y)
log_reg_preds = best_log_reg.predict(X)
confusion_matrix(y, log_reg_preds)


So Logistic regression performed worse. Let's choose the Ridge Regression for the classification. 

In [None]:
X_test = test.drop(['index', 'PassengerId','Survived', 'Embarked', 'Cabin_dep'], axis = 1)
X_test.head()
y_preds_test = best_model.predict(X_test)
preds_test = []
for i in range(len(y_preds_test)):
    if y_preds_test[i] < 0.5:
        preds_test.append(0)
    else:
        preds_test.append(1)

In [None]:
submission = pd.concat([test['PassengerId'], pd.Series(preds_test)], axis=1)
submission.columns = ['PassengerId', 'Survived']
submission.head()
submission.to_csv("submission.csv", index = False)

### XGBoost 

Let's try XGBoost instead. The score, without tuning hyperparameters or any cross-validation, resulted in 0.77, quite a good score. 

In [13]:
train_dummies = pd.get_dummies(train)
test_dummies = pd.get_dummies(test)

X_train = train_dummies.drop(['Survived'], axis = 1)
y_train = train_dummies['Survived']
X_test = test_dummies.drop(['Survived'], axis = 1)

In [13]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier()

xgb_model.fit(X_train, y_train)
preds_train = xgb_model.predict(X_train)
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_train, preds_train))

[[520  29]
 [ 77 265]]


In [14]:
preds_test = xgb_model.predict(X_test)

submission = pd.concat([test['PassengerId'], pd.Series(preds_test)], axis=1)
submission.columns = ['PassengerId', 'Survived']
submission.head()
submission.to_csv("submission_xgb.csv", index = False)

#### Tuning the parameters of XGBoost

Let's tune the parameters for XGBoost and see how well it performs. 

In [18]:
from sklearn.model_selection import GridSearchCV

params = {
    'colsample_bytree':[0.4,0.7],
    'gamma':[0.01,0.5,0.9],
    'min_child_weight':[1,3],
    'learning_rate':[0.01,0.1,1],
    'max_depth':[3,4,5],
    'n_estimators':[500],
    'reg_alpha':[1e-5, 0.1],
    'reg_lambda':[1e-5, 0.1],
    'subsample':[0.8]
}

grid_search = GridSearchCV(estimator = xgb_model, param_grid = params,n_jobs=4, cv=5, iid=False,verbose=True, scoring = 'neg_mean_squared_error')

print("Fitting model...")
grid_search.fit(X_train, y_train)
print("Model fitted")
print("Best score: ")
print(grid_search.best_score_)
print("Best model: ")
print(grid_search.best_params_)


Fitting model...
Fitting 5 folds for each of 432 candidates, totalling 2160 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    6.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   20.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   59.2s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  1.8min
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:  2.6min
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:  3.9min
[Parallel(n_jobs=4)]: Done 2160 out of 2160 | elapsed:  4.7min finished


Model fitted
Best score: 
-0.1873285109832764
Best model: 
{'colsample_bytree': 0.7, 'gamma': 0.5, 'learning_rate': 0.01, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 500, 'reg_alpha': 0.1, 'reg_lambda': 1e-05, 'subsample': 0.8}


Let's use the best model and predict and submit it. 

In [22]:
xgb_best_model = XGBClassifier(colsample_bytree=0.7, gamma = 0.5, learning_rate = 0.01, max_depth=5, min_child_weight=3, n_estimators = 500, reg_alpha = 0.1, reg_lambda = 1e-05, subsample = 0.8)
xgb_best_model.fit(X_train, y_train)
preds_train = xgb_best_model.predict(X_train)

print(confusion_matrix(y_train, preds_train))

preds_test = xgb_best_model.predict(X_test)

submission = pd.concat([test['PassengerId'], pd.Series(preds_test).astype("int")], axis=1)
submission.columns = ['PassengerId', 'Survived']
submission.head()
submission.to_csv("submission_xgb.csv", index = False)

[[527  22]
 [ 65 277]]


Tuning as above led to an improvement of 2 %. Not much, but still something. 

## Gaussian process

Let's try gaussian processes instead. We use the same features as previously, i.e. X_train, y_train and X_test

In [16]:
# Again, same. 

train_dummies = pd.get_dummies(train)
test_dummies = pd.get_dummies(test)

X_train = train_dummies.drop(['Survived'], axis = 1)
y_train = train_dummies['Survived']
X_test = test_dummies.drop(['Survived'], axis = 1)

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

GaussianProcessClassifier().get_params()

{'copy_X_train': True,
 'kernel': None,
 'max_iter_predict': 100,
 'multi_class': 'one_vs_rest',
 'n_jobs': None,
 'n_restarts_optimizer': 0,
 'optimizer': 'fmin_l_bfgs_b',
 'random_state': None,
 'warm_start': False}

In [24]:


from sklearn.model_selection import GridSearchCV

params = {'copy_X_train': [True],
 'kernel': [RBF(1.0),RBF(0.01),RBF(0.001),RBF(10), RBF(5), RBF(3), RBF(2)],
 'max_iter_predict': [100],
 'multi_class': ['one_vs_rest'],
 'n_jobs': [3],
 'n_restarts_optimizer': [0],
 'optimizer': ['fmin_l_bfgs_b'],
 'random_state': [123],
 'warm_start': [False]
         }

grid_search = GridSearchCV(estimator = GaussianProcessClassifier(), param_grid = params,n_jobs=4, cv=5, iid=False,verbose=True, scoring = 'balanced_accuracy')

print("Fitting model...")
grid_search.fit(X_train, y_train)
print("Model fitted")
print("Best score: ")
print(grid_search.best_score_)
print("Best model: ")
print(grid_search.best_params_)


Fitting model...
Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  35 out of  35 | elapsed:   35.3s finished


Model fitted
Best score: 
0.5357102999302488
Best model: 
{'copy_X_train': True, 'kernel': RBF(length_scale=3), 'max_iter_predict': 100, 'multi_class': 'one_vs_rest', 'n_jobs': 3, 'n_restarts_optimizer': 0, 'optimizer': 'fmin_l_bfgs_b', 'random_state': 123, 'warm_start': False}


In [25]:
best_gauss_process = GaussianProcessClassifier(kernel=RBF(3)).fit(X_train,y_train)
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_train,best_gauss_process.predict(X_train)))

[[519  30]
 [241 101]]


Well, the RBF kernel was horrible. 

In [26]:
X_train

Unnamed: 0,index,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Cabin_dep_A,Cabin_dep_B,Cabin_dep_C,Cabin_dep_D,Cabin_dep_E,Cabin_dep_F,Cabin_dep_G,Cabin_dep_T,Cabin_dep_n
418,0,1,3,1,22.0,1,0,7.2500,0,0,1,0,0,0,0,0,0,0,0,1
419,1,2,1,-1,38.0,1,0,71.2833,1,0,0,0,0,1,0,0,0,0,0,0
420,2,3,3,-1,26.0,0,0,7.9250,0,0,1,0,0,0,0,0,0,0,0,1
421,3,4,1,-1,35.0,1,0,53.1000,0,0,1,0,0,1,0,0,0,0,0,0
422,4,5,3,1,35.0,0,0,8.0500,0,0,1,0,0,0,0,0,0,0,0,1
423,5,6,3,1,24.0,0,0,8.4583,0,1,0,0,0,0,0,0,0,0,0,1
424,6,7,1,1,54.0,0,0,51.8625,0,0,1,0,0,0,0,1,0,0,0,0
425,7,8,3,1,2.0,3,1,21.0750,0,0,1,0,0,0,0,0,0,0,0,1
426,8,9,3,-1,27.0,0,2,11.1333,0,0,1,0,0,0,0,0,0,0,0,1
427,9,10,2,-1,14.0,1,0,30.0708,1,0,0,0,0,0,0,0,0,0,0,1


### SVM

In [48]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score


svm_model = SVC(C=0.0001, gamma="auto", kernel = "linear").fit(X_train, y_train)
cross_val_score(svm_model, X_train, y_train, cv = 5)
#print(confusion_matrix(y_train, svm_model.predict(X_train)))

array([0.60893855, 0.72625698, 0.65730337, 0.6741573 , 0.68361582])

In [None]:
y_preds = svm_model.predict(X_test)

In [29]:
preds_test = svm_model.predict(X_test)

submission = pd.concat([test['PassengerId'], pd.Series(preds_test).astype("int")], axis=1)
submission.columns = ['PassengerId', 'Survived']
submission.head()
submission.to_csv("submission_svm.csv", index = False)

(891, 20)