# Titanic prediction - competition

In [1]:
import numpy as np
import pandas as pd

# Set seed to always be used
seed = 123


def preprocess(train, test):
    train['test_train'] = 'train'
    test['test_train'] = 'test'
    data = pd.concat([test,train], sort = True)
    data['Cabin_dep'] = [cabin_no[0] for cabin_no in data['Cabin'].astype("str") if cabin_no.lower() != "nan"]
    data['Cabin_dep'] = data['Cabin_dep'].astype('category')
    
    data = data.drop(['Cabin', 'Name',], axis = 1)
    
    ### Fix the age variable
    
    from matplotlib.pyplot import hist
    import matplotlib.pyplot as plt

    from scipy.stats import gaussian_kde
    hist(data['Age'].dropna(), density = True)

    from scipy.stats import poisson

    dens = gaussian_kde(data['Age'].dropna())
    x = np.arange(0,data['Age'].dropna().max())
    plt.plot(x, dens.evaluate(x), 'r')

    dist = dens.evaluate(x)
    # Normalize

    dist = np.divide(dist,np.sum(dist))

    # We should sample out of this distribution to compensate. 
    np.random.seed(seed)
    nan_ages = np.random.choice(x, p = dist, size = data['Age'].isnull().sum(), )

    count = 0
    for i in range(data.shape[0]):
        if np.isnan(data['Age'][i]):
            data['Age'][i] = nan_ages[count]
            count += 1
            
    data['Cabin'] = data['Cabin'].fillna("Unknown")
    
    
    # Done processing. Return it
    test, train = [x for _, x in data.groupby(data['train_test'] == "train")]
    test.drop(['train_test'],axis=1)
    train.drop(['train_test'],axis=1)
    # return data
    
    return train, test
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
data = pd.concat([test,train], sort = False)
data = data.reset_index()
#train, test = preprocess(train, test)


In [2]:
print(data['Cabin'].isnull().sum())

1014


In [3]:
data['Cabin_dep'] = [cabin_no[0] for cabin_no in data['Cabin'].astype("str")]
#data['Cabin_dep'] = [None for cabin_no in data['Cabin'].astype("str")]
data['Cabin_dep'] = data['Cabin_dep'].astype("category")

## Fixing age variable

As we saw, we have many ages that are NaN. To compensate for this, we simulate the distribution and obtain new samples. Make sure to use seed.  

In [4]:
from matplotlib.pyplot import hist
import matplotlib.pyplot as plt

from scipy.stats import gaussian_kde
hist(data['Age'].dropna(), density = True)

from scipy.stats import poisson

dens = gaussian_kde(data['Age'].dropna())
x = np.arange(0,data['Age'].dropna().max())
plt.plot(x, dens.evaluate(x), 'r')

dist = dens.evaluate(x)
# Normalize

dist = np.divide(dist,np.sum(dist))

# We should sample out of this distribution to compensate. 
np.random.seed(seed)
nan_ages = np.random.choice(x, p = dist, size = data['Age'].isnull().sum(), )

count = 0
#print(data)
for i in range(data.shape[0]):
    if np.isnan(data['Age'][i]):
        data['Age'][i] = nan_ages[count]
        count += 1
        
print(data['Age'].isnull().sum())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0


In [5]:
data.isnull().sum()

index             0
PassengerId       0
Pclass            0
Name              0
Sex               0
Age               0
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
Survived        418
Cabin_dep         0
dtype: int64

Okay, so we need to fix Fare as well. Just take the mean here. 

In [6]:
data['Fare'] = data['Fare'].fillna(data['Fare'].dropna().mean())

In [7]:


data['Sex'] = data['Sex'].astype('str')
data['Sex'] = data['Sex'].replace({'male':1, 'female':-1})

# Drop unnecessary frames not interesting. 

data = data.drop(['Name','Ticket','Cabin'], axis = 1)

In [8]:
train, test = [x for _, x in data.groupby(data['Survived'].isnull())]
print(train.head())
test.head()

     index  PassengerId  Pclass  Sex   Age  SibSp  Parch     Fare Embarked  \
418      0            1       3    1  22.0      1      0   7.2500        S   
419      1            2       1   -1  38.0      1      0  71.2833        C   
420      2            3       3   -1  26.0      0      0   7.9250        S   
421      3            4       1   -1  35.0      1      0  53.1000        S   
422      4            5       3    1  35.0      0      0   8.0500        S   

     Survived Cabin_dep  
418       0.0         n  
419       1.0         C  
420       1.0         n  
421       1.0         C  
422       0.0         n  


Unnamed: 0,index,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived,Cabin_dep
0,0,892,3,1,34.5,0,0,7.8292,Q,,n
1,1,893,3,-1,47.0,1,0,7.0,S,,n
2,2,894,2,1,62.0,0,0,9.6875,Q,,n
3,3,895,3,1,27.0,0,0,8.6625,S,,n
4,4,896,3,-1,22.0,1,1,12.2875,S,,n


## Data processing done - time for modelling

First off, we simply try a ridge regression with an rbf kernel and see its performance. This is kind of unchristly as it is actually regression, but it might actually perform well. 

In [9]:
from sklearn.kernel_ridge import KernelRidge

kern_ridge = KernelRidge(alpha=1.0, kernel = 'rbf')

X = train.drop(['index', 'PassengerId','Survived', 'Embarked', 'Cabin_dep'], axis = 1)
y = train['Survived']

print(kern_ridge.get_params())

params = {'alpha': [0.01,0.1, 1.0],
 'coef0': [0,0.1,1],
 'degree': [1,2,3],
 'gamma': [0.01,1,10],
 'kernel': ['rbf']
}

from sklearn.grid_search import GridSearchCV

grid_search = GridSearchCV(estimator=kern_ridge, param_grid = params, n_jobs = 1, iid=False,verbose=True, scoring = 'neg_mean_squared_error', cv = 5)
grid_search.fit(X,y)
print("Best params")
print(grid_search.best_params_)


{'alpha': 1.0, 'coef0': 1, 'degree': 3, 'gamma': None, 'kernel': 'rbf', 'kernel_params': None}
Fitting 5 folds for each of 81 candidates, totalling 405 fits




Best params
{'alpha': 0.1, 'coef0': 0, 'degree': 1, 'gamma': 0.01, 'kernel': 'rbf'}


[Parallel(n_jobs=1)]: Done 405 out of 405 | elapsed:   20.9s finished


Now, use the best hyperparameters obtained. 

In [10]:
from sklearn.model_selection import cross_val_score
best_model = KernelRidge(alpha=0.1, coef0=0, degree=1, gamma=0.01, kernel = 'rbf')

In [11]:
best_model.fit(X,y)

y_preds = best_model.predict(X)
preds = []
for i in range(len(y_preds)):
    if y_preds[i] < 0.5:
        preds.append(0)
    else:
        preds.append(1)

from sklearn.metrics import confusion_matrix

confusion_matrix(y,preds)



array([[505,  44],
       [ 55, 287]])

Pretty good. Now lets try logistic regression. 

In [12]:
from sklearn.linear_model import LogisticRegression

log_reg_model = LogisticRegression()

print(log_reg_model.get_params())
X = train.drop(['index', 'PassengerId','Survived', 'Embarked', 'Cabin_dep'], axis = 1)
y = y.astype("category")
log_reg_params = {'C': [0.1,1.0,10,100], 
                  'class_weight': [None], 
                  'dual': [None], 
                  'fit_intercept': [True,False], 
                  'max_iter': [10000], 
                  'multi_class': ['ovr'], 
                  'n_jobs': [1],
                  'penalty': ['l1','l2'],
                  'random_state': [123],  
                  'tol': [0.0001, 0.0005,0.001], 
                  'solver':['saga'],
                  'warm_start': [False]}

grid_search = GridSearchCV(estimator=log_reg_model, param_grid = log_reg_params, n_jobs = 1, iid=False,verbose=True, cv = 5)
grid_search.fit(X,y)


{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'max_iter': 100, 'multi_class': 'ovr', 'n_jobs': 1, 'penalty': 'l2', 'random_state': None, 'solver': 'liblinear', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed:  1.1min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=False, n_jobs=1,
       param_grid={'C': [0.1, 1.0, 10, 100], 'class_weight': [None], 'dual': [None], 'fit_intercept': [True, False], 'max_iter': [10000], 'multi_class': ['ovr'], 'n_jobs': [1], 'penalty': ['l1', 'l2'], 'random_state': [123], 'tol': [0.0001, 0.0005, 0.001], 'solver': ['saga'], 'warm_start': [False]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=True)

In [13]:
print(grid_search.best_params_)

{'C': 10, 'class_weight': None, 'dual': None, 'fit_intercept': False, 'max_iter': 10000, 'multi_class': 'ovr', 'n_jobs': 1, 'penalty': 'l1', 'random_state': 123, 'solver': 'saga', 'tol': 0.0005, 'warm_start': False}


In [14]:
best_log_reg = LogisticRegression(C=10, fit_intercept=False,max_iter=10000, multi_class="ovr",n_jobs=1,penalty="l1", random_state=123, solver="saga", tol=0.0005, warm_start=False)

best_log_reg.fit(X,y)
log_reg_preds = best_log_reg.predict(X)
confusion_matrix(y, log_reg_preds)


array([[475,  74],
       [108, 234]])

So Logistic regression performed worse. Let's choose the Ridge Regression for the classification. 

In [19]:
X_test = test.drop(['index', 'PassengerId','Survived', 'Embarked', 'Cabin_dep'], axis = 1)
X_test.head()
y_preds_test = best_model.predict(X_test)
preds_test = []
for i in range(len(y_preds_test)):
    if y_preds_test[i] < 0.5:
        preds_test.append(0)
    else:
        preds_test.append(1)

In [33]:
submission = pd.concat([test['PassengerId'], pd.Series(preds_test)], axis=1)
submission.columns = ['PassengerId', 'Survived']
submission.head()
submission.to_csv("submission.csv", index = False)