In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import KFold 
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_absolute_error, accuracy_score

from sklearn.preprocessing import StandardScaler

In [None]:
# PassengerId column of the competition test set; paired with predictions
# when a submission file is written.
test_passengerIDs = pd.read_csv('../input/titanic/test.csv').loc[:, 'PassengerId']

# func to submit predictions easily
def submit_predictions(predictions, filename = 'submission.csv'):
    """Write a Kaggle submission CSV pairing test PassengerIds with predictions.

    Parameters
    ----------
    predictions : array-like
        One prediction per row of the test set, in test-set order. May be a
        NumPy array, a Series or a single-column DataFrame.
    filename : str, default 'submission.csv'
        Output path. Pass a different name to keep several submissions side
        by side instead of overwriting the previous one.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of PassengerIds.
    """
    # Work with positional values: a Series carrying a non-default index would
    # otherwise be re-aligned against test_passengerIDs and silently become NaN.
    values = np.asarray(predictions).ravel()
    if len(values) != len(test_passengerIDs):
        raise ValueError(
            'expected %d predictions, got %d' % (len(test_passengerIDs), len(values))
        )

    submission = pd.DataFrame({'PassengerId': test_passengerIDs.to_numpy(), 'Survived': values})
    submission.to_csv(filename, index = False)

In [None]:
test_passengerIDs.head()

In [None]:
# Unmodified competition files, loaded as-is.
train_orig, test_orig = map(
    pd.read_csv,
    ('../input/titanic/train.csv', '../input/titanic/test.csv'),
)

In [None]:
train_orig.head()

In [None]:
test_orig.head()

# Random Forest

## Setup

In [None]:
# Feature files prepared for the random forest model.
train_random_forest, test_random_forest = map(
    pd.read_csv,
    (
        '../input/teammates-titanic/train_random_forest.csv',
        '../input/teammates-titanic/test_random_forest.csv',
    ),
)

In [None]:
train_random_forest.head()

In [None]:
test_random_forest.head()

## Training

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Feature matrix: every column except the target.
X_random_forest = train_random_forest.drop(columns='Survived')

In [None]:
# Target vector.
y_random_forest = train_random_forest.loc[:, 'Survived']

In [None]:
# Shallow forest: 30 gini trees capped at depth 3, seeded for repeatable fits.
RFC = RandomForestClassifier(
    n_estimators=30,
    criterion='gini',
    max_depth=3,
    random_state=0,
)

In [None]:
# Three-fold splitter used for the manual cross-validation loop below.
# No shuffle/random_state is passed, so KFold's defaults apply.
kf = KFold(n_splits = 3)
# Displays the number of folds as a sanity check.
kf.get_n_splits(X_random_forest)

In [None]:
# Manual K-fold evaluation of the random forest.
# The scaler is fitted on each training fold only, so the held-out fold never
# contributes to the scaling statistics.
scores = []
for train_index, valid_index in kf.split(X_random_forest):
    X_train = X_random_forest.iloc[train_index]
    X_test = X_random_forest.iloc[valid_index]
    y_train = y_random_forest.iloc[train_index]
    y_test = y_random_forest.iloc[valid_index]

    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # RFC is refitted on every pass; after the loop it holds the last fold's fit.
    RFC.fit(X_train, y_train)
    predictions = (RFC.predict(X_test) > 0.5).astype(int)
    print(predictions)

    # fold accuracy = share of held-out rows predicted correctly
    scores.append((predictions == y_test).sum() / len(y_test))
    print(confusion_matrix(y_test, predictions))
    print(classification_report(y_test, predictions))

In [None]:
# Summary statistics first, then the individual fold accuracies.
fold_scores = pd.Series(scores)
print(fold_scores.describe())
print(fold_scores)

## Predict

In [None]:
# Scale the test data with statistics learned from the TRAINING features.
# A scaler fitted on the test set would put the test rows on a different scale
# than the rows the forest was trained on.
scaler = StandardScaler()
scaler.fit(X_random_forest)

# After the K-fold loop RFC only holds the fit from the last fold; refit it on
# the full training set, scaled with the same scaler used for the test data.
RFC.fit(scaler.transform(X_random_forest), y_random_forest)

# Select the training columns in training order so the scaler sees the layout
# it was fitted on.
test_random_forest_scaled = scaler.transform(test_random_forest[X_random_forest.columns])

print(test_random_forest_scaled)

In [None]:
# Random forest class predictions for the scaled test set; this array is one of
# the five inputs read by ensemble_weight_avg.
pred_random_forest = RFC.predict(test_random_forest_scaled)

pred_random_forest

# SVM

## Setup

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Feature files prepared for the SVM model.
train_svm, test_svm = map(
    pd.read_csv,
    ('../input/titanic-svm/train_svm.csv', '../input/titanic-svm/test_svm.csv'),
)

In [None]:
train_svm.head()

In [None]:
test_svm.head()

In [None]:
# Feature columns fed to the SVM (everything except the target).
X_train_svm_columns = [
    'Age',
    'Fare',
    'Cabin',
    'Title',
    'Has_Cabin',
    'Family_Size',
    'Is_Alone',
    'Age_bin',
    'Fare_bin',
    'Tick_Len',
    'Pclass_Frequency',
    'Embarked_Q',
    'Embarked_S',
    'Male',
]

## Training

In [None]:
from sklearn.preprocessing import scale

In [None]:
train_svm.columns != 'Survived'

In [None]:
# Hold out 33% of the SVM training rows for evaluation (seeded split).
svm_features = train_svm[X_train_svm_columns]
svm_target = train_svm['Survived']
X_train_svm, X_test_svm, y_train_svm, y_test_svm = train_test_split(
    svm_features,
    svm_target,
    test_size=0.33,
    random_state=101,
)

In [None]:
# Important to scale the X data,
# otherwise GridSearchCV, BayesOptCV,
# don't work well

# Fit the scaler on the training split only and reuse it for the hold-out
# split. Standardising each split with its own mean/std would put the two
# on different scales and distort the hold-out evaluation.
svm_scaler = StandardScaler().fit(X_train_svm)
X_train_svm_scaled = svm_scaler.transform(X_train_svm)
X_test_svm_scaled = svm_scaler.transform(X_test_svm)

In [None]:
from sklearn.svm import SVC

In [None]:
# Two sub-grids: a linear kernel searched over C only, and an RBF kernel
# searched over C and gamma.
gridsearch_paramGrid = [
    dict(C=[1, 10], kernel=['linear']),
    dict(C=[100, 1000, 10000], gamma=[0.1, 0.01, 0.001], kernel=['rbf']),
]

In [None]:
# Accuracy-scored grid search over the SVC grids, using all available cores.
db_gridsearch_svc = GridSearchCV(
    estimator=SVC(),
    param_grid=gridsearch_paramGrid,
    scoring='accuracy',
    n_jobs=-1,
    verbose=5,
)

In [None]:
# Will take 2-3 seconds
db_gridsearch_svc.fit(X_train_svm_scaled, y_train_svm)

In [None]:
# check the best parameters out of curiosity
db_gridsearch_svc.best_params_

In [None]:
# Hold-out evaluation: score the tuned SVC on the rows set aside by
# train_test_split.
gridsearch_svc_pred = db_gridsearch_svc.predict(X_test_svm_scaled)

for summarize in (classification_report, confusion_matrix):
    print(summarize(y_test_svm, gridsearch_svc_pred))

## Predict

In [None]:
# Standardise the competition test set with the TRAINING split's mean/std and
# restrict it to the columns the SVC was fitted on. Scaling the test set with
# its own statistics would feed the model differently scaled inputs.
test_svm_scaled = StandardScaler().fit(X_train_svm).transform(test_svm[X_train_svm_columns])
test_svm_scaled

In [None]:
# SVC class predictions for the scaled test set (GridSearchCV predicts with its
# refitted best estimator); read later by ensemble_weight_avg.
pred_svc = db_gridsearch_svc.predict(test_svm_scaled)
pred_svc

# Neural Network

## Setup

In [None]:
# Imports
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn import tree
from matplotlib import pyplot as plt
from keras.models import Sequential
from keras.layers import Dense
from keras.utils.vis_utils import plot_model
from keras.optimizers import SGD
from keras.optimizers import RMSprop
from keras.optimizers import Adagrad
from tensorflow import keras
from tensorflow.keras import layers
from kerastuner.tuners import RandomSearch
from numpy.random import randn
from keras.regularizers import l1
from keras.regularizers import l2
import tensorflow as tf

import pathlib
import shutil
import tempfile

from kerastuner.tuners import Hyperband

import kerastuner as kt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

from keras.callbacks import EarlyStopping
from keras.layers import GaussianNoise

### I commented out the below imports because I was getting errors
# !pip install git+https://github.com/tensorflow/docs
# import tensorflow_docs as tfdocs
# import tensorflow_docs.modeling
# import tensorflow_docs.plots

# !pip install ann_visualizer
# from ann_visualizer.visualize import ann_viz

In [None]:
# Feature files prepared for the neural network section.
train_neural_network, test_neural_network = map(
    pd.read_csv,
    (
        '../input/teammates-titanic/train_neural_network.csv',
        '../input/teammates-titanic/test_neural_network.csv',
    ),
)

In [None]:
train_neural_network.head()

In [None]:
test_neural_network.head()

In [None]:
# Row/column counts of both frames.
for label, frame in (
    ("Train Shape: ", train_neural_network),
    ("Test Shape: ", test_neural_network),
):
    print(label, frame.shape)

In [None]:
# Encode the two string columns as integers in both frames.
# Assign the result back to the column instead of calling
# `df[col].replace(..., inplace=True)`: the chained in-place form acts on a
# temporary Series and does not update the frame under pandas copy-on-write.
combined_train_test = [train_neural_network, test_neural_network]
for df in combined_train_test:
    df['Sex'] = df['Sex'].replace({'male': 0, 'female': 1})
    df['Embarked'] = df['Embarked'].replace({'S' : 0, 'C' : 1, 'Q' : 2})

## Training

In [None]:
# Split the frame into features (all non-target columns) and target.
is_feature_column = train_neural_network.columns != 'Survived'
X_neural_network = train_neural_network.loc[:, is_feature_column]
y_neural_network = train_neural_network['Survived']

print(X_neural_network)

print(len(X_neural_network) + len(y_neural_network))

In [None]:
# 70/30 seeded split for training and hold-out evaluation.
(
    X_train_neural_network,
    X_test_neural_network,
    y_train_neural_network,
    y_test_neural_network,
) = train_test_split(
    X_neural_network,
    y_neural_network,
    test_size=0.3,
    random_state=7,
)

In [None]:
X_train_neural_network.shape, X_test_neural_network.shape

In [None]:
# Depth-3 gini decision tree, seeded for repeatable fits.
clf = DecisionTreeClassifier(
    criterion='gini',
    max_depth=3,
    random_state=0,
)

In [None]:
# Fit the model
clf.fit(X_train_neural_network, y_train_neural_network)

In [None]:
# Decision tree accuracy on the training rows and on the held-out rows.

for template, features, labels in (
    ("Accuracy on training set: {:.3f}", X_train_neural_network, y_train_neural_network),
    ("Accuracy on test set: {:.3f}", X_test_neural_network, y_test_neural_network),
):
    print(template.format(clf.score(features, labels)))

In [None]:
# Test-set predictions from the decision tree `clf` (despite the variable name);
# ensemble_weight_avg reads this array for its "nn" slot.
tree_test_labels = clf.predict(test_neural_network)
pred_neural_network_2 = (tree_test_labels > 0.5).astype(int).flatten()

In [None]:
text_representation = tree.export_text(clf)
print(text_representation)

In [None]:
# Small fully connected binary classifier with several regularisers against
# overfitting: input noise, L2 weight/activity penalties and dropout after
# every hidden layer.
# NOTE(review): `keras` here is `tensorflow.keras`, while Dense / GaussianNoise /
# l2 are imported from the standalone `keras` package — confirm the installed
# versions allow mixing the two.
model = keras.Sequential()
# Gaussian noise on the inputs; assumes 13 feature columns — TODO confirm
# against X_train_neural_network.shape[1].
model.add(GaussianNoise(0.01, input_shape=(13,)))
#model.add(Dense(13, input_dim=13, activation='relu', activity_regularizer=l2(1e-4))) # Hidden 1
# Hidden 1: 13 ReLU units with an L2 penalty on the weights.
model.add(Dense(13, input_dim=13, activation='relu', kernel_regularizer=l2(0.001)))
model.add(keras.layers.Dropout(0.3))
model.add(Dense(6, activation='relu', activity_regularizer=l2(1e-4))) # Hidden 2
model.add(keras.layers.Dropout(0.3))
#model.add(Dense(3, activation='relu', activity_regularizer=l2(1e-4))) # Hidden 3
# Hidden 3: 3 ReLU units with an L2 penalty on the weights.
model.add(Dense(3, activation='relu', kernel_regularizer=l2(0.001)))
model.add(keras.layers.Dropout(0.3))
# Single sigmoid unit: output is a value in (0, 1), thresholded at 0.5 later.
model.add(Dense(1, activation='sigmoid')) # Output
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)
# Trains on the unscaled training split; no validation data or callbacks are passed.
model.fit(X_train_neural_network, y_train_neural_network, epochs=100, batch_size=8)

#### What worked for reducing overfitting:
#### 1. Activity regularization (l2 seemed better)
#### 2. Weight regularization (l2 as well)
#### 3. Dropout layers
#### 4. Adding statistical noise during training

In [None]:
keras_scores = model.evaluate(X_test_neural_network,y_test_neural_network)

In [None]:
# Report the network's hold-out metrics from `keras_scores` (the result of
# model.evaluate). `scores` is the random-forest K-fold accuracy list and must
# not be used here: it would print a forest fold score under the Keras metric name.
print("\n%s: %.2f%%" % (model.metrics_names[1], keras_scores[1]*100))
# Show every evaluated metric next to its name.
print(pd.Series(keras_scores, index = model.metrics_names))

In [None]:
# Sigmoid outputs for the hold-out rows, flattened to one dimension.
holdout_probabilities = model.predict(X_test_neural_network)
y_pred_neural_network = holdout_probabilities.reshape(-1)
print(y_pred_neural_network[:10])

In [None]:
# round to 0 or 1
y_pred_neural_network = np.round(y_pred_neural_network)
print(y_pred_neural_network[:10])

In [None]:
y_test_neural_network[:10]

In [None]:
print(classification_report(y_test_neural_network, y_pred_neural_network))

In [None]:
plot_model(model, to_file='titanic_model.png', show_shapes=True, show_layer_names=True)

## Predict

In [None]:
# Keras predictions for the competition test set, thresholded at 0.5.
# NOTE(review): the ensemble below reads pred_neural_network_2 (the decision
# tree), not this array — confirm that is intended.
test_probabilities = model.predict(test_neural_network)
predictions_neural_network = (test_probabilities > 0.5).astype(int).flatten()
print(predictions_neural_network)

# LightGBM

## Setup

In [None]:
from sklearn import metrics
import lightgbm as lgb # if you don't have it downloaded, do pip install lightgbm

In [None]:
# Feature files prepared for the LightGBM model.
train_lgbm, test_lgbm = map(
    pd.read_csv,
    (
        '../input/teammates-titanic/train_lgbm.csv',
        '../input/teammates-titanic/test_lgbm.csv',
    ),
)

In [None]:
train_lgbm.head()

In [None]:
test_lgbm.head()

In [None]:
# Target first, then the feature matrix (all remaining columns).
y_lgbm = train_lgbm['Survived']
X_lgbm = train_lgbm.drop(columns='Survived')

In [None]:
# Columns passed to lgb.Dataset as categorical features.
cate_features_name = [
    "Pclass",
    "Embarked",
    "Title",
    "Sex",
    "Cabin",
    "Has_Cabin",
    "Is_Alone",
]

## Training

### For Full Dataset Submission

Note: the `lgb.Dataset` below is built from the full training set. To train for stacking or inside a K-fold loop, create a new `lgb.Dataset` from that fold's X and y values (most likely `X_train` and `y_train`) each time.

In [None]:
# LightGBM training parameters for the binary survival model.
params = {'boosting_type': 'gbdt',
          'max_depth' : -1,
          'objective': 'binary',
          'nthread': 3,
          'num_leaves': 64,
          'learning_rate': 0.05,
          'max_bin': 512,
          'subsample_for_bin': 200,
          'subsample': 1,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'reg_alpha': 5,
          'reg_lambda': 10,
          'min_split_gain': 0.5,
          'min_child_weight': 1,
          'min_child_samples': 5,
          'scale_pos_weight': 1,
          'num_class' : 1,
          'metric' : 'binary_error',
          # Must be spelled `num_boost_round` (no trailing "s"): the plural form is
          # not a recognised alias of num_iterations, so LightGBM would ignore it
          # and fall back to its default number of rounds.
          'num_boost_round' : 87}

In [None]:
# Train on the full training set; this Booster is used for the submission.
# free_raw_data=False keeps the raw frame attached to the Dataset.
lgb_full_train = lgb.Dataset(X_lgbm, y_lgbm, categorical_feature=cate_features_name, free_raw_data=False)

# No `verbose_eval` argument: with no validation sets there is nothing to log,
# and LightGBM 4.x no longer accepts it (passing it raises TypeError).
submission_model_lgbm = lgb.train(
    params, lgb_full_train,
    valid_sets=None,
)

## Predict

In [None]:
# Booster.predict yields decimal scores; threshold them at 0.5 to get 0/1 labels.
lgbm_test_scores = submission_model_lgbm.predict(test_lgbm)
pred_lgbm = (lgbm_test_scores > 0.5).astype(int)

In [None]:
pred_lgbm

# Logistic Regression

## Setup

In [None]:
# Feature files prepared for the logistic regression model.
train_logreg, test_logreg = map(
    pd.read_csv,
    (
        '../input/teammates-titanic/train_logreg.csv',
        '../input/teammates-titanic/test_logreg.csv',
    ),
)

In [None]:
train_logreg.head()

In [None]:
test_logreg.head()

In [None]:
# Drop the 'S' column from both frames to deal with collinearity.
# (Re-running this cell without reloading the CSVs raises KeyError, because
# the column is already gone.)
train_logreg = train_logreg.drop(columns='S')
test_logreg = test_logreg.drop(columns='S')

## Training

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Target first, then the feature matrix (all remaining columns).
y_logreg = train_logreg['Survived']
X_logreg = train_logreg.drop(columns='Survived')

In [None]:
# Standardise the training features.
# NOTE(review): the scaler is fitted before the train/hold-out split below, so
# the held-out rows contribute to the scaling statistics.
logreg_scaler = StandardScaler()
X_logreg_scaled = logreg_scaler.fit_transform(X_logreg)

In [None]:
X_logreg_scaled

In [None]:
# 70/30 seeded split of the scaled features for hold-out evaluation.
X_train_logreg, X_test_logreg, y_train_logreg, y_test_logreg = train_test_split(
    X_logreg_scaled,
    y_logreg,
    test_size=0.30,
    random_state=50,
)

In [None]:
# Logistic regression with default hyper-parameters, fitted on the training split.
logreg = LogisticRegression(random_state = 30)
logreg.fit(X_train_logreg,y_train_logreg)

In [None]:
# Hold-out evaluation (a single train/test split, not K-fold cross-validation):
# predict the rows set aside by train_test_split.
train_pred_logreg = logreg.predict(X_test_logreg)

In [None]:
# Confusion matrix, then per-class precision/recall, on the hold-out rows.
for summarize in (confusion_matrix, classification_report):
    print(summarize(y_test_logreg, train_pred_logreg))

## Predict

In [None]:
# Scale the test data with the scaler already fitted on the training features.
# Do NOT refit it on the test set: that would standardise the test rows with
# different statistics than the rows the model was trained on.
# Columns are selected in training order so the layout matches the fitted scaler.
test_logreg_scaled = logreg_scaler.transform(test_logreg[X_logreg.columns])

In [None]:
test_logreg_scaled

In [None]:
# Logistic regression class predictions for the scaled test set; read later by
# ensemble_weight_avg.
pred_logreg = logreg.predict(test_logreg_scaled)

In [None]:
# Check that results are reasonable
pred_logreg

# Ensembling

> ## Weighted Average Manual

In [None]:
# default weights: every model gets an equal 0.2 share
(
    random_forest_weight,
    svc_weight,
    neural_network_weight,
    lgbm_weight,
    logreg_weight,
) = (0.2,) * 5

In [None]:
# weighted vote over the five models' test-set predictions
# (5) parameters: the weight applied to each model's predictions
def ensemble_weight_avg(rf_weight, svc_weight, nn_weight, lgbm_weight, logreg_weight):
    """Blend the notebook-level prediction arrays into a single 0/1 Series.

    Reads pred_random_forest, pred_svc, pred_neural_network_2, pred_lgbm and
    pred_logreg from the notebook namespace, scales each by its weight, sums
    them row by row and labels a row 1 only when the sum is strictly above 0.5.
    """
    weighted_votes = (
        pred_random_forest * rf_weight,
        pred_svc * svc_weight,
        pred_neural_network_2 * nn_weight,
        pred_lgbm * lgbm_weight,
        pred_logreg * logreg_weight,
    )

    # start from a zero Series with one row per test passenger
    running_total = pd.Series([0] * len(test_orig))
    for votes in weighted_votes:
        running_total = running_total + votes

    # a sum of exactly 0.5 is not above the threshold, so it maps to 0
    return (running_total > 0.5).astype(int)

In [None]:
# convenience wrapper around ensemble_weight_avg
# that uses the five notebook-level weights
def titanic_ensemble():
    """Print the current notebook-level weights and return the weighted-vote predictions."""
    weights = (
        random_forest_weight,
        svc_weight,
        neural_network_weight,
        lgbm_weight,
        logreg_weight,
    )
    print('Weights: ', *weights)
    return ensemble_weight_avg(*weights)

In [None]:
# test out the functions
weight_avg_pred = titanic_ensemble()

In [None]:
weight_avg_pred.head()

In [None]:
weight_avg_pred.tail()

In [None]:
submit_predictions(weight_avg_pred)

In [None]:
# second weighting: more weight on the random forest and LightGBM,
# less on the "neural network" slot and logistic regression
(
    random_forest_weight,
    svc_weight,
    neural_network_weight,
    lgbm_weight,
    logreg_weight,
) = (0.3, 0.2, 0.1, 0.3, 0.1)

In [None]:
weight_avg_pred2 = titanic_ensemble()

In [None]:
sns.countplot(x = weight_avg_pred2)

In [None]:
submit_predictions(weight_avg_pred2)

> ## Voting Classifier

In [None]:
from sklearn.ensemble import VotingClassifier 
from sklearn.metrics import accuracy_score 
from sklearn.model_selection import train_test_split 

In [None]:
# NOTE(review): `train` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel. Each model above was trained on its
# own feature file (train_random_forest, train_svm, train_lgbm, ...); choose the
# single frame every voting estimator should share — TODO confirm.
X = train.drop('Survived', axis = 1)
y = train['Survived']

In [None]:
# 70/30 seeded split used to fit and score the voting classifiers.
# (Rebinds the X_train/X_test/y_train/y_test names used in the K-fold loop.)
voting_split = train_test_split(X, y, test_size=0.30, random_state=42)
X_train, X_test, y_train, y_test = voting_split

In [None]:
# (name, estimator) pairs handed to VotingClassifier.
estimator = []
estimator.append(('LR', logreg))  # closing parenthesis was missing (SyntaxError)
estimator.append(('SVC', db_gridsearch_svc))
# NOTE(review): `model` is a Keras Sequential and `submission_model_lgbm` is the
# Booster returned by lgb.train. VotingClassifier expects scikit-learn estimators
# it can clone and refit, so these two entries probably need scikit-learn
# compatible wrappers (e.g. lgb.LGBMClassifier) — TODO confirm.
estimator.append(('NN', model))
estimator.append(('LGBM', submission_model_lgbm))
estimator.append(('RF', RFC))

In [None]:
# Hard voting: each estimator casts one class vote and the majority wins.
vot_hard = VotingClassifier(estimators=estimator, voting='hard')
y_pred = vot_hard.fit(X_train, y_train).predict(X_test)

In [None]:
# Accuracy of the hard-voting ensemble on the held-out rows.
score = accuracy_score(y_test, y_pred)
# Accuracy is a fraction in [0, 1]; "% d" would truncate it to an integer
# (printing 0 for anything below 1.0), so format it as a float.
print("Hard Voting Score %.4f" % score)

In [None]:
# Voting Classifier with soft voting 
# NOTE(review): soft voting averages class probabilities, so every estimator
# must provide predict_proba. The grid-searched SVC is built from a plain SVC()
# (no probability=True), which likely makes this fit/predict fail — TODO confirm.
# Also note that y_pred from the hard-voting cell is overwritten here.
vot_soft = VotingClassifier(estimators = estimator, voting ='soft') 
vot_soft.fit(X_train, y_train) 
y_pred = vot_soft.predict(X_test) 

In [None]:
# Accuracy of the soft-voting ensemble on the held-out rows.
score = accuracy_score(y_test, y_pred)
# Accuracy is a fraction in [0, 1]; "% d" would truncate it to an integer
# (printing 0 for anything below 1.0), so format it as a float.
print("Soft Voting Score %.4f" % score)