In [30]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
#from xgboost import XGBClassifier
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [6]:
import pandas as pd
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

In [7]:
def get_performance(predictions, y_test, labels=[1, 0]):
    # Put your code
    accuracy = accuracy_score(y_test, predictions)
    precision, recall, f1_score, support = precision_recall_fscore_support(y_test, predictions)
    precision = precision[1]
    recall = recall[1]
    f1_score = f1_score[1]
    
    report = classification_report(y_test, predictions)
    
    cm = confusion_matrix(y_test, predictions)  # replace
    cm_as_dataframe = pd.DataFrame(data=cm)
    
    print('Model Performance metrics:')
    print('-'*30)
    print('Accuracy:', accuracy)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1 Score:', f1_score)
    print('\nModel Classification report:')
    print('-'*30)
    print(report)
    print('\nPrediction Confusion Matrix:')
    print('-'*30)
    print(cm_as_dataframe)
    
    return accuracy, precision, recall, f1_score

In [8]:
X_train=pd.read_csv('X_train.csv')
X_test=pd.read_csv('X_test.csv')
y_train=pd.read_csv('y_train.csv')
y_test=pd.read_csv('y_test.csv')

In [9]:
X_train=X_train.drop(columns=['Unnamed: 0','PassengerId'],axis=1)
X_test=X_test.drop(columns=['Unnamed: 0','PassengerId'],axis=1)

In [14]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,1.0,28.0,0,0,7.8958
1,3,0.0,17.0,4,2,7.9250
2,3,1.0,30.0,1,0,16.1000
3,3,1.0,22.0,0,0,7.2500
4,2,0.0,45.0,0,0,13.5000
...,...,...,...,...,...,...
663,1,0.0,39.0,1,1,83.1583
664,3,0.0,19.0,1,0,7.8542
665,3,1.0,28.0,0,0,7.7333
666,3,0.0,36.0,1,0,17.4000


In [10]:
y_train=y_train.drop(columns=['Unnamed: 0'],axis=1)
y_test=y_test.drop(columns=['Unnamed: 0'],axis=1)

#### Baseline

In [97]:
dtc=DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=50, min_samples_split=2, min_samples_leaf=1)

In [98]:
dtc.fit(X_train,y_train['Survived'])

In [99]:
y_test_pred=dtc.predict(X_test)

In [100]:
y_test_pred

array([0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 1])

In [101]:
y_test['Survived']

0      0
1      0
2      0
3      1
4      1
      ..
218    0
219    1
220    0
221    1
222    1
Name: Survived, Length: 223, dtype: int64

In [102]:
accuracy_score(y_test['Survived'],y_test_pred)

0.7623318385650224

In [103]:
sum(y_test['Survived']==y_test_pred)/len(y_test)

0.7623318385650224

In [104]:
confusion_matrix(y_test['Survived'],y_test_pred)

array([[114,  25],
       [ 28,  56]])

In [105]:
get_performance(y_test_pred, y_test['Survived'])

Model Performance metrics:
------------------------------
Accuracy: 0.7623318385650224
Precision: 0.691358024691358
Recall: 0.6666666666666666
F1 Score: 0.6787878787878788

Model Classification report:
------------------------------
              precision    recall  f1-score   support

           0       0.80      0.82      0.81       139
           1       0.69      0.67      0.68        84

    accuracy                           0.76       223
   macro avg       0.75      0.74      0.75       223
weighted avg       0.76      0.76      0.76       223


Prediction Confusion Matrix:
------------------------------
     0   1
0  114  25
1   28  56


(0.7623318385650224, 0.691358024691358, 0.6666666666666666, 0.6787878787878788)

### Random Forest

In [131]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2,3,5],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 250]
}
rf = RandomForestClassifier()
grid_search = GridSearchCV(rf, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)


In [132]:
grid_search.fit(X_train,y_train['Survived'])

Fitting 3 folds for each of 324 candidates, totalling 972 fits
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.3s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=200; total time=   0.4s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=200; total time=   0.4s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.4s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=10, n_estimators=100; total time=   0.2s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.3s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=250; total time=   0.5s
[CV] 

In [133]:
grid_search.best_estimator_

In [134]:
y_test_pred_rf=grid_search.predict(X_test)

In [135]:
accuracy_score(y_test['Survived'],y_test_pred_rf)

0.8475336322869955

In [111]:
confusion_matrix(y_test['Survived'],y_test_pred_rf)

array([[130,   9],
       [ 24,  60]])

In [112]:
get_performance(y_test_pred_rf, y_test['Survived'])

Model Performance metrics:
------------------------------
Accuracy: 0.852017937219731
Precision: 0.8695652173913043
Recall: 0.7142857142857143
F1 Score: 0.7843137254901961

Model Classification report:
------------------------------
              precision    recall  f1-score   support

           0       0.84      0.94      0.89       139
           1       0.87      0.71      0.78        84

    accuracy                           0.85       223
   macro avg       0.86      0.82      0.84       223
weighted avg       0.85      0.85      0.85       223


Prediction Confusion Matrix:
------------------------------
     0   1
0  130   9
1   24  60


(0.852017937219731, 0.8695652173913043, 0.7142857142857143, 0.7843137254901961)

### XGboost

In [39]:

param_dist = {
    'n_estimators':[5,6,7,8],
    'max_depth':[30,40,50], 
    'learning_rate':[0.2,0.25,0.3],
    }
#Create the lgbm moodel object
bst = XGBClassifier(objective='binary:logistic')
#Create the GridSearch object
bst_grid = GridSearchCV(bst, param_dist,cv = 5, n_jobs = -1, verbose = 2)

In [40]:

bst_grid.fit(X_train,y_train['Survived'])

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END ....learning_rate=0.2, max_depth=30, n_estimators=5; total time=   0.0s[CV] END ....learning_rate=0.2, max_depth=30, n_estimators=5; total time=   0.0s

[CV] END ....learning_rate=0.2, max_depth=30, n_estimators=5; total time=   0.0s
[CV] END ....learning_rate=0.2, max_depth=30, n_estimators=5; total time=   0.0s
[CV] END ....learning_rate=0.2, max_depth=30, n_estimators=5; total time=   0.0s
[CV] END ....learning_rate=0.2, max_depth=30, n_estimators=6; total time=   0.0s
[CV] END ....learning_rate=0.2, max_depth=30, n_estimators=6; total time=   0.0s
[CV] END ....learning_rate=0.2, max_depth=30, n_estimators=6; total time=   0.0s
[CV] END ....learning_rate=0.2, max_depth=30, n_estimators=6; total time=   0.0s
[CV] END ....learning_rate=0.2, max_depth=30, n_estimators=6; total time=   0.0s
[CV] END ....learning_rate=0.2, max_depth=30, n_estimators=7; total time=   0.0s
[CV] END ....learning_rate=0.2, max_depth=30, n

In [41]:
bst_grid.best_estimator_

In [42]:
y_test_pred_lgbm=bst_grid.predict(X_test)
get_performance(y_test_pred_lgbm, y_test['Survived'])

### NN

In [70]:
model = keras.Sequential()
model.add(layers.Dense(8, input_dim=6, activation='relu'))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(2, activation='softmax'))

model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_35 (Dense)            (None, 8)                 56        
                                                                 
 dense_36 (Dense)            (None, 128)               1152      
                                                                 
 dense_37 (Dense)            (None, 128)               16512     
                                                                 
 dense_38 (Dense)            (None, 2)                 258       
                                                                 
Total params: 17,978
Trainable params: 17,978
Non-trainable params: 0
_________________________________________________________________


In [76]:
opt = keras.optimizers.Adam(learning_rate=0.001)

In [77]:
model.compile(
  loss='sparse_categorical_crossentropy', 
  optimizer=opt,
  metrics=['accuracy'],
)

In [78]:
cp2 = keras.callbacks.ModelCheckpoint('/home/manuelquiros/Documents/Kaggle/Titanic/MLP/', save_best_only=True)
log_dir2 = "MLP/logs/fit/"
tb2 = keras.callbacks.TensorBoard(log_dir=log_dir2, histogram_freq=1)
es2 = keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=8)

In [79]:
history = model.fit(
  X_train, 
  y_train['Survived'], 
  epochs=50, 
  batch_size=16, 
  validation_split=0.2,
  callbacks=[cp2,tb2]
)

Epoch 1/50
 1/34 [..............................] - ETA: 7s - loss: 0.3141 - accuracy: 0.8125INFO:tensorflow:Assets written to: /home/manuelquiros/Documents/Kaggle/Titanic/MLP/assets
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
 1/34 [..............................] - ETA: 0s - loss: 0.2792 - accuracy: 0.8750INFO:tensorflow:Assets written to: /home/manuelquiros/Documents/Kaggle/Titanic/MLP/assets
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [80]:
model = keras.models.load_model('MLP/')

In [32]:
y_test_pred_lgbm

array([[9.42068338e-01, 5.79316467e-02],
       [8.35553765e-01, 1.64446115e-01],
       [9.67601836e-01, 3.23981084e-02],
       [2.59260565e-01, 7.40739346e-01],
       [3.21395636e-01, 6.78604305e-01],
       [7.43769348e-01, 2.56230503e-01],
       [5.94936050e-02, 9.40506339e-01],
       [3.99173796e-01, 6.00826204e-01],
       [8.75969648e-01, 1.24030314e-01],
       [5.01540899e-01, 4.98459160e-01],
       [8.78005981e-01, 1.21993951e-01],
       [6.52772114e-02, 9.34722781e-01],
       [8.39091420e-01, 1.60908639e-01],
       [2.27613449e-02, 9.77238655e-01],
       [1.55775696e-02, 9.84422445e-01],
       [3.88069510e-01, 6.11930430e-01],
       [8.96985352e-01, 1.03014670e-01],
       [9.05854940e-01, 9.41450596e-02],
       [8.52753818e-01, 1.47246137e-01],
       [2.46301388e-05, 9.99975264e-01],
       [9.07479942e-01, 9.25200284e-02],
       [2.94443220e-01, 7.05556810e-01],
       [8.38872135e-01, 1.61127880e-01],
       [7.28110433e-01, 2.71889597e-01],
       [3.271340

In [31]:
y_test_pred_lgbm=model.predict(X_test)
get_performance(y_test_pred_lgbm, y_test['Survived'])



ValueError: Classification metrics can't handle a mix of binary and continuous-multioutput targets

### Validation predictions

In [44]:
#change the model in the predict line
X_val = pd.read_csv('X_val.csv')
X_val_pred = X_val.drop(columns=['Unnamed: 0','PassengerId'],axis=1)
y_val_pred_rf=bst_grid.predict(X_val_pred)
X_val['Survived']=y_val_pred_rf
output=X_val.drop(columns=['Unnamed: 0',  'Pclass', 'Sex', 'Age', 'SibSp', 'Parch','Fare'],axis=1)
output.to_csv('output.csv',index=False)