In [26]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from tensorflow import keras
from tensorflow.keras import layers

In [3]:
def get_performance(predictions, y_test, labels=[1, 0]):
    # Put your code
    accuracy = accuracy_score(y_test, predictions)
    precision, recall, f1_score, support = precision_recall_fscore_support(y_test, predictions)
    precision = precision[1]
    recall = recall[1]
    f1_score = f1_score[1]
    
    report = classification_report(y_test, predictions)
    
    cm = confusion_matrix(y_test, predictions)  # replace
    cm_as_dataframe = pd.DataFrame(data=cm)
    
    print('Model Performance metrics:')
    print('-'*30)
    print('Accuracy:', accuracy)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1 Score:', f1_score)
    print('\nModel Classification report:')
    print('-'*30)
    print(report)
    print('\nPrediction Confusion Matrix:')
    print('-'*30)
    print(cm_as_dataframe)
    
    return accuracy, precision, recall, f1_score

In [4]:
X_train=pd.read_csv('X_train.csv')
X_test=pd.read_csv('X_test.csv')
y_train=pd.read_csv('y_train.csv')
y_test=pd.read_csv('y_test.csv')

In [5]:
X_train=X_train.drop(columns=['Unnamed: 0','PassengerId'],axis=1)
X_test=X_test.drop(columns=['Unnamed: 0','PassengerId'],axis=1)

In [6]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,1.0,28.0,0,0,7.8958
1,3,0.0,17.0,4,2,7.9250
2,3,1.0,30.0,1,0,16.1000
3,3,1.0,22.0,0,0,7.2500
4,2,0.0,45.0,0,0,13.5000
...,...,...,...,...,...,...
663,1,0.0,39.0,1,1,83.1583
664,3,0.0,19.0,1,0,7.8542
665,3,1.0,28.0,0,0,7.7333
666,3,0.0,36.0,1,0,17.4000


In [7]:
y_train=y_train.drop(columns=['Unnamed: 0'],axis=1)
y_test=y_test.drop(columns=['Unnamed: 0'],axis=1)

#### Baseline

In [8]:
dtc=DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=50, min_samples_split=2, min_samples_leaf=1)

In [9]:
dtc.fit(X_train,y_train['Survived'])

DecisionTreeClassifier(max_depth=50)

In [10]:
y_test_pred=dtc.predict(X_test)

In [11]:
y_test_pred

array([0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 1])

In [12]:
y_test['Survived']

0      0
1      0
2      0
3      1
4      1
      ..
218    0
219    1
220    0
221    1
222    1
Name: Survived, Length: 223, dtype: int64

In [13]:
accuracy_score(y_test['Survived'],y_test_pred)

0.7757847533632287

In [14]:
sum(y_test['Survived']==y_test_pred)/len(y_test)

0.7757847533632287

In [15]:
confusion_matrix(y_test['Survived'],y_test_pred)

array([[113,  26],
       [ 24,  60]])

In [16]:
get_performance(y_test_pred, y_test['Survived'])

Model Performance metrics:
------------------------------
Accuracy: 0.7757847533632287
Precision: 0.6976744186046512
Recall: 0.7142857142857143
F1 Score: 0.7058823529411765

Model Classification report:
------------------------------
              precision    recall  f1-score   support

           0       0.82      0.81      0.82       139
           1       0.70      0.71      0.71        84

    accuracy                           0.78       223
   macro avg       0.76      0.76      0.76       223
weighted avg       0.78      0.78      0.78       223


Prediction Confusion Matrix:
------------------------------
     0   1
0  113  26
1   24  60


(0.7757847533632287,
 0.6976744186046512,
 0.7142857142857143,
 0.7058823529411765)

### Random Forest

In [17]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2,3,5],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 250]
}
rf = RandomForestClassifier()
grid_search = GridSearchCV(rf, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)


In [None]:
grid_search.fit(X_train,y_train['Survived'])

In [19]:
grid_search.best_estimator_

RandomForestClassifier(max_depth=110, max_features=2, min_samples_leaf=3,
                       min_samples_split=8)

In [20]:
y_test_pred_rf=grid_search.predict(X_test)

In [21]:
accuracy_score(y_test['Survived'],y_test_pred_rf)

0.852017937219731

In [22]:
confusion_matrix(y_test['Survived'],y_test_pred_rf)

array([[128,  11],
       [ 22,  62]])

In [23]:
get_performance(y_test_pred_rf, y_test['Survived'])

Model Performance metrics:
------------------------------
Accuracy: 0.852017937219731
Precision: 0.8493150684931506
Recall: 0.7380952380952381
F1 Score: 0.7898089171974523

Model Classification report:
------------------------------
              precision    recall  f1-score   support

           0       0.85      0.92      0.89       139
           1       0.85      0.74      0.79        84

    accuracy                           0.85       223
   macro avg       0.85      0.83      0.84       223
weighted avg       0.85      0.85      0.85       223


Prediction Confusion Matrix:
------------------------------
     0   1
0  128  11
1   22  62


(0.852017937219731, 0.8493150684931506, 0.7380952380952381, 0.7898089171974523)

### XGboost

In [27]:

param_dist = {
    'n_estimators':[5,6,7,8],
    'max_depth':[30,40,50], 
    'learning_rate':[0.2,0.25,0.3],
    }
#Create the lgbm moodel object
bst = XGBClassifier(objective='binary:logistic')
#Create the GridSearch object
bst_grid = GridSearchCV(bst, param_dist,cv = 5, n_jobs = -1, verbose = 2)

In [28]:

bst_grid.fit(X_train,y_train['Survived'])

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END ....learning_rate=0.2, max_depth=30, n_estimators=5; total time=   0.2s
[CV] END ....learning_rate=0.2, max_depth=30, n_estimators=5; total time=   0.2s
[CV] END ....learning_rate=0.2, max_depth=30, n_estimators=5; total time=   0.2s
[CV] END ....learning_rate=0.2, max_depth=30, n_estimators=6; total time=   0.0s
[CV] END ....learning_rate=0.2, max_depth=30, n_estimators=5; total time=   0.1s
[CV] END ....learning_rate=0.2, max_depth=30, n_estimators=6; total time=   0.2s
[CV] END ....learning_rate=0.2, max_depth=30, n_estimators=6; total time=   0.0s
[CV] END ....learning_rate=0.2, max_depth=30, n_estimators=6; total time=   0.2s
[CV] END ....learning_rate=0.2, max_depth=30, n_estimators=6; total time=   0.1s
[CV] END ....learning_rate=0.2, max_depth=30, n_estimators=7; total time=   0.1s
[CV] END ....learning_rate=0.2, max_depth=30, n_estimators=7; total time=   0.1s
[CV] END ....learning_rate=0.2, max_depth=30, n

GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=None,
                                     gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None,...
                                     max_cat_threshold=None,
                                     max_cat_to_onehot=None,
                                     max_delta_step=None, max_depth=None,
                                     max_leaves=Non

In [None]:
bst_grid.best_estimator_

In [None]:
y_test_pred_lgbm=bst_grid.predict(X_test)
get_performance(y_test_pred_lgbm, y_test['Survived'])

### NN

In [None]:
model = keras.Sequential()
model.add(layers.Dense(8, input_dim=6, activation='relu'))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(2, activation='softmax'))

model.summary()

In [None]:
opt = keras.optimizers.Adam(learning_rate=0.001)

In [None]:
model.compile(
  loss='sparse_categorical_crossentropy', 
  optimizer=opt,
  metrics=['accuracy'],
)

In [None]:
cp2 = keras.callbacks.ModelCheckpoint('/home/manuelquiros/Documents/Kaggle/Titanic/MLP/', save_best_only=True)
log_dir2 = "MLP/logs/fit/"
tb2 = keras.callbacks.TensorBoard(log_dir=log_dir2, histogram_freq=1)
es2 = keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=8)

In [None]:
history = model.fit(
  X_train, 
  y_train['Survived'], 
  epochs=50, 
  batch_size=16, 
  validation_split=0.2,
  callbacks=[cp2,tb2]
)

In [None]:
model = keras.models.load_model('MLP/')

In [None]:
y_test_pred_lgbm

In [None]:
y_test_pred_lgbm=model.predict(X_test)
get_performance(y_test_pred_lgbm, y_test['Survived'])

### Validation predictions

In [None]:
#change the model in the predict line
X_val = pd.read_csv('X_val.csv')
X_val_pred = X_val.drop(columns=['Unnamed: 0','PassengerId'],axis=1)
y_val_pred_rf=bst_grid.predict(X_val_pred)
X_val['Survived']=y_val_pred_rf
output=X_val.drop(columns=['Unnamed: 0',  'Pclass', 'Sex', 'Age', 'SibSp', 'Parch','Fare'],axis=1)
output.to_csv('output.csv',index=False)