## Imports

In [1]:
# pip install --upgrade scikit-learn

In [1]:
import time
from IPython.display import clear_output
import numpy    as np
import pandas   as pd
import seaborn  as sb
import matplotlib.pyplot as plt
import sklearn  as skl

from sklearn import pipeline      # Pipeline
from sklearn import preprocessing # OrdinalEncoder, LabelEncoder
from sklearn import impute
from sklearn import compose
from sklearn import model_selection # train_test_split
from sklearn import metrics         # accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import set_config

set_config(display='diagram') # Useful for display the pipeline

print("Pandas  ", pd.__version__)
print("Sklearn ", skl.__version__) # Try to use 0.24

Pandas   1.2.4
Sklearn  0.24.1


In [2]:
!pwd


/home/aimwps/PyCode/SAI/SAImwps/M4_Feature_Eng/01. Robust ML


## Get the dataset
- **CLOUD = True**: Download dataset from Kaggle. Necesary for cloud enviroments like COLAB. **Specify your [kaggle credentials](https://www.kaggle.com/docs/api)**.
- **CLOUD = False**: Get the dataset from your local machine. **Specify the data path**.

In [3]:
CLOUD = False

if CLOUD:
    import os
    os.environ['KAGGLE_USERNAME'] = "your_kaggle_username"
    os.environ['KAGGLE_KEY']      = "your_kaggle_api_key"  # See https://www.kaggle.com/docs/api
    !pip install --upgrade kaggle
    !kaggle competitions download -c titanic
    DATA_PATH = "./"

else:
    DATA_PATH = "../../Datasets/Tabular/titanic/"



## Load data

In [4]:
df      = pd.read_csv(DATA_PATH + "train.csv", index_col='PassengerId')
df_test = pd.read_csv(DATA_PATH + "test.csv",  index_col='PassengerId')

print("Train DataFrame:", df.shape)
print("Test DataFrame: ", df_test.shape)

Train DataFrame: (891, 11)
Test DataFrame:  (418, 10)


## Check missings

In [5]:
df.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [6]:
df_test.isnull().sum()

Pclass        0
Name          0
Sex           0
Age          86
SibSp         0
Parch         0
Ticket        0
Fare          1
Cabin       327
Embarked      0
dtype: int64

# Exercise 1 (2pts):
Extract the title (Mr, Mrs, ... ) from the "Name" column.

Tips:
- split(',')[1] to get the 2nd part, and remove the surnamename
- split('.')[0] to get the 1str part, and remove the name

In [7]:
# CODE HERE get_Title_from_Name funtion
# Create this function using lambda (not def)

get_Title_from_Name = lambda x: x.split(",")[1].split(".")[0].strip()

# YOUR CODE HERE
df['Title']      = df['Name'].map(get_Title_from_Name)
df_test['Title'] = df_test['Name'].map(get_Title_from_Name)

print(df.Title.unique())
print(df_test.Title.unique())

['Mr' 'Mrs' 'Miss' 'Master' 'Don' 'Rev' 'Dr' 'Mme' 'Ms' 'Major' 'Lady'
 'Sir' 'Mlle' 'Col' 'Capt' 'the Countess' 'Jonkheer']
['Mr' 'Mrs' 'Miss' 'Master' 'Ms' 'Col' 'Rev' 'Dr' 'Dona']


In [8]:
assert df['Title'].values[0] == "Mr"
assert df['Title'].values[1] == "Mrs"
assert df['Title'].values[2] == "Miss"

assert df_test['Title'].values[0] == "Mr"
assert df_test['Title'].values[1] == "Mrs"
assert df_test['Title'].values[414] == "Dona"

# Exercise 2 (1pts):
Apply the title_dictionary to get a better information about the title. You have to overwrite the Title variable.

In [9]:
title_dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Dona": "Royalty",
    "Sir" : "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess":"Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Miss" : "Miss",
    "Master" : "Master",
    "Lady" : "Royalty"
}

In [10]:
# Use map to apply the prevous dict

df["Title"] =  df['Title'].map(title_dictionary)
print(df.Title.unique())
df_test["Title"] = df_test['Title'].map(title_dictionary, na_action='ignore')
print(df_test.Title.unique())
# YOUR CODE HERE

['Mr' 'Mrs' 'Miss' 'Master' 'Royalty' 'Officer']
['Mr' 'Mrs' 'Miss' 'Master' 'Officer' 'Royalty']


In [11]:
assert df['Title'].values[886] == "Officer"
assert df_test['Title'].values[417] == "Master"

# Exercise OPTINAL (0pts):
Try to extract some information from the feature **Ticket**. Search on Internet if that colum has some kind of information.

In [12]:
df_test['Ticket']

PassengerId
892                 330911
893                 363272
894                 240276
895                 315154
896                3101298
               ...        
1305             A.5. 3236
1306              PC 17758
1307    SOTON/O.Q. 3101262
1308                359309
1309                  2668
Name: Ticket, Length: 418, dtype: object

# Exercise OPTIONAL (0pts):
Try to extract some information from the feature **Cabin**. Search on Internet if that colum has some kind of information.

In [41]:
get_deck_from_cabin = lambda x: x[0] if isinstance(x, str) else "Z"  
df['Cabin'] = df['Cabin'].map(get_deck_from_cabin)
df_test['Cabin'] = df_test['Cabin'].map(get_deck_from_cabin)
print(df.head())

             Survived  Pclass  \
PassengerId                     
1                   0       3   
2                   1       1   
3                   1       3   
4                   1       1   
5                   0       3   

                                                          Name     Sex   Age  \
PassengerId                                                                    
1                                      Braund, Mr. Owen Harris    male  22.0   
2            Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0   
3                                       Heikkinen, Miss. Laina  female  26.0   
4                 Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0   
5                                     Allen, Mr. William Henry    male  35.0   

             SibSp  Parch            Ticket     Fare Cabin Embarked Title  
PassengerId                                                                
1                1      0         A/5 21171   7.2500   

# Preprocessing
For X data, notice that...
- We drop Survived because is the target variable
- We drop Name because we have extracted the Title: Mr, Mrs, ...
- We drop Ticket because it has no information -> see df.Ticket.nunique()
- We drop Cabin because it has a lot of missings (77% are missings)

Then, we identify **numerical** variables and **categorical** variables,

In [59]:
x = df.drop(columns=["Survived", 'Name', 'Ticket', 'Cabin']) # X DATA (WILL BE TRAIN+VALID DATA)
y = df["Survived"] # 0 = No, 1 = Yes

x_test = df_test.drop(columns=['Name', 'Ticket']) # # X_TEST DATA (NEW DATA)

In [60]:
cat_vars  = ['Sex', 'Embarked', 'Title',]         # x.select_dtypes(include=[object]).columns.values.tolist()
num_vars  = ['Pclass', 'SibSp', 'Parch', 'Fare', 'Age'] # x.select_dtypes(exclude=[object]).columns.values.tolist()

print("\nNumerical features:\n", num_vars)
print("\nCategorical features:\n", cat_vars)


Numerical features:
 ['Pclass', 'SibSp', 'Parch', 'Fare', 'Age']

Categorical features:
 ['Sex', 'Embarked', 'Title']


# Exercise 3 (2pts):
Create a **ColumnTransformer for Tree Models**. You need to create 2 pipelines (one for numerical and other for categories). Remember:
- Categorical pipeline: Some SimpleImputer -> Some Encoder
- Numerical pipeline: Some SimpleImputer -> NO Encoder

In [61]:
"""
num_preprocessing = pipeline.Pipeline(steps=[
  # Some SimpleImputer here
])

cat_preporcessing = pipeline.Pipeline(steps=[
  # Some SimpleImputer here
  # Some Encoder here. Remember to handle_unknown
])

tree_prepro = compose.ColumnTransformer(transformers=[
    ('num', num_preprocessing, num_vars),
    ('cat', cat_preporcessing, cat_vars),
], remainder='drop') # Drop other vars not specified in num_vars or cat_vars

tree_prepro
""";

# YOUR CODE HERE
num_4_treeModels = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='mean', add_indicator=False))]) # mean, median

cat_4_treeModels = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='constant', fill_value='missing')),
    ('ordinal', preprocessing.OrdinalEncoder(handle_unknown='ignore'))])
tree_prepro = compose.ColumnTransformer(transformers=[
    ('num', num_4_treeModels, num_vars),
    ('cat', cat_4_treeModels, cat_vars),],
    remainder='drop')
                                     
tree_prepro

In [62]:
assert type(tree_prepro)      is compose._column_transformer.ColumnTransformer
assert type(num_4_treeModels) is pipeline.Pipeline
assert type(cat_4_treeModels) is pipeline.Pipeline
assert len(num_4_treeModels) == 1
assert len(cat_4_treeModels) == 2

# Exercise 4 (1pts):
1. Complete the diccionary with some Tree Models.
2. Then we put each model in a Pipeline where:
   - first is the prepocessing with the column Transformer
   - Then is the Tree model
3. Display the fullpipeline of the LGBMClassifier

In [63]:
from sklearn.tree          import DecisionTreeClassifier
from sklearn.ensemble      import RandomForestClassifier
from sklearn.ensemble      import ExtraTreesClassifier
from sklearn.ensemble      import AdaBoostClassifier
from sklearn.ensemble      import GradientBoostingClassifier
from sklearn.experimental  import enable_hist_gradient_boosting # Necesary for HistGradientBoostingClassifier
from sklearn.ensemble      import HistGradientBoostingClassifier
from xgboost               import XGBClassifier
from lightgbm              import LGBMClassifier
from catboost              import CatBoostClassifier

In [64]:

tree_classifiers = {
  "Decision Tree": DecisionTreeClassifier(),
  "Extra Trees":ExtraTreesClassifier(),
  "Random Forest":RandomForestClassifier(),
  "AdaBoost":AdaBoostClassifier(),
  "Skl GBM":GradientBoostingClassifier(),
  "Skl HistGBM":HistGradientBoostingClassifier(),
  "XGBoost":XGBClassifier(use_label_encoder=False),
  "LightGBM":LGBMClassifier(),
  "CatBoost":CatBoostClassifier()}

tree_classifiers = {name: pipeline.make_pipeline(tree_prepro, model) for name, model in tree_classifiers.items()}

tree_classifiers['LightGBM']

In [65]:
for pipe in tree_classifiers.values():
    assert type(pipe) is pipeline.Pipeline

# Exercise 5 (3pts):
Define a simple split validation strategy with:
- 80% for train
- 20% for validation
- With stratification
- random_state=0

And train all the models in a for loop

In [66]:
"""
x_train, x_val, y_train, y_val = model_selection.train_test_split(
    # CODE HERE
)
"""
# YOUR CODE HERE
x_train, x_val, y_train, y_val = model_selection.train_test_split(
    x, y,
    test_size = 0.2,
    stratify=y,
    random_state=909)

results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})


for model_name, model in tree_classifiers.items():
    start_time = time.time()
    
    # FOR EVERY PIPELINE (PREPRO + MODEL) -> TRAIN WITH TRAIN DATA (x_train)
    model.fit(x_train, y_train)
    # GET PREDICTIONS USING x_val
    pred = model.predict(x_val)

    total_time = time.time() - start_time

    results = results.append({"Model":    model_name,
                              "Accuracy": metrics.accuracy_score(y_val, pred)*100,
                              "Bal Acc.": metrics.balanced_accuracy_score(y_val, pred)*100,
                              "Time":     total_time},
                              ignore_index=True)
                              

results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')

Learning rate set to 0.008911
0:	learn: 0.6898757	total: 526us	remaining: 526ms
1:	learn: 0.6835937	total: 2.4ms	remaining: 1.2s
2:	learn: 0.6783447	total: 3.42ms	remaining: 1.14s
3:	learn: 0.6728893	total: 12.6ms	remaining: 3.13s
4:	learn: 0.6676779	total: 14ms	remaining: 2.78s
5:	learn: 0.6618515	total: 17.8ms	remaining: 2.95s
6:	learn: 0.6574433	total: 18.5ms	remaining: 2.63s
7:	learn: 0.6522946	total: 31.2ms	remaining: 3.87s
8:	learn: 0.6464726	total: 32.2ms	remaining: 3.54s
9:	learn: 0.6410550	total: 33.8ms	remaining: 3.34s
10:	learn: 0.6360904	total: 35.3ms	remaining: 3.17s
11:	learn: 0.6309657	total: 42.7ms	remaining: 3.52s
12:	learn: 0.6265644	total: 45.6ms	remaining: 3.46s
13:	learn: 0.6216783	total: 50.1ms	remaining: 3.52s
14:	learn: 0.6169639	total: 51.5ms	remaining: 3.38s
15:	learn: 0.6123690	total: 61.1ms	remaining: 3.75s
16:	learn: 0.6083484	total: 62ms	remaining: 3.58s
17:	learn: 0.6039543	total: 62.9ms	remaining: 3.43s
18:	learn: 0.5998178	total: 63.6ms	remaining: 3.29s

263:	learn: 0.3674850	total: 353ms	remaining: 985ms
264:	learn: 0.3673103	total: 355ms	remaining: 984ms
265:	learn: 0.3671306	total: 356ms	remaining: 982ms
266:	learn: 0.3669292	total: 357ms	remaining: 980ms
267:	learn: 0.3666534	total: 358ms	remaining: 978ms
268:	learn: 0.3664010	total: 359ms	remaining: 976ms
269:	learn: 0.3662714	total: 360ms	remaining: 974ms
270:	learn: 0.3660865	total: 361ms	remaining: 972ms
271:	learn: 0.3659159	total: 362ms	remaining: 970ms
272:	learn: 0.3654379	total: 363ms	remaining: 967ms
273:	learn: 0.3653135	total: 364ms	remaining: 965ms
274:	learn: 0.3651544	total: 365ms	remaining: 963ms
275:	learn: 0.3650336	total: 366ms	remaining: 961ms
276:	learn: 0.3647827	total: 367ms	remaining: 959ms
277:	learn: 0.3646322	total: 368ms	remaining: 957ms
278:	learn: 0.3643226	total: 370ms	remaining: 955ms
279:	learn: 0.3639520	total: 371ms	remaining: 953ms
280:	learn: 0.3639365	total: 371ms	remaining: 950ms
281:	learn: 0.3638923	total: 372ms	remaining: 947ms
282:	learn: 

599:	learn: 0.3178210	total: 719ms	remaining: 479ms
600:	learn: 0.3176357	total: 720ms	remaining: 478ms
601:	learn: 0.3175173	total: 720ms	remaining: 476ms
602:	learn: 0.3172662	total: 721ms	remaining: 475ms
603:	learn: 0.3171527	total: 723ms	remaining: 474ms
604:	learn: 0.3170647	total: 723ms	remaining: 472ms
605:	learn: 0.3169522	total: 724ms	remaining: 471ms
606:	learn: 0.3168784	total: 725ms	remaining: 469ms
607:	learn: 0.3167762	total: 726ms	remaining: 468ms
608:	learn: 0.3167618	total: 727ms	remaining: 467ms
609:	learn: 0.3166532	total: 727ms	remaining: 465ms
610:	learn: 0.3164216	total: 728ms	remaining: 464ms
611:	learn: 0.3161978	total: 729ms	remaining: 462ms
612:	learn: 0.3160123	total: 730ms	remaining: 461ms
613:	learn: 0.3159139	total: 731ms	remaining: 459ms
614:	learn: 0.3158685	total: 732ms	remaining: 458ms
615:	learn: 0.3155857	total: 733ms	remaining: 457ms
616:	learn: 0.3155594	total: 733ms	remaining: 455ms
617:	learn: 0.3153853	total: 734ms	remaining: 454ms
618:	learn: 

791:	learn: 0.2938984	total: 903ms	remaining: 237ms
792:	learn: 0.2936873	total: 904ms	remaining: 236ms
793:	learn: 0.2935122	total: 905ms	remaining: 235ms
794:	learn: 0.2931555	total: 906ms	remaining: 234ms
795:	learn: 0.2930053	total: 907ms	remaining: 232ms
796:	learn: 0.2926498	total: 908ms	remaining: 231ms
797:	learn: 0.2925299	total: 909ms	remaining: 230ms
798:	learn: 0.2924431	total: 910ms	remaining: 229ms
799:	learn: 0.2923062	total: 910ms	remaining: 228ms
800:	learn: 0.2921343	total: 911ms	remaining: 226ms
801:	learn: 0.2920942	total: 912ms	remaining: 225ms
802:	learn: 0.2920177	total: 913ms	remaining: 224ms
803:	learn: 0.2919341	total: 914ms	remaining: 223ms
804:	learn: 0.2917535	total: 915ms	remaining: 222ms
805:	learn: 0.2915858	total: 916ms	remaining: 221ms
806:	learn: 0.2915067	total: 917ms	remaining: 219ms
807:	learn: 0.2913923	total: 918ms	remaining: 218ms
808:	learn: 0.2912974	total: 919ms	remaining: 217ms
809:	learn: 0.2911407	total: 920ms	remaining: 216ms
810:	learn: 

983:	learn: 0.2689051	total: 1.09s	remaining: 17.7ms
984:	learn: 0.2688568	total: 1.09s	remaining: 16.6ms
985:	learn: 0.2686936	total: 1.09s	remaining: 15.5ms
986:	learn: 0.2685806	total: 1.09s	remaining: 14.4ms
987:	learn: 0.2685539	total: 1.09s	remaining: 13.2ms
988:	learn: 0.2684519	total: 1.09s	remaining: 12.1ms
989:	learn: 0.2684048	total: 1.09s	remaining: 11ms
990:	learn: 0.2682416	total: 1.09s	remaining: 9.93ms
991:	learn: 0.2681563	total: 1.09s	remaining: 8.83ms
992:	learn: 0.2681024	total: 1.09s	remaining: 7.72ms
993:	learn: 0.2678682	total: 1.1s	remaining: 6.62ms
994:	learn: 0.2677755	total: 1.1s	remaining: 5.51ms
995:	learn: 0.2676713	total: 1.1s	remaining: 4.41ms
996:	learn: 0.2675953	total: 1.1s	remaining: 3.31ms
997:	learn: 0.2675148	total: 1.1s	remaining: 2.2ms
998:	learn: 0.2674040	total: 1.1s	remaining: 1.1ms
999:	learn: 0.2672552	total: 1.1s	remaining: 0us


Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,Skl GBM,85.47486,83.860343,0.107353
2,CatBoost,85.47486,84.400527,1.307001
3,LightGBM,83.798883,82.496706,0.082435
4,Skl HistGBM,82.681564,81.047431,0.6984
5,XGBoost,82.681564,81.047431,0.087985
6,Random Forest,81.564246,80.678524,0.178827
7,AdaBoost,79.888268,79.314888,0.104515
8,Extra Trees,79.329609,78.050066,0.264173
9,Decision Tree,78.77095,77.59552,0.025262


In [67]:
assert results_ord["Accuracy"].min() > 75
assert results_ord["Bal Acc."].min() > 75
assert len(results_ord) == 9

# Exercise 6 (3pts):
Define a 10 Fold cross validation strategy with:
- With stratification
- shuffle=True
- random_state=0

And train all the models in a for loop.

Tip you can use **[cross_val_predict](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_predict.html)** for both training and predict with 

In [None]:
"""
skf = model_selection.StratifiedKFold(
    # CODE HERE

"""
# YOUR CODE HERE
skf = model_selection.StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})

print(x.shape)
print(y.shape)
for model_name, model in tree_classifiers.items():
    start_time = time.time()

    pred = model_selection.cross_val_predict(model, x, y,cv=skf)

    total_time = time.time() - start_time

    results = results.append({"Model":    model_name,
                              "Accuracy": metrics.accuracy_score(y, pred)*100,
                              "Bal Acc.": metrics.balanced_accuracy_score(y, pred)*100,
                              "Time":     total_time},
                              ignore_index=True)

                            
# YOUR CODE HERE


results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')

(891, 8)
(891,)
Learning rate set to 0.009371
0:	learn: 0.6897520	total: 20.2ms	remaining: 20.1s
1:	learn: 0.6859991	total: 20.7ms	remaining: 10.3s
2:	learn: 0.6793872	total: 26.2ms	remaining: 8.7s
3:	learn: 0.6732417	total: 39.7ms	remaining: 9.88s
4:	learn: 0.6684170	total: 40.2ms	remaining: 8s
5:	learn: 0.6620759	total: 41.6ms	remaining: 6.9s
6:	learn: 0.6558995	total: 59.4ms	remaining: 8.42s
7:	learn: 0.6513215	total: 62.4ms	remaining: 7.74s
8:	learn: 0.6454558	total: 63.4ms	remaining: 6.99s
9:	learn: 0.6398967	total: 67ms	remaining: 6.63s
10:	learn: 0.6346359	total: 69.1ms	remaining: 6.21s
11:	learn: 0.6291934	total: 70ms	remaining: 5.77s
12:	learn: 0.6238934	total: 71.1ms	remaining: 5.4s
13:	learn: 0.6195660	total: 73.9ms	remaining: 5.21s
14:	learn: 0.6157544	total: 75ms	remaining: 4.93s
15:	learn: 0.6133816	total: 75.7ms	remaining: 4.65s
16:	learn: 0.6085673	total: 77.2ms	remaining: 4.46s
17:	learn: 0.6043764	total: 79.5ms	remaining: 4.33s
18:	learn: 0.5994054	total: 82ms	remaini

257:	learn: 0.3688106	total: 370ms	remaining: 1.06s
258:	learn: 0.3685887	total: 371ms	remaining: 1.06s
259:	learn: 0.3683608	total: 372ms	remaining: 1.06s
260:	learn: 0.3681700	total: 373ms	remaining: 1.06s
261:	learn: 0.3680231	total: 374ms	remaining: 1.05s
262:	learn: 0.3679026	total: 375ms	remaining: 1.05s
263:	learn: 0.3676492	total: 377ms	remaining: 1.05s
264:	learn: 0.3675259	total: 378ms	remaining: 1.05s
265:	learn: 0.3672629	total: 379ms	remaining: 1.04s
266:	learn: 0.3670753	total: 380ms	remaining: 1.04s
267:	learn: 0.3668063	total: 381ms	remaining: 1.04s
268:	learn: 0.3666314	total: 382ms	remaining: 1.04s
269:	learn: 0.3662522	total: 383ms	remaining: 1.04s
270:	learn: 0.3660590	total: 384ms	remaining: 1.03s
271:	learn: 0.3659440	total: 385ms	remaining: 1.03s
272:	learn: 0.3657219	total: 386ms	remaining: 1.03s
273:	learn: 0.3655803	total: 387ms	remaining: 1.03s
274:	learn: 0.3653696	total: 388ms	remaining: 1.02s
275:	learn: 0.3651951	total: 389ms	remaining: 1.02s
276:	learn: 

436:	learn: 0.3400056	total: 554ms	remaining: 714ms
437:	learn: 0.3398090	total: 556ms	remaining: 714ms
438:	learn: 0.3396342	total: 557ms	remaining: 712ms
439:	learn: 0.3394645	total: 558ms	remaining: 711ms
440:	learn: 0.3391438	total: 559ms	remaining: 709ms
441:	learn: 0.3391208	total: 560ms	remaining: 707ms
442:	learn: 0.3389355	total: 561ms	remaining: 706ms
443:	learn: 0.3388872	total: 562ms	remaining: 704ms
444:	learn: 0.3388417	total: 563ms	remaining: 702ms
445:	learn: 0.3387952	total: 564ms	remaining: 700ms
446:	learn: 0.3385661	total: 565ms	remaining: 698ms
447:	learn: 0.3384439	total: 565ms	remaining: 697ms
448:	learn: 0.3383657	total: 566ms	remaining: 695ms
449:	learn: 0.3383464	total: 567ms	remaining: 693ms
450:	learn: 0.3382819	total: 568ms	remaining: 691ms
451:	learn: 0.3381533	total: 569ms	remaining: 689ms
452:	learn: 0.3380615	total: 570ms	remaining: 688ms
453:	learn: 0.3379435	total: 570ms	remaining: 686ms
454:	learn: 0.3378834	total: 571ms	remaining: 684ms
455:	learn: 

In [None]:
assert results_ord["Accuracy"].min() > 75
assert results_ord["Bal Acc."].min() > 75
assert len(results_ord) == 9

# Exercise 7.1
Train with all data the best model

In [None]:
best_model = tree_classifiers["Skl GBM"].fit(x,y)
# Fit best model with all data

# YOUR CODE HERE


# Exercise 7.2 (2pts)
With your best model, generate the predicitions for test data (x_test)

In [None]:
test_pred = best_model.predict(x_test)


In [None]:
assert len(test_pred) == 418
assert np.unique(test_pred).tolist() == [0,1]

# Exercise 7.3

Submit to kaggle.

- You can use the kaggle command line app. Check https://github.com/Kaggle/kaggle-api

In [None]:
sub = pd.DataFrame(test_pred, index=x_test.index, columns=["Survived"])
sub.head()

In [None]:
sub.to_csv("sub_with_cabin.csv")

In [None]:
!kaggle competitions submit -c titanic -f sub.csv -m "My submission message"