## Imports

In [None]:
# pip install --upgrade scikit-learn

In [113]:
import time
from IPython.display import clear_output
import numpy    as np
import pandas   as pd
import seaborn  as sb
import matplotlib.pyplot as plt
import sklearn  as skl

from sklearn import pipeline      # Pipeline
from sklearn import preprocessing # OrdinalEncoder, LabelEncoder
from sklearn import impute
from sklearn import compose
from sklearn import model_selection # train_test_split
from sklearn import metrics         # accuracy_score, balanced_accuracy_score, plot_confusion_matrix
from sklearn import set_config

set_config(display='diagram') # Useful for display the pipeline

print("Pandas  ", pd.__version__)
print("Sklearn ", skl.__version__) # Try to use 0.24

Pandas   1.2.4
Sklearn  0.24.1


## Get the dataset
- **CLOUD = True**: Download dataset from Kaggle. Necesary for cloud enviroments like COLAB. **Specify your [kaggle credentials](https://www.kaggle.com/docs/api)**.
- **CLOUD = False**: Get the dataset from your local machine. **Specify the data path**.

In [114]:
CLOUD = False

if CLOUD:
    import os
    os.environ['KAGGLE_USERNAME'] = "your_kaggle_username"
    os.environ['KAGGLE_KEY']      = "your_kaggle_api_key"  # See https://www.kaggle.com/docs/api
    !pip install --upgrade kaggle
    !kaggle competitions download -c titanic
    DATA_PATH = "./"

else:
    DATA_PATH = "../../Datasets/Tabular/titanic/"

## Load data

In [115]:
df      = pd.read_csv(DATA_PATH + "train.csv", index_col='PassengerId')
df_test = pd.read_csv(DATA_PATH + "test.csv",  index_col='PassengerId')

print("Train DataFrame:", df.shape)
print("Test DataFrame: ", df_test.shape)

Train DataFrame: (891, 11)
Test DataFrame:  (418, 10)


## Check missings

In [116]:
df.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [117]:
df_test.isnull().sum()

Pclass        0
Name          0
Sex           0
Age          86
SibSp         0
Parch         0
Ticket        0
Fare          1
Cabin       327
Embarked      0
dtype: int64

# Exercise 1 (2pts):
Extract the title (Mr, Mrs, ... ) from the "Name" column.

Tips:
- split(',')[1] to get the 2nd part, and remove the surnamename
- split('.')[0] to get the 1str part, and remove the name

In [118]:
# CODE HERE get_Title_from_Name funtion
# Create this function using lambda (not def)

get_Title_from_Name = lambda x : x.split(',')[1].split('.')[0].strip()

# YOUR CODE HERE

# df2 = df.assign(Title = lambda name: ((df['Name']).split(',').split('.')))
# df = df.assign(Title = lambda x: (x['Name'].str.split('.')))
# df = df.assign(Title = lambda x: (x['Name'].str.split('.')))
# df["Title"] = df['Name'].apply(lambda x : x.split(',')[1].split('.')[0].strip())
# df = df.assign(Title = lambda x: (x['Name'].str.split(',').str.split('.')))

# title_np = np.array(df['Name'].str.split(','))
# for i in title_np:
#     del i[0]
# print(title_np)
# my_np =np.array(df["Title"])
# my_list = list(my_np)
# my_list = my_list.split('.')
# my_list2 = []


# print(df)


# raise NotImplementedError()

df['Title']      = df['Name'].map(get_Title_from_Name)
df_test['Title'] = df_test['Name'].map(get_Title_from_Name)
df.head(5)
# df_test["Title"].value_counts()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr


In [119]:
assert df['Title'].values[0] == "Mr"
assert df['Title'].values[1] == "Mrs"
assert df['Title'].values[2] == "Miss"

assert df_test['Title'].values[0] == "Mr"
assert df_test['Title'].values[1] == "Mrs"
assert df_test['Title'].values[414] == "Dona"

# Exercise 2 (1pts):
Apply the title_dictionary to get a better information about the title. You have to overwrite the Title variable.

In [120]:
title_dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir" : "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess":"Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Miss" : "Miss",
    "Master" : "Master",
    "Lady" : "Royalty"
}

In [121]:
# Use map to apply the prevous dict

df["Title"] =  df['Title'].map(title_dictionary)
df_test["Title"] = df_test['Title'].map(title_dictionary)
# print(len(df["Title"]))
# print(len(df_test["Title"]))

df["Title"].isnull().sum()
print(df["Title"].values[886])
print(df_test["Title"].values[417])

# YOUR CODE HERE
# raise NotImplementedError()

Officer
Master


In [122]:
assert df['Title'].values[886] == "Officer"
assert df_test['Title'].values[417] == "Master"

# Exercise OPTINAL (0pts):
Try to extract some information from the feature **Ticket**. Search on Internet if that colum has some kind of information.

In [123]:
print(df.head(5))

             Survived  Pclass  \
PassengerId                     
1                   0       3   
2                   1       1   
3                   1       3   
4                   1       1   
5                   0       3   

                                                          Name     Sex   Age  \
PassengerId                                                                    
1                                      Braund, Mr. Owen Harris    male  22.0   
2            Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0   
3                                       Heikkinen, Miss. Laina  female  26.0   
4                 Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0   
5                                     Allen, Mr. William Henry    male  35.0   

             SibSp  Parch            Ticket     Fare Cabin Embarked Title  
PassengerId                                                                
1                1      0         A/5 21171   7.2500   

# Exercise OPTIONAL (0pts):
Try to extract some information from the feature **Cabin**. Search on Internet if that colum has some kind of information.

In [124]:
df_test['Cabin']

PassengerId
892      NaN
893      NaN
894      NaN
895      NaN
896      NaN
        ... 
1305     NaN
1306    C105
1307     NaN
1308     NaN
1309     NaN
Name: Cabin, Length: 418, dtype: object

# Preprocessing
For X data, notice that...
- We drop Survived because is the target variable
- We drop Name because we have extracted the Title: Mr, Mrs, ...
- We drop Ticket because it has no information -> see df.Ticket.nunique()
- We drop Cabin because it has a lot of missings (77% are missings)

Then, we identify **numerical** variables and **categorical** variables,

In [125]:
x = df.drop(columns=["Survived", 'Name', 'Ticket', 'Cabin']) # X DATA (WILL BE TRAIN+VALID DATA)
y = df["Survived"] # 0 = No, 1 = Yes

x_test = df_test.drop(columns=['Name', 'Ticket', 'Cabin']) # # X_TEST DATA (NEW DATA)

In [126]:
cat_vars  = ['Sex', 'Embarked', 'Title']         # x.select_dtypes(include=[object]).columns.values.tolist()
num_vars  = ['Pclass', 'SibSp', 'Parch', 'Fare', 'Age'] # x.select_dtypes(exclude=[object]).columns.values.tolist()

print("\nNumerical features:\n", num_vars)
print("\nCategorical features:\n", cat_vars)


Numerical features:
 ['Pclass', 'SibSp', 'Parch', 'Fare', 'Age']

Categorical features:
 ['Sex', 'Embarked', 'Title']


# Exercise 3 (2pts):
Create a **ColumnTransformer for Tree Models**. You need to create 2 pipelines (one for numerical and other for categories). Remember:
- Categorical pipeline: Some SimpleImputer -> Some Encoder
- Numerical pipeline: Some SimpleImputer -> NO Encoder

In [128]:
"""
num_preprocessing = pipeline.Pipeline(steps=[
  # Some SimpleImputer here
])

cat_preporcessing = pipeline.Pipeline(steps=[
  # Some SimpleImputer here
  # Some Encoder here. Remember to handle_unknown
])

tree_prepro = compose.ColumnTransformer(transformers=[
    ('num', num_preprocessing, num_vars),
    ('cat', cat_preporcessing, cat_vars),
], remainder='drop') # Drop other vars not specified in num_vars or cat_vars

tree_prepro
""";

# YOUR CODE HERE
num_4_treeModels = pipeline.Pipeline(steps=[('imputer', impute.SimpleImputer(strategy= 'mean', add_indicator=False)),])
#                                              ('scaler', preprocessing.StandardScaler())])

cat_4_treeModels = pipeline.Pipeline(steps= [('imputer', impute.SimpleImputer(strategy= 'constant', fill_value='missing')),
                                             ('ordinal', preprocessing.OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                                     unknown_value=np.nan))])

#                                              ('onehot', preprocessing.OneHotEncoder(handle_unknown='ignore'))])

tree_prepro = compose.ColumnTransformer(transformers=[('num', num_4_treeModels, num_vars),
                                                ('cat', cat_4_treeModels, cat_vars)], remainder='drop')


# print(type(tree_prepro))
# print(type(num_4_treeModels))
# print(type(cat_4_treeModels))
# print(len(num_4_treeModels))
# print(len(cat_4_treeModels))
# raise NotImplementedError()
tree_prepro

In [129]:
assert type(tree_prepro)      is compose._column_transformer.ColumnTransformer
assert type(num_4_treeModels) is pipeline.Pipeline
assert type(cat_4_treeModels) is pipeline.Pipeline
assert len(num_4_treeModels) == 1
assert len(cat_4_treeModels) == 2

# Exercise 4 (1pts):
1. Complete the diccionary with some Tree Models.
2. Then we put each model in a Pipeline where:
   - first is the prepocessing with the column Transformer
   - Then is the Tree model
3. Display the fullpipeline of the LGBMClassifier

In [40]:
from sklearn.tree          import DecisionTreeClassifier
from sklearn.ensemble      import RandomForestClassifier
from sklearn.ensemble      import ExtraTreesClassifier
from sklearn.ensemble      import AdaBoostClassifier
from sklearn.ensemble      import GradientBoostingClassifier
from sklearn.experimental  import enable_hist_gradient_boosting # Necesary for HistGradientBoostingClassifier
from sklearn.ensemble      import HistGradientBoostingClassifier
from xgboost               import XGBClassifier
from lightgbm              import LGBMClassifier
from catboost              import CatBoostClassifier

In [130]:
"""
tree_classifiers = {
  "Decision Tree":
  "Extra Trees": 
  "Random Forest":
  "AdaBoost":
  "Skl GBM":enable_hist_gradient_boosting(),
  "Skl HistGBM":
  "XGBoost":
  "LightGBM":
  "CatBoost":
tree_classifiers = {name: pipeline.make_pipeline(tree_prepro, model) for name, model in tree_classifiers.items()}
tree_classifiers["LightGBM"]
""";


# YOUR CODE HERE
# raise NotImplementedError()
tree_classifiers = {
  "Decision Tree": DecisionTreeClassifier(),
  "Extra Trees": ExtraTreesClassifier(),
  "Random Forest": RandomForestClassifier(),
  "AdaBoost":AdaBoostClassifier(),
  "Skl GBM":HistGradientBoostingClassifier(),#enable_hist_gradient_boosting(),
  "Skl HistGBM":HistGradientBoostingClassifier(),
  "XGBoost":XGBClassifier(),
  "LightGBM":LGBMClassifier(),
  "CatBoost":CatBoostClassifier()}

tree_classifiers = {name: pipeline.make_pipeline(tree_prepro, model) for name, model in tree_classifiers.items()}

tree_classifiers["LightGBM"]

In [131]:
for pipe in tree_classifiers.values():
    assert type(pipe) is pipeline.Pipeline

# Exercise 5 (3pts):
Define a simple split validation strategy with:
- 80% for train
- 20% for validation
- With stratification
- random_state=0

And train all the models in a for loop

In [146]:
"""
x_train, x_val, y_train, y_val = model_selection.train_test_split(
    # CODE HERE
)
"""
# YOUR CODE HERE
# raise NotImplementedError()

x_train, x_val, y_train, y_val = model_selection.train_test_split(x, y, test_size = 0.2, stratify = y, random_state = 4)




results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})

"""
for model_name, model in tree_classifiers.items():
    start_time = time.time()
    
    # FOR EVERY PIPELINE (PREPRO + MODEL) -> TRAIN WITH TRAIN DATA (x_train)
    
    # GET PREDICTIONS USING x_val
    pred = # CODE HERE

    total_time = time.time() - start_time

    results = results.append({"Model":    model_name,
                              "Accuracy": metrics.accuracy_score(y_val, pred)*100,
                              "Bal Acc.": metrics.balanced_accuracy_score(y_val, pred)*100,
                              "Time":     total_time},
                              ignore_index=True)
                              
                              
"""

# YOUR CODE HERE
# raise NotImplementedError()

for model_name, model in tree_classifiers.items():
    start_time = time.time()
    
#     full_model = pipeline.Pipeline([('preprocessor', tree_prepro),('model',model_name)])
    model.fit(x_train,y_train)
    # FOR EVERY PIPELINE (PREPRO + MODEL) -> TRAIN WITH TRAIN DATA (x_train)
    
    # GET PREDICTIONS USING x_val
#     pred = full_model.predict(x_val)# CODE HERE
    pred = model.predict(x_val)

    total_time = time.time() - start_time

    results = results.append({"Model":    model_name,
                              "Accuracy": metrics.accuracy_score(y_val, pred)*100,
                              "Bal Acc.": metrics.balanced_accuracy_score(y_val, pred)*100,
                              "Time":     total_time},
                              ignore_index=True)


results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')



Learning rate set to 0.008911
0:	learn: 0.6896184	total: 12.1ms	remaining: 12.1s
1:	learn: 0.6858162	total: 13ms	remaining: 6.48s
2:	learn: 0.6796524	total: 14.6ms	remaining: 4.87s
3:	learn: 0.6738140	total: 16.2ms	remaining: 4.04s
4:	learn: 0.6690768	total: 17.1ms	remaining: 3.4s
5:	learn: 0.6636089	total: 18.6ms	remaining: 3.09s
6:	learn: 0.6605114	total: 19.4ms	remaining: 2.75s
7:	learn: 0.6558070	total: 20.4ms	remaining: 2.53s
8:	learn: 0.6504937	total: 21.9ms	remaining: 2.42s
9:	learn: 0.6449112	total: 23.5ms	remaining: 2.33s
10:	learn: 0.6396662	total: 25.1ms	remaining: 2.26s
11:	learn: 0.6347641	total: 26.7ms	remaining: 2.2s
12:	learn: 0.6297268	total: 28.3ms	remaining: 2.15s
13:	learn: 0.6246363	total: 29.9ms	remaining: 2.1s
14:	learn: 0.6208196	total: 30.8ms	remaining: 2.02s
15:	learn: 0.6159417	total: 32.3ms	remaining: 1.99s
16:	learn: 0.6113204	total: 33.9ms	remaining: 1.96s
17:	learn: 0.6068860	total: 35.4ms	remaining: 1.93s
18:	learn: 0.6027494	total: 36.7ms	remaining: 1.8

221:	learn: 0.3819934	total: 340ms	remaining: 1.19s
222:	learn: 0.3817352	total: 342ms	remaining: 1.19s
223:	learn: 0.3814012	total: 344ms	remaining: 1.19s
224:	learn: 0.3810893	total: 346ms	remaining: 1.19s
225:	learn: 0.3810604	total: 346ms	remaining: 1.19s
226:	learn: 0.3808170	total: 348ms	remaining: 1.19s
227:	learn: 0.3805366	total: 350ms	remaining: 1.18s
228:	learn: 0.3804558	total: 351ms	remaining: 1.18s
229:	learn: 0.3802120	total: 353ms	remaining: 1.18s
230:	learn: 0.3798912	total: 354ms	remaining: 1.18s
231:	learn: 0.3795819	total: 356ms	remaining: 1.18s
232:	learn: 0.3792785	total: 358ms	remaining: 1.18s
233:	learn: 0.3788887	total: 360ms	remaining: 1.18s
234:	learn: 0.3785848	total: 361ms	remaining: 1.18s
235:	learn: 0.3784633	total: 363ms	remaining: 1.18s
236:	learn: 0.3782415	total: 365ms	remaining: 1.17s
237:	learn: 0.3781073	total: 366ms	remaining: 1.17s
238:	learn: 0.3779102	total: 368ms	remaining: 1.17s
239:	learn: 0.3776412	total: 369ms	remaining: 1.17s
240:	learn: 

443:	learn: 0.3420370	total: 683ms	remaining: 856ms
444:	learn: 0.3419168	total: 685ms	remaining: 855ms
445:	learn: 0.3418418	total: 687ms	remaining: 853ms
446:	learn: 0.3416021	total: 689ms	remaining: 852ms
447:	learn: 0.3415051	total: 690ms	remaining: 850ms
448:	learn: 0.3412018	total: 692ms	remaining: 849ms
449:	learn: 0.3410943	total: 693ms	remaining: 848ms
450:	learn: 0.3410623	total: 695ms	remaining: 846ms
451:	learn: 0.3409974	total: 696ms	remaining: 844ms
452:	learn: 0.3408662	total: 698ms	remaining: 843ms
453:	learn: 0.3408529	total: 699ms	remaining: 841ms
454:	learn: 0.3407462	total: 701ms	remaining: 839ms
455:	learn: 0.3405822	total: 703ms	remaining: 838ms
456:	learn: 0.3404276	total: 704ms	remaining: 837ms
457:	learn: 0.3403339	total: 706ms	remaining: 836ms
458:	learn: 0.3402230	total: 708ms	remaining: 835ms
459:	learn: 0.3400339	total: 710ms	remaining: 833ms
460:	learn: 0.3399164	total: 712ms	remaining: 832ms
461:	learn: 0.3398431	total: 713ms	remaining: 831ms
462:	learn: 

645:	learn: 0.3178999	total: 1.01s	remaining: 552ms
646:	learn: 0.3178362	total: 1.01s	remaining: 551ms
647:	learn: 0.3177284	total: 1.01s	remaining: 550ms
648:	learn: 0.3177189	total: 1.01s	remaining: 548ms
649:	learn: 0.3175114	total: 1.01s	remaining: 546ms
650:	learn: 0.3173113	total: 1.01s	remaining: 545ms
651:	learn: 0.3172504	total: 1.02s	remaining: 543ms
652:	learn: 0.3170954	total: 1.02s	remaining: 542ms
653:	learn: 0.3169713	total: 1.02s	remaining: 540ms
654:	learn: 0.3168435	total: 1.02s	remaining: 538ms
655:	learn: 0.3167559	total: 1.02s	remaining: 537ms
656:	learn: 0.3166810	total: 1.02s	remaining: 536ms
657:	learn: 0.3166323	total: 1.03s	remaining: 534ms
658:	learn: 0.3164346	total: 1.03s	remaining: 533ms
659:	learn: 0.3163873	total: 1.03s	remaining: 531ms
660:	learn: 0.3163734	total: 1.03s	remaining: 529ms
661:	learn: 0.3163276	total: 1.03s	remaining: 528ms
662:	learn: 0.3163077	total: 1.03s	remaining: 526ms
663:	learn: 0.3160470	total: 1.04s	remaining: 525ms
664:	learn: 

851:	learn: 0.2934538	total: 1.35s	remaining: 234ms
852:	learn: 0.2933287	total: 1.35s	remaining: 232ms
853:	learn: 0.2932713	total: 1.35s	remaining: 231ms
854:	learn: 0.2931652	total: 1.35s	remaining: 229ms
855:	learn: 0.2930010	total: 1.35s	remaining: 228ms
856:	learn: 0.2928545	total: 1.35s	remaining: 226ms
857:	learn: 0.2927292	total: 1.36s	remaining: 224ms
858:	learn: 0.2926508	total: 1.36s	remaining: 223ms
859:	learn: 0.2925639	total: 1.36s	remaining: 221ms
860:	learn: 0.2925340	total: 1.36s	remaining: 220ms
861:	learn: 0.2923904	total: 1.36s	remaining: 218ms
862:	learn: 0.2922307	total: 1.36s	remaining: 217ms
863:	learn: 0.2922058	total: 1.37s	remaining: 215ms
864:	learn: 0.2921546	total: 1.37s	remaining: 213ms
865:	learn: 0.2919705	total: 1.37s	remaining: 212ms
866:	learn: 0.2918929	total: 1.37s	remaining: 210ms
867:	learn: 0.2918501	total: 1.37s	remaining: 209ms
868:	learn: 0.2918009	total: 1.38s	remaining: 207ms
869:	learn: 0.2915742	total: 1.38s	remaining: 206ms
870:	learn: 

Unnamed: 0,Model,Accuracy,Bal Acc.,Time
1,CatBoost,87.150838,85.494071,2.051638
2,AdaBoost,84.916201,83.945982,0.094747
3,Random Forest,83.798883,81.68643,0.178522
4,XGBoost,83.798883,82.766798,0.146609
5,LightGBM,83.798883,82.766798,0.09674
6,Skl GBM,82.681564,81.317523,0.664222
7,Skl HistGBM,82.681564,81.317523,0.664224
8,Extra Trees,80.446927,79.229249,0.153585
9,Decision Tree,75.977654,75.052701,0.021945


In [147]:
assert results_ord["Accuracy"].min() > 75
assert results_ord["Bal Acc."].min() > 75
assert len(results_ord) == 9

# Exercise 6 (3pts):
Define a 10 Fold cross validation strategy with:
- With stratification
- shuffle=True
- random_state=0

And train all the models in a for loop.

Tip you can use **[cross_val_predict](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_predict.html)** for both training and predict with 

In [149]:
"""
skf = model_selection.StratifiedKFold(
    # CODE HERE
)
"""
# YOUR CODE HERE
# raise NotImplementedError()
from sklearn.model_selection import cross_val_predict

skf = model_selection.StratifiedKFold(n_splits=10, random_state= 0, shuffle= True)

results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Bal Acc.': [], 'Time': []})

"""
for model_name, model in tree_classifiers.items():
    start_time = time.time()
        
    # TRAIN AND GET PREDICTIONS USING cross_val_predict() and x,y
    pred = # CODE HERE

    total_time = time.time() - start_time

    results = results.append({"Model":    model_name,
                              "Accuracy": metrics.accuracy_score(y_val, pred)*100,
                              "Bal Acc.": metrics.balanced_accuracy_score(y_val, pred)*100,
                              "Time":     total_time},
                              ignore_index=True)
                              
                              
"""

# YOUR CODE HERE
# raise NotImplementedError()


for model_name, model in tree_classifiers.items():
    start_time = time.time()
        
    # TRAIN AND GET PREDICTIONS USING cross_val_predict() and x,y
    pred = cross_val_predict(model, x, y, cv=skf)

    total_time = time.time() - start_time

    results = results.append({"Model":    model_name,
                              "Accuracy": metrics.accuracy_score(y_val, pred)*100,
                              "Bal Acc.": metrics.balanced_accuracy_score(y_val, pred)*100,
                              "Time":     total_time},
                              ignore_index=True)


results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['Accuracy', 'Bal Acc.'], vmin=0, vmax=100, color='#5fba7d')

ValueError: Found input variables with inconsistent numbers of samples: [179, 891]

In [150]:
assert results_ord["Accuracy"].min() > 75
assert results_ord["Bal Acc."].min() > 75
assert len(results_ord) == 9

# Exercise 7.1
Train with all data the best model

In [151]:
# best_model = tree_classifiers["SELECT MY BEST MODEL HERE"]

# Fit best model with all data

# YOUR CODE HERE
best_model = tree_classifiers["CatBoost"]

best_model.fit(x,y)

# raise NotImplementedError()

Learning rate set to 0.009807
0:	learn: 0.6861635	total: 6.99ms	remaining: 6.98s
1:	learn: 0.6793399	total: 14.7ms	remaining: 7.34s
2:	learn: 0.6730222	total: 19.5ms	remaining: 6.49s
3:	learn: 0.6678806	total: 21.3ms	remaining: 5.3s
4:	learn: 0.6612309	total: 24.4ms	remaining: 4.85s
5:	learn: 0.6553507	total: 28.2ms	remaining: 4.67s
6:	learn: 0.6497975	total: 31.3ms	remaining: 4.44s
7:	learn: 0.6436043	total: 34.6ms	remaining: 4.29s
8:	learn: 0.6377382	total: 37.8ms	remaining: 4.17s
9:	learn: 0.6323072	total: 41ms	remaining: 4.06s
10:	learn: 0.6263620	total: 44.4ms	remaining: 3.99s
11:	learn: 0.6206358	total: 46.3ms	remaining: 3.81s
12:	learn: 0.6165163	total: 47.3ms	remaining: 3.59s
13:	learn: 0.6113051	total: 49ms	remaining: 3.45s
14:	learn: 0.6062257	total: 50.7ms	remaining: 3.33s
15:	learn: 0.6011274	total: 52.5ms	remaining: 3.23s
16:	learn: 0.5966276	total: 53.8ms	remaining: 3.11s
17:	learn: 0.5924592	total: 55.6ms	remaining: 3.03s
18:	learn: 0.5890787	total: 57ms	remaining: 2.94s

183:	learn: 0.3833593	total: 331ms	remaining: 1.47s
184:	learn: 0.3830832	total: 334ms	remaining: 1.47s
185:	learn: 0.3827427	total: 336ms	remaining: 1.47s
186:	learn: 0.3824440	total: 338ms	remaining: 1.47s
187:	learn: 0.3823410	total: 340ms	remaining: 1.47s
188:	learn: 0.3819090	total: 342ms	remaining: 1.47s
189:	learn: 0.3815127	total: 343ms	remaining: 1.46s
190:	learn: 0.3811647	total: 345ms	remaining: 1.46s
191:	learn: 0.3807247	total: 348ms	remaining: 1.46s
192:	learn: 0.3804764	total: 350ms	remaining: 1.46s
193:	learn: 0.3801553	total: 351ms	remaining: 1.46s
194:	learn: 0.3801075	total: 352ms	remaining: 1.45s
195:	learn: 0.3798631	total: 354ms	remaining: 1.45s
196:	learn: 0.3794612	total: 356ms	remaining: 1.45s
197:	learn: 0.3792859	total: 358ms	remaining: 1.45s
198:	learn: 0.3790455	total: 359ms	remaining: 1.45s
199:	learn: 0.3789514	total: 361ms	remaining: 1.44s
200:	learn: 0.3784270	total: 363ms	remaining: 1.44s
201:	learn: 0.3781444	total: 364ms	remaining: 1.44s
202:	learn: 

381:	learn: 0.3448818	total: 672ms	remaining: 1.09s
382:	learn: 0.3446869	total: 674ms	remaining: 1.08s
383:	learn: 0.3446662	total: 675ms	remaining: 1.08s
384:	learn: 0.3445672	total: 677ms	remaining: 1.08s
385:	learn: 0.3444063	total: 679ms	remaining: 1.08s
386:	learn: 0.3442557	total: 681ms	remaining: 1.08s
387:	learn: 0.3441298	total: 683ms	remaining: 1.08s
388:	learn: 0.3439009	total: 685ms	remaining: 1.07s
389:	learn: 0.3436702	total: 687ms	remaining: 1.07s
390:	learn: 0.3435108	total: 689ms	remaining: 1.07s
391:	learn: 0.3434226	total: 691ms	remaining: 1.07s
392:	learn: 0.3433509	total: 692ms	remaining: 1.07s
393:	learn: 0.3433422	total: 693ms	remaining: 1.07s
394:	learn: 0.3432666	total: 695ms	remaining: 1.06s
395:	learn: 0.3430559	total: 697ms	remaining: 1.06s
396:	learn: 0.3429139	total: 699ms	remaining: 1.06s
397:	learn: 0.3428605	total: 700ms	remaining: 1.06s
398:	learn: 0.3427778	total: 702ms	remaining: 1.06s
399:	learn: 0.3426863	total: 704ms	remaining: 1.05s
400:	learn: 

573:	learn: 0.3224305	total: 1.01s	remaining: 752ms
574:	learn: 0.3223546	total: 1.02s	remaining: 751ms
575:	learn: 0.3223051	total: 1.02s	remaining: 749ms
576:	learn: 0.3221910	total: 1.02s	remaining: 747ms
577:	learn: 0.3220585	total: 1.02s	remaining: 746ms
578:	learn: 0.3219311	total: 1.02s	remaining: 744ms
579:	learn: 0.3217893	total: 1.02s	remaining: 742ms
580:	learn: 0.3216851	total: 1.03s	remaining: 741ms
581:	learn: 0.3216791	total: 1.03s	remaining: 738ms
582:	learn: 0.3215829	total: 1.03s	remaining: 737ms
583:	learn: 0.3214960	total: 1.03s	remaining: 735ms
584:	learn: 0.3214618	total: 1.03s	remaining: 734ms
585:	learn: 0.3213388	total: 1.03s	remaining: 732ms
586:	learn: 0.3212608	total: 1.04s	remaining: 730ms
587:	learn: 0.3211811	total: 1.04s	remaining: 729ms
588:	learn: 0.3209971	total: 1.04s	remaining: 727ms
589:	learn: 0.3207974	total: 1.04s	remaining: 726ms
590:	learn: 0.3206881	total: 1.04s	remaining: 724ms
591:	learn: 0.3205896	total: 1.05s	remaining: 722ms
592:	learn: 

757:	learn: 0.3026879	total: 1.35s	remaining: 432ms
758:	learn: 0.3026735	total: 1.35s	remaining: 430ms
759:	learn: 0.3025414	total: 1.36s	remaining: 429ms
760:	learn: 0.3024945	total: 1.36s	remaining: 427ms
761:	learn: 0.3024214	total: 1.36s	remaining: 425ms
762:	learn: 0.3022518	total: 1.36s	remaining: 423ms
763:	learn: 0.3021781	total: 1.36s	remaining: 422ms
764:	learn: 0.3020632	total: 1.37s	remaining: 420ms
765:	learn: 0.3019295	total: 1.37s	remaining: 418ms
766:	learn: 0.3016740	total: 1.37s	remaining: 416ms
767:	learn: 0.3016036	total: 1.37s	remaining: 415ms
768:	learn: 0.3013939	total: 1.37s	remaining: 413ms
769:	learn: 0.3012950	total: 1.38s	remaining: 411ms
770:	learn: 0.3011996	total: 1.38s	remaining: 409ms
771:	learn: 0.3011153	total: 1.38s	remaining: 408ms
772:	learn: 0.3010619	total: 1.38s	remaining: 406ms
773:	learn: 0.3009844	total: 1.38s	remaining: 404ms
774:	learn: 0.3008537	total: 1.39s	remaining: 402ms
775:	learn: 0.3005240	total: 1.39s	remaining: 400ms
776:	learn: 

949:	learn: 0.2838254	total: 1.7s	remaining: 89.2ms
950:	learn: 0.2837231	total: 1.7s	remaining: 87.5ms
951:	learn: 0.2836370	total: 1.7s	remaining: 85.7ms
952:	learn: 0.2835413	total: 1.7s	remaining: 83.9ms
953:	learn: 0.2835048	total: 1.7s	remaining: 82.1ms
954:	learn: 0.2833916	total: 1.7s	remaining: 80.3ms
955:	learn: 0.2831346	total: 1.71s	remaining: 78.6ms
956:	learn: 0.2830740	total: 1.71s	remaining: 76.8ms
957:	learn: 0.2830424	total: 1.71s	remaining: 75ms
958:	learn: 0.2828219	total: 1.71s	remaining: 73.2ms
959:	learn: 0.2827171	total: 1.71s	remaining: 71.4ms
960:	learn: 0.2826265	total: 1.72s	remaining: 69.7ms
961:	learn: 0.2825759	total: 1.72s	remaining: 67.9ms
962:	learn: 0.2824679	total: 1.72s	remaining: 66.1ms
963:	learn: 0.2823053	total: 1.72s	remaining: 64.3ms
964:	learn: 0.2821152	total: 1.72s	remaining: 62.5ms
965:	learn: 0.2820514	total: 1.73s	remaining: 60.7ms
966:	learn: 0.2818388	total: 1.73s	remaining: 59ms
967:	learn: 0.2816309	total: 1.73s	remaining: 57.2ms
968

# Exercise 7.2 (2pts)
With your best model, generate the predicitions for test data (x_test)

In [154]:
# test_pred = # Get the predictions for x_test

# YOUR CODE HERE

test_pred = best_model.predict(x_test)
# raise NotImplementedError()

In [155]:
assert len(test_pred) == 418
assert np.unique(test_pred).tolist() == [0,1]

# Exercise 7.3

Submit to kaggle.

- You can use the kaggle command line app. Check https://github.com/Kaggle/kaggle-api

In [156]:
sub = pd.DataFrame(test_pred, index=x_test.index, columns=["Survived"])
sub.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,1


In [157]:
sub.to_csv("sub.csv")

In [158]:
!kaggle competitions submit -c titanic -f sub.csv -m "My submission message"

'kaggle' is not recognized as an internal or external command,
operable program or batch file.
