<h1><font size="6" color="green"><b> LightAutoML in Titanic: </b></font></h1>

<h2><font size="4" color="blue"><b> 80% accuracy in several lines of code </b></font></h2>

Tutorial for improve skills: 'LightAutoML vs Titanic: 80% accuracy in several lines of code' (from Alexander Ryzhkov) by Marcus Mariano

**For more information about Marcus Mariano: [Web site](https://marcusmariano.github.io/mmariano/)**  

**LightAutoML vs Titanic: 80% accuracy in several lines of code: [Alexander Ryzhkov](https://towardsdatascience.com/lightautoml-preset-usage-tutorial-2cce7da6f936)**


**LightAutoML - automatic model creation framework: [LightAutoML](https://github.com/sberbank-ai-lab/LightAutoML)**



# Import Packages

In [1]:

import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

from matplotlib import pyplot as plt
import seaborn as sns

sns.set(style="darkgrid", color_codes=True)
%matplotlib inline

# Standard python libraries
import os
import time
import re

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# Imports from LightAutoML package
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task

# Datasets load

In [2]:
%%time

data = pd.read_csv('data/train.csv')
print(data.shape)
data.head()

(891, 12)
Wall time: 17 ms


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
test = pd.read_csv('data/test.csv')
print(test.shape)
test.head()

(418, 11)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
!ls ./data/

gender_submission.csv
test.csv
train.csv


In [5]:
submission = pd.read_csv('data/gender_submission.csv')
print(submission.shape)
submission.head()

(418, 2)


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [8]:
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [9]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

# Additional expert features creation block

In [6]:
train_data = data.copy()
test_data = test.copy()

def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

def create_extra_features(data):
    data['Ticket_type'] = data['Ticket'].map(lambda x: x[0:3])
    data['Name_Words_Count'] = data['Name'].map(lambda x: len(x.split()))
    data['Has_Cabin'] = data["Cabin"].map(lambda x: 1 - int(type(x) == float))
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
    
    data['CategoricalFare'] = pd.qcut(data['Fare'], 5).astype(str)
    data['CategoricalAge'] = pd.cut(data['Age'], 5).astype(str)
    
    data['Title'] = data['Name'].apply(get_title).replace(['Lady', 'Countess','Capt', 
                                                           'Col','Don', 'Dr', 'Major', 
                                                           'Rev', 'Sir', 'Jonkheer', 
                                                           'Dona'], 'Rare')
    data['Title'] = data['Title'].replace('Mlle', 'Miss')
    data['Title'] = data['Title'].replace('Ms', 'Miss')
    data['Title'] = data['Title'].replace('Mme', 'Mrs')
    data['Title'] = data['Title'].map({"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, 
                                       "Rare": 5}).fillna(0)
    
    return data



train_data = create_extra_features(train_data)
test_data = create_extra_features(test_data)

print(train_data.shape, test_data.shape)
train_data.head()

(891, 19) (418, 18)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Ticket_type,Name_Words_Count,Has_Cabin,FamilySize,CategoricalFare,CategoricalAge,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,A/5,4,0,2,"(-0.001, 7.854]","(16.336, 32.252]",1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,PC,7,1,2,"(39.688, 512.329]","(32.252, 48.168]",3
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,STO,3,0,1,"(7.854, 10.5]","(16.336, 32.252]",2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,113,7,1,2,"(39.688, 512.329]","(32.252, 48.168]",3
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,373,4,0,1,"(7.854, 10.5]","(32.252, 48.168]",1


In [7]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
train_data.isnull().sum()

PassengerId           0
Survived              0
Pclass                0
Name                  0
Sex                   0
Age                 177
SibSp                 0
Parch                 0
Ticket                0
Fare                  0
Cabin               687
Embarked              2
Ticket_type           0
Name_Words_Count      0
Has_Cabin             0
FamilySize            0
CategoricalFare       0
CategoricalAge        0
Title                 0
dtype: int64

# Division of training, validation and testing databases

In [13]:
%%time

# Database division into training and testing.
from sklearn.model_selection import train_test_split

SEED = 0

# Separation of data into a training set and a test set
df_train, df_valid = train_test_split(train_data, 
                                      test_size = 0.2, 
                                      stratify = data["Survived"], 
                                      random_state = SEED)

print(f"Parts sizes: df_train = {df_train.shape}, df_valid = {df_valid.shape}")

Parts sizes: df_train = (712, 19), df_valid = (179, 19)
Wall time: 27 ms


In [17]:
# logging.info("Parts sizes: tr_data = {}, valid_data = {}".format(tr_data.shape, 
#                                                                  valid_data.shape))

# LightAutoML preset usage 

## Create Task object

    ‘binary’ for binary classification
    ‘reg’ for regression and
    ‘multiclass’ for multiclass classification

In [18]:
def f1_metric(y_true, y_pred):
    return f1_score(y_true, (y_pred > 0.5).astype(int))

task = Task('binary', metric = f1_metric)

## Setup columns roles

In [20]:

roles = {'target': 'Survived',
         'drop': ['PassengerId', 'Name', 'Ticket']}

## Create AutoML model from preset

In [5]:
# !cat /proc/cpuinfo

In [3]:
!lscpu

'lscpu' nÆo ‚ reconhecido como um comando interno
ou externo, um programa oper vel ou um arquivo em lotes.


In [26]:
automl = TabularAutoML(task = task, 
                       timeout = 250, # 600 seconds = 10 minutes
                       memory_limit = 6, # Optimal run for your pc
                       cpu_limit = 4, # Optimal CPU for your pc
                       general_params = {'use_algos': [['linear_l2', 
                                         'lgb', 'lgb_tuned']]})

__Out-of-Fold (OOF for short) predictions__

In [27]:
oof_pred = automl.fit_predict(df_train, roles = roles)

Start automl preset with listed constraints:
- time: 600 seconds
- cpus: 4 cores
- memory: 4 gb

Train data shape: (712, 19)
Feats was rejected during automatic roles guess: []


Layer 1 ...
Train process start. Time left 599.5820004940033 secs
Start fitting Lvl_0_Pipe_0_Mod_0_LinearL2 ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====

Linear model: C = 1e-05 score = 0.0
Linear model: C = 5e-05 score = 0.0
Linear model: C = 0.0001 score = 0.25396825396825395
Linear model: C = 0.0005 score = 0.5432098765432098
Linear model: C = 0.001 score = 0.6170212765957446
Linear model: C = 0.005 score = 0.7216494845360825
Linear model: C = 0.01 score = 0.7272727272727274
Linear model: C = 0.05 score = 0.7722772277227722
Linear model: C = 0.1 score = 0.7599999999999999
Linear model: C = 0.5 score = 0.764705882352941

===== Start working with fold 1 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====

Linear model: C = 1e-05 score = 0.0
Linear model: C = 5e-05 score = 0.0
Linear model: C =

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.466349	valid's Opt metric: 0.723404
[200]	valid's binary_logloss: 0.418249	valid's Opt metric: 0.75
[300]	valid's binary_logloss: 0.412369	valid's Opt metric: 0.762887
[400]	valid's binary_logloss: 0.413927	valid's Opt metric: 0.787879
[500]	valid's binary_logloss: 0.420235	valid's Opt metric: 0.78
Early stopping, best iteration is:
[370]	valid's binary_logloss: 0.410465	valid's Opt metric: 0.762887
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.461124	valid's Opt metric: 0.709677
[200]	valid's binary_logloss: 0.419505	valid's Opt metric: 0.736842
[300]	valid's binary_logloss: 0.413779	valid's Opt metric: 0.762887
[400]	valid's binary_logloss: 0.417196	valid's Opt metric:

Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.466986	valid's Opt metric: 0.723404
[200]	valid's binary_logloss: 0.41939	valid's Opt metric: 0.75
[300]	valid's binary_logloss: 0.411011	valid's Opt metric: 0.75
[400]	valid's binary_logloss: 0.414926	valid's Opt metric: 0.77551
[500]	valid's binary_logloss: 0.420762	valid's Opt metric: 0.76
Early stopping, best iteration is:
[312]	valid's binary_logloss: 0.409659	valid's Opt metric: 0.762887
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.464961	valid's Opt metric: 0.709677
[200]	valid's binary_logloss: 0.420391	valid's Opt metric: 0.736842
[300]	valid's 

[100]	valid's binary_logloss: 0.464949	valid's Opt metric: 0.709677
[200]	valid's binary_logloss: 0.421811	valid's Opt metric: 0.75
[300]	valid's binary_logloss: 0.411422	valid's Opt metric: 0.762887
[400]	valid's binary_logloss: 0.412849	valid's Opt metric: 0.78
[500]	valid's binary_logloss: 0.418257	valid's Opt metric: 0.78
Early stopping, best iteration is:
[378]	valid's binary_logloss: 0.409507	valid's Opt metric: 0.787879
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.465737	valid's Opt metric: 0.709677
[200]	valid's binary_logloss: 0.423134	valid's Opt metric: 0.723404
[300]	valid's binary_logloss: 0.411611	valid's Opt metric: 0.762887
[400]	valid's binary_logloss: 0.413551	valid's Opt metric: 0.78
[500]	valid's binary_logloss: 0.41985	valid's Opt metric: 0.

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.466972	valid's Opt metric: 0.695652
[200]	valid's binary_logloss: 0.421167	valid's Opt metric: 0.736842
[300]	valid's binary_logloss: 0.414904	valid's Opt metric: 0.762887
[400]	valid's binary_logloss: 0.417592	valid's Opt metric: 0.78
Early stopping, best iteration is:
[294]	valid's binary_logloss: 0.413944	valid's Opt metric: 0.762887
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.464769	valid's Opt metric: 0.709677
[200]	valid's binary_logloss: 0.417998	valid's Opt metric: 0.762887
[300]	valid's binary_logloss: 0.407842	valid's Opt metric: 0.77551
[400]	valid's binary_logloss: 0.411921	valid's Opt metric: 0.77551
Early stopping, best iteration is:
[252]	valid's binary_

Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.469256	valid's Opt metric: 0.688889
[200]	valid's binary_logloss: 0.421481	valid's Opt metric: 0.75
[300]	valid's binary_logloss: 0.413313	valid's Opt metric: 0.762887
[400]	valid's binary_logloss: 0.41615	valid's Opt metric: 0.762887
[500]	valid's binary_logloss: 0.420325	valid's Opt metric: 0.78
Early stopping, best iteration is:
[309]	valid's binary_logloss: 0.41217	valid's Opt metric: 0.762887
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.466651	valid's Opt metric: 0.695652
[200]	valid's binary_logloss: 0.423998	valid's Opt metric: 0.723404
[300]	vali

[400]	valid's binary_logloss: 0.428508	valid's Opt metric: 0.731183
Early stopping, best iteration is:
[216]	valid's binary_logloss: 0.450127	valid's Opt metric: 0.73913
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.465083	valid's Opt metric: 0.709677
[200]	valid's binary_logloss: 0.423267	valid's Opt metric: 0.723404
[300]	valid's binary_logloss: 0.414095	valid's Opt metric: 0.762887
[400]	valid's binary_logloss: 0.415229	valid's Opt metric: 0.78
[500]	valid's binary_logloss: 0.422441	valid's Opt metric: 0.78
Early stopping, best iteration is:
[348]	valid's binary_logloss: 0.414066	valid's Opt metric: 0.787879
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe

Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.465413	valid's Opt metric: 0.709677
[200]	valid's binary_logloss: 0.41899	valid's Opt metric: 0.723404
[300]	valid's binary_logloss: 0.408071	valid's Opt metric: 0.762887
[400]	valid's binary_logloss: 0.409592	valid's Opt metric: 0.77551
[500]	valid's binary_logloss: 0.414776	valid's Opt metric: 0.787879
Early stopping, best iteration is:
[303]	valid's binary_logloss: 0.407487	valid's Opt metric: 0.762887
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.464019	valid's Opt metric: 0.709677
[200]	valid's binary_logloss: 0.420011	valid's Opt metric: 0.736842
[3

Training until validation scores don't improve for 100 rounds
[100]	valid's binary_logloss: 0.480074	valid's Opt metric: 0.730769
Early stopping, best iteration is:
[50]	valid's binary_logloss: 0.444981	valid's Opt metric: 0.737864

===== Start working with fold 4 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's binary_logloss: 0.337548	valid's Opt metric: 0.834783
Early stopping, best iteration is:
[38]	valid's binary_logloss: 0.358285	valid's Opt metric: 0.867925
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Time left 377.79156517982483
Blending: Optimization starts with equal weights and score 0.7683397683397682
Blending, iter 0: score = 0.7840909090909092, weights = [0.15324706 0.         0.84675294]
Blending, iter 1: score = 0.7840909090909092, weights = [0.15324706 0.         0.84675294]
No score update. Terminated

Automl preset training completed in 222.34 seconds.


In [28]:
222/60

3.7

## Predict to validation data and check scores

In [30]:
valid_pred = df_valid.copy()

valid_pred = automl.predict(df_valid)

In [34]:
def acc_score(y_true, y_pred):
    return accuracy_score(y_true, (y_pred > 0.5).astype(int))

print(f"OOF acc: {acc_score(df_train['Survived'].values, oof_pred.data[:, 0])}")
print(f"VAL acc: {acc_score(df_valid['Survived'].values, valid_pred.data[:, 0])}")

OOF acc: 0.8398876404494382
VAL acc: 0.7988826815642458


## Create LightAutoML model with time utilization

In [44]:
tautoml = TabularUtilizedAutoML(task = task, 
                                timeout = 250, # 600 seconds = 10 minutes
                                memory_limit = 6, # Optimal run for your pc
                                cpu_limit = 4, # Optimal CPU for your pc
                                general_params = {'use_algos': [['linear_l2', 
                                                  'lgb', 'lgb_tuned']]})

oof_pred = tautoml.fit_predict(df_train, roles = roles)

Current random state: {'reader_params': {'random_state': 42}, 'general_params': {'return_all_predictions': False}}
Found general_params in kwargs, need to combine
Merged variant for general_params = {'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']], 'return_all_predictions': False}
Start automl preset with listed constraints:
- time: 249.99300384521484 seconds
- cpus: 4 cores
- memory: 6 gb

Train data shape: (712, 19)
Feats was rejected during automatic roles guess: []


Layer 1 ...
Train process start. Time left 234.49400234222412 secs
Start fitting Lvl_0_Pipe_0_Mod_0_LinearL2 ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====

Linear model: C = 1e-05 score = 0.0
Linear model: C = 5e-05 score = 0.0
Linear model: C = 0.0001 score = 0.25396825396825395
Linear model: C = 0.0005 score = 0.5432098765432098
Linear model: C = 0.001 score = 0.6170212765957446
Linear model: C = 0.005 score = 0.7216494845360825
Linear model: C = 0.01 score = 0.7272727272727274
Linear mo

Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.461124	valid's Opt metric: 0.709677
[200]	valid's binary_logloss: 0.419505	valid's Opt metric: 0.736842
[300]	valid's binary_logloss: 0.413779	valid's Opt metric: 0.762887
[400]	valid's binary_logloss: 0.417196	valid's Opt metric: 0.77551
[500]	valid's binary_logloss: 0.430463	valid's Opt metric: 0.78
Early stopping, best iteration is:
[330]	valid's binary_logloss: 0.411555	valid's Opt metric: 0.767677
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.465599	valid's Opt metric: 0.73913
[200]	valid's binary_logloss: 0.420179	valid's Opt metric: 0.736842
[300]	

[500]	valid's binary_logloss: 0.420762	valid's Opt metric: 0.76
Early stopping, best iteration is:
[312]	valid's binary_logloss: 0.409659	valid's Opt metric: 0.762887
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.464961	valid's Opt metric: 0.709677
[200]	valid's binary_logloss: 0.420391	valid's Opt metric: 0.736842
[300]	valid's binary_logloss: 0.41216	valid's Opt metric: 0.762887
[400]	valid's binary_logloss: 0.412525	valid's Opt metric: 0.792079
[500]	valid's binary_logloss: 0.418992	valid's Opt metric: 0.8
[600]	valid's binary_logloss: 0.429463	valid's Opt metric: 0.787879
Early stopping, best iteration is:
[413]	valid's binary_logloss: 0.411273	valid's Opt metric: 0.792079
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1

Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.465737	valid's Opt metric: 0.709677
[200]	valid's binary_logloss: 0.423134	valid's Opt metric: 0.723404
[300]	valid's binary_logloss: 0.411611	valid's Opt metric: 0.762887
[400]	valid's binary_logloss: 0.413551	valid's Opt metric: 0.78
[500]	valid's binary_logloss: 0.41985	valid's Opt metric: 0.787879
Early stopping, best iteration is:
[371]	valid's binary_logloss: 0.409777	valid's Opt metric: 0.787879
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.466963	valid's Opt metric: 0.695652
[200]	valid's binary_logloss: 0.418471	valid's Opt metric: 0.762887
[300]

[400]	valid's binary_logloss: 0.413795	valid's Opt metric: 0.78
Early stopping, best iteration is:
[295]	valid's binary_logloss: 0.412682	valid's Opt metric: 0.762887
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.465013	valid's Opt metric: 0.695652
[200]	valid's binary_logloss: 0.421378	valid's Opt metric: 0.709677
[300]	valid's binary_logloss: 0.413165	valid's Opt metric: 0.762887
[400]	valid's binary_logloss: 0.410788	valid's Opt metric: 0.78
[500]	valid's binary_logloss: 0.418191	valid's Opt metric: 0.78
Early stopping, best iteration is:
[350]	valid's binary_logloss: 0.411462	valid's Opt metric: 0.787879
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_

Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.464769	valid's Opt metric: 0.709677
[200]	valid's binary_logloss: 0.417998	valid's Opt metric: 0.762887
[300]	valid's binary_logloss: 0.407842	valid's Opt metric: 0.77551
[400]	valid's binary_logloss: 0.411921	valid's Opt metric: 0.77551
Early stopping, best iteration is:
[252]	valid's binary_logloss: 0.410379	valid's Opt metric: 0.77551
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.464731	valid's Opt metric: 0.695652
[200]	valid's binary_logloss: 0.422409	valid's Opt metric: 0.723404
[300]	valid's binary_logloss: 0.413033	valid's Opt metric: 0.762887
[40

[300]	valid's binary_logloss: 0.413313	valid's Opt metric: 0.762887
[400]	valid's binary_logloss: 0.41615	valid's Opt metric: 0.762887
[500]	valid's binary_logloss: 0.420325	valid's Opt metric: 0.78
Early stopping, best iteration is:
[309]	valid's binary_logloss: 0.41217	valid's Opt metric: 0.762887
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.466651	valid's Opt metric: 0.695652
[200]	valid's binary_logloss: 0.423998	valid's Opt metric: 0.723404
[300]	valid's binary_logloss: 0.415691	valid's Opt metric: 0.762887
[400]	valid's binary_logloss: 0.414395	valid's Opt metric: 0.78
[500]	valid's binary_logloss: 0.421084	valid's Opt metric: 0.787879
Early stopping, best iteration is:
[368]	valid's binary_logloss: 0.413199	valid's Opt metric: 0.77551
Lvl_0_Pipe_1_Mod_1_L


===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.465083	valid's Opt metric: 0.709677
[200]	valid's binary_logloss: 0.423267	valid's Opt metric: 0.723404
[300]	valid's binary_logloss: 0.414095	valid's Opt metric: 0.762887
[400]	valid's binary_logloss: 0.415229	valid's Opt metric: 0.78
[500]	valid's binary_logloss: 0.422441	valid's Opt metric: 0.78
Early stopping, best iteration is:
[348]	valid's binary_logloss: 0.414066	valid's Opt metric: 0.787879
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.466364	valid's Opt metric: 0.717391
[200]	valid's binary_logloss: 0.421386	valid's Opt metric: 0.75
[300]	valid's binary_logloss: 0.411641	valid's Opt metric:

In [45]:
valid_pred = df_valid.copy()

valid_pred = tautoml.predict(df_valid)

print(f"OOF acc: {acc_score(df_train['Survived'].values, oof_pred.data[:, 0])}")
print(f"VAL acc: {acc_score(df_valid['Survived'].values, valid_pred.data[:, 0])}")

OOF acc: 0.8398876404494382
VAL acc: 0.7988826815642458


## Retrain selected model on the full dataset and predict for the real test

In [46]:
tautoml = TabularUtilizedAutoML(task = task, 
                                timeout = 600, # 600 seconds = 10 minutes
                                memory_limit = 6, # Optimal run for your pc
                                cpu_limit = 4, # Optimal CPU for your pc
                                general_params = {'use_algos': [['linear_l2', 
                                                  'lgb', 'lgb_tuned']]})

oof_pred = tautoml.fit_predict(data, roles = roles)

test_pred = tautoml.predict(test)

Current random state: {'reader_params': {'random_state': 42}, 'general_params': {'return_all_predictions': False}}
Found general_params in kwargs, need to combine
Merged variant for general_params = {'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']], 'return_all_predictions': False}
Start automl preset with listed constraints:
- time: 599.9920039176941 seconds
- cpus: 4 cores
- memory: 6 gb

Train data shape: (891, 12)
Feats was rejected during automatic roles guess: []


Layer 1 ...
Train process start. Time left 599.114007472992 secs
Start fitting Lvl_0_Pipe_0_Mod_0_LinearL2 ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====

Linear model: C = 1e-05 score = 0.0
Linear model: C = 5e-05 score = 0.0
Linear model: C = 0.0001 score = 0.13513513513513511
Linear model: C = 0.0005 score = 0.7286821705426356
Linear model: C = 0.001 score = 0.7480916030534351
Linear model: C = 0.005 score = 0.7480916030534351
Linear model: C = 0.01 score = 0.732824427480916

===== Start 

[100]	valid's binary_logloss: 0.446731	valid's Opt metric: 0.733333
[200]	valid's binary_logloss: 0.376882	valid's Opt metric: 0.752
[300]	valid's binary_logloss: 0.350854	valid's Opt metric: 0.752
[400]	valid's binary_logloss: 0.342279	valid's Opt metric: 0.784
[500]	valid's binary_logloss: 0.337706	valid's Opt metric: 0.774194
[600]	valid's binary_logloss: 0.339547	valid's Opt metric: 0.774194
[700]	valid's binary_logloss: 0.339807	valid's Opt metric: 0.784
Early stopping, best iteration is:
[506]	valid's binary_logloss: 0.337773	valid's Opt metric: 0.793651
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.455397	valid's Opt metric: 0.705882
[200]	valid's binary_logloss: 0.390933	valid's Opt metric: 0.752
[300]	valid's binary_logloss: 0.365628	valid's Opt metric: 

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.46488	valid's Opt metric: 0.728814
[200]	valid's binary_logloss: 0.394266	valid's Opt metric: 0.758065
[300]	valid's binary_logloss: 0.363842	valid's Opt metric: 0.752
Early stopping, best iteration is:
[133]	valid's binary_logloss: 0.436013	valid's Opt metric: 0.770492
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.441841	valid's Opt metric: 0.727273
[200]	valid's binary_logloss: 0.383247	valid's Opt metric: 0.752
[300]	valid's binary_logloss: 0.361941	valid's Opt metric: 0.752
[400]	valid's binary_logloss: 0.350957	valid's Opt metric: 0.774194
[500]	valid's binary_logloss: 0.345778	valid's Opt metric: 0.774194
[600]	valid's binary_logloss: 0.34394	valid's Opt metric: 0.

Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.44539	valid's Opt metric: 0.739496
[200]	valid's binary_logloss: 0.373879	valid's Opt metric: 0.747967
[300]	valid's binary_logloss: 0.349047	valid's Opt metric: 0.754098
[400]	valid's binary_logloss: 0.339706	valid's Opt metric: 0.784
[500]	valid's binary_logloss: 0.337821	valid's Opt metric: 0.80315
[600]	valid's binary_logloss: 0.338508	valid's Opt metric: 0.821705
[700]	valid's binary_logloss: 0.339816	valid's Opt metric: 0.821705
Early stopping, best iteration is:
[536]	valid's binary_logloss: 0.33619	valid's Opt metric: 0.8125
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	va

Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.448131	valid's Opt metric: 0.733333
[200]	valid's binary_logloss: 0.379733	valid's Opt metric: 0.752
[300]	valid's binary_logloss: 0.355114	valid's Opt metric: 0.752
[400]	valid's binary_logloss: 0.346818	valid's Opt metric: 0.768
[500]	valid's binary_logloss: 0.341639	valid's Opt metric: 0.774194
[600]	valid's binary_logloss: 0.34497	valid's Opt metric: 0.774194
[700]	valid's binary_logloss: 0.344569	valid's Opt metric: 0.774194
Early stopping, best iteration is:
[506]	valid's binary_logloss: 0.341713	valid's Opt metric: 0.784
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's

Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.49535	valid's Opt metric: 0.706897
[200]	valid's binary_logloss: 0.437258	valid's Opt metric: 0.716667
[300]	valid's binary_logloss: 0.412687	valid's Opt metric: 0.731707
[400]	valid's binary_logloss: 0.399695	valid's Opt metric: 0.741935
[500]	valid's binary_logloss: 0.391267	valid's Opt metric: 0.752
[600]	valid's binary_logloss: 0.385674	valid's Opt metric: 0.752
[700]	valid's binary_logloss: 0.379767	valid's Opt metric: 0.761905
[800]	valid's binary_logloss: 0.374152	valid's Opt metric: 0.761905
Early stopping, best iteration is:
[630]	valid's binary_logloss: 0.384134	valid's Opt metric: 0.761905
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

T

[455]	valid's binary_logloss: 0.334293	valid's Opt metric: 0.793651
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.464513	valid's Opt metric: 0.711864
[200]	valid's binary_logloss: 0.403752	valid's Opt metric: 0.741935
[300]	valid's binary_logloss: 0.37725	valid's Opt metric: 0.752
[400]	valid's binary_logloss: 0.366509	valid's Opt metric: 0.758065
[500]	valid's binary_logloss: 0.359366	valid's Opt metric: 0.758065
Early stopping, best iteration is:
[301]	valid's binary_logloss: 0.377207	valid's Opt metric: 0.761905
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid

[200]	valid's binary_logloss: 0.374465	valid's Opt metric: 0.752
[300]	valid's binary_logloss: 0.356824	valid's Opt metric: 0.768
[400]	valid's binary_logloss: 0.348903	valid's Opt metric: 0.774194
[500]	valid's binary_logloss: 0.345403	valid's Opt metric: 0.764228
Early stopping, best iteration is:
[302]	valid's binary_logloss: 0.35616	valid's Opt metric: 0.784
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.452634	valid's Opt metric: 0.728814
[200]	valid's binary_logloss: 0.376124	valid's Opt metric: 0.758065
[300]	valid's binary_logloss: 0.348171	valid's Opt metric: 0.758065
Early stopping, best iteration is:
[131]	valid's binary_logloss: 0.421779	valid's Opt metric: 0.770492
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.445437	valid's Opt metric: 0.739496
[200]	valid's binary_logloss: 0.373546	valid's Opt metric: 0.758065
[300]	valid's binary_logloss: 0.348381	valid's Opt metric: 0.784
[400]	valid's binary_logloss: 0.338974	valid's Opt metric: 0.80315
[500]	valid's binary_logloss: 0.336072	valid's Opt metric: 0.8125
[600]	valid's binary_logloss: 0.336231	valid's Opt metric: 0.8125
[700]	valid's binary_logloss: 0.339239	valid's Opt metric: 0.8125
Early stopping, best iteration is:
[510]	valid's binary_logloss: 0.336122	valid's Opt metric: 0.821705
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.446642	valid's Opt metric: 0.739496
[200]	valid's binary_logloss: 0.375002	valid's Opt metric: 0


===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's binary_logloss: 0.446015	valid's Opt metric: 0.733333
[200]	valid's binary_logloss: 0.374363	valid's Opt metric: 0.758065
[300]	valid's binary_logloss: 0.348139	valid's Opt metric: 0.758065
[400]	valid's binary_logloss: 0.339344	valid's Opt metric: 0.784
[500]	valid's binary_logloss: 0.332126	valid's Opt metric: 0.793651
[600]	valid's binary_logloss: 0.330724	valid's Opt metric: 0.80315
[700]	valid's binary_logloss: 0.330611	valid's Opt metric: 0.80315
Early stopping, best iteration is:
[581]	valid's binary_logloss: 0.329295	valid's Opt metric: 0.80315
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's binary_logloss: 0.326029	valid's Opt m

In [47]:
test_pred[:5]

array([[0.15259145],
       [0.35175982],
       [0.10634716],
       [0.14114384],
       [0.7624237 ]], dtype=float32)

## Prepare submission for Kaggle

In [50]:
submission['Survived'] = (test_pred.data[:, 0] > 0.5).astype(int)

print(submission.head())

submission.to_csv('submission/automl_utilized_600_f1_score.csv', index = False)

   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         0
4          896         1


In [51]:
!head -n10 submission/automl_utilized_600_f1_score.csv

PassengerId,Survived
892,0
893,0
894,0
895,0
896,1
897,0
898,1
899,0
900,1
