# Загрузка библиотек

In [None]:
!pip install -U lightautoml==0.3.8b1



In [None]:
# Mount Google Drive at /content/drive so the dataset path used later resolves.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Импорты

In [None]:
import pandas as pd
import numpy as np
import shutil
import pickle
import string
import ast

import torch
import torch.nn.functional as F

from sklearn.metrics import f1_score, log_loss
from sklearn.model_selection import train_test_split

from lightautoml.automl.presets.text_presets import TabularNLPAutoML
from lightautoml.tasks import Task

In [None]:
# Select the compute device: CUDA when torch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Shared experiment settings.
# NOTE(review): N_THREADS is not referenced in the visible cells (the presets
# are created with cpu_limit=1) — confirm whether it is still needed.
N_THREADS = 4
# Seed passed to train_test_split so the hold-out split is reproducible.
RANDOM_STATE = 42
# Fraction of rows that train_test_split holds out as the test set.
TEST_SIZE = 0.2
# Time budget passed as `timeout` to TabularNLPAutoML; the run logs report it in seconds.
TIMEOUT = 3600

# Данные

In [None]:
# Location of the training CSV on the mounted Drive.
train_csv_path = '/content/drive/MyDrive/automl/train_new.csv'
train_data = pd.read_csv(train_csv_path)

In [None]:
# Preview the first rows of the freshly loaded frame.
train_data.head()

Unnamed: 0.1,Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate
1,1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate
2,2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate
3,3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate
4,4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate


In [None]:
# Remove the 'Unnamed: 0' column (presumably a stale index saved into the CSV).
# drop(..., errors='ignore') keeps the cell idempotent: re-running it after the
# column is already gone is a no-op instead of a KeyError, which `del` raised.
train_data = train_data.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Preview again to confirm the 'Unnamed: 0' column is gone.
train_data.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate


Маппинг классов `discourse_effectiveness` в числовые метки: Ineffective → 0, Adequate → 1, Effective → 2.

In [None]:
# Encode the effectiveness labels as integers 0..2; this column is the model target.
label_mapping = {'Effective': 2, 'Adequate': 1, 'Ineffective': 0}
train_data['discourse_effectiveness_map'] = train_data['discourse_effectiveness'].map(label_mapping)

# Series.map silently turns any label missing from label_mapping into NaN.
# Fail here, next to the cause, rather than later during model fitting.
unmapped_labels = train_data.loc[
    train_data['discourse_effectiveness_map'].isna(), 'discourse_effectiveness'
].unique()
assert len(unmapped_labels) == 0, f'Unmapped discourse_effectiveness labels: {unmapped_labels}'

In [None]:
# Preview the frame with the added numeric 'discourse_effectiveness_map' column.
train_data.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,discourse_effectiveness_map
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,1
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,1
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,1
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,1
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,1


In [None]:
# Hold out TEST_SIZE of the rows for evaluation; the fixed seed makes the split repeatable.
train, test = train_test_split(train_data, random_state=RANDOM_STATE, test_size=TEST_SIZE)

In [None]:
# Report the (rows, columns) shape of each split, one per line.
print(train.shape, test.shape, sep='\n')

(29352, 6)
(7339, 6)


# AutoML

## Линейная модель и LightGBM (pooled_bert)

In [None]:
# Column roles handed to LightAutoML: the numeric label is the target,
# 'discourse_text' is the text feature, and the ids, the discourse type and
# the original string label are dropped.
roles = dict(
    target='discourse_effectiveness_map',
    text=['discourse_text'],
    drop=['discourse_id', 'essay_id', 'discourse_type', 'discourse_effectiveness'],
)

# Multiclass task with 'auc' as the metric name passed to LightAutoML.
task = Task('multiclass', metric='auc')

In [None]:
# Preset restricted to the linear model and LightGBM, with 'pooled_bert' as the
# text model. The time budget comes from the shared TIMEOUT constant
# (same value as the former literal) so all three experiments stay in sync.
automl = TabularNLPAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=1,
    gpu_ids='0',
    general_params={'use_algos': ['linear_l2', 'lgb']},
    text_params={'lang': 'en'},
    autonlp_params={'model_name': 'pooled_bert'},
)

In [None]:
%%time

# Fit the preset; fit_predict returns out-of-fold predictions for the training rows.
oof_pred = automl.fit_predict(train, roles=roles)

# Keep only rows that have at least one non-NaN prediction.
not_nan = (~np.isnan(oof_pred.data)).any(axis=1)

# Score the retained rows with log loss against the numeric target.
oof_target = train[roles['target']].values[not_nan]
oof_score = log_loss(oof_target, oof_pred.data[not_nan])
print(f'OOF score: {oof_score}')

INFO:lightautoml.automl.presets.base:Stdout logging level is ERROR.
INFO3:lightautoml.automl.presets.text_presets:Model language mode: en
INFO:lightautoml.automl.presets.base:Task: multiclass

INFO:lightautoml.automl.presets.base:Start automl preset with listed constraints:
INFO:lightautoml.automl.presets.base:- time: 3600.00 seconds
INFO:lightautoml.automl.presets.base:- CPU: 1 cores
INFO:lightautoml.automl.presets.base:- memory: 16 GB

INFO:lightautoml.reader.base:[1mTrain data shape: (29352, 6)[0m

INFO:lightautoml.automl.base:Layer [1m1[0m train process start. Time left 3599.96 secs
INFO:lightautoml.ml_algo.base:Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
DEBUG:lightautoml.ml_algo.base:Training params: {'tol': 1e-06, 'max_iter': 100, 'cs': [1e-05, 5e-05, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000], 'early_stopping': 2, 'categorical_idx': [], 'embed_sizes': (), 'data_size': 100}
INFO2:lightautoml.ml_algo

OOF score: 0.8805149426036419
CPU times: user 3min 53s, sys: 9.82 s, total: 4min 3s
Wall time: 4min 5s


In [None]:
%%time

# Predict on the hold-out split and report its log loss.
test_pred = automl.predict(test)
test_target = test[roles['target']].values
test_score = log_loss(test_target, test_pred.data)
print(f'TEST score: {test_score}')

INFO3:lightautoml.transformers.text:Feature concated__discourse_text transformed


TEST score: 0.8487938835131061
CPU times: user 16.3 s, sys: 1.07 s, total: 17.3 s
Wall time: 17.5 s


## Линейная модель и CatBoost (Random LSTM)

In [None]:
# Column roles for this experiment: numeric label as target, 'discourse_text'
# as the text feature; ids, discourse type and the string label are dropped.
roles = dict(
    target='discourse_effectiveness_map',
    text=['discourse_text'],
    drop=['discourse_id', 'essay_id', 'discourse_type', 'discourse_effectiveness'],
)

# Multiclass task with 'auc' as the metric name passed to LightAutoML.
task = Task('multiclass', metric='auc')

In [None]:
# Preset restricted to the linear model and CatBoost (plain and tuned), with
# 'random_lstm' as the text model. The time budget and the reader seed use the
# shared TIMEOUT / RANDOM_STATE constants (same values as the former literals)
# so the settings cannot drift between experiments.
automl = TabularNLPAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=1,
    gpu_ids='0',
    reader_params={
        'cv': 5,
        'random_state': RANDOM_STATE,
    },
    general_params={'use_algos': ['linear_l2', 'cb', 'cb_tuned']},
    text_params={'lang': 'en'},
    autonlp_params={'model_name': 'random_lstm'},
)

In [None]:
%%time

# Fit the preset; fit_predict returns out-of-fold predictions for the training rows.
oof_pred = automl.fit_predict(train, roles=roles)

# Keep only rows that have at least one non-NaN prediction.
not_nan = (~np.isnan(oof_pred.data)).any(axis=1)

# Score the retained rows with log loss against the numeric target.
oof_target = train[roles['target']].values[not_nan]
oof_score = log_loss(oof_target, oof_pred.data[not_nan])
print(f'OOF score: {oof_score}')

INFO:lightautoml.automl.presets.base:Stdout logging level is ERROR.
INFO3:lightautoml.automl.presets.text_presets:Model language mode: en
INFO:lightautoml.automl.presets.base:Task: multiclass

INFO:lightautoml.automl.presets.base:Start automl preset with listed constraints:
INFO:lightautoml.automl.presets.base:- time: 3600.00 seconds
INFO:lightautoml.automl.presets.base:- CPU: 1 cores
INFO:lightautoml.automl.presets.base:- memory: 16 GB

INFO:lightautoml.reader.base:[1mTrain data shape: (29352, 6)[0m

INFO:lightautoml.automl.base:Layer [1m1[0m train process start. Time left 3599.94 secs
INFO:lightautoml.ml_algo.base:Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
DEBUG:lightautoml.ml_algo.base:Training params: {'tol': 1e-06, 'max_iter': 100, 'cs': [1e-05, 5e-05, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000], 'early_stopping': 2, 'categorical_idx': [], 'embed_sizes': (), 'data_size': 100}
INFO2:lightautoml.ml_algo

OOF score: 0.8647609324730038
CPU times: user 14min 24s, sys: 19 s, total: 14min 43s
Wall time: 14min 32s


In [None]:
%%time

# Predict on the hold-out split and report its log loss.
test_pred = automl.predict(test)
test_target = test[roles['target']].values
test_score = log_loss(test_target, test_pred.data)
print(f'TEST score: {test_score}')

INFO3:lightautoml.transformers.text:Feature concated__discourse_text transformed
INFO3:lightautoml.transformers.text:Feature concated__discourse_text transformed


TEST score: 0.8478953478394771
CPU times: user 30.4 s, sys: 2.07 s, total: 32.5 s
Wall time: 32.7 s


## BERT (bert-base-uncased)

In [None]:
# Column roles for the BERT experiment.
# 'discourse_effectiveness' (the string label) must be dropped: it is a 1:1
# encoding of the target, and leaving it in exposes it to the model as a
# feature (the captured log for this section reported
# "number of categorical features: 1"), i.e. target leakage. The drop list
# now matches the one used by the other two experiments.
roles = {
    'target': 'discourse_effectiveness_map',
    'text': ['discourse_text'],
    'drop': ['discourse_id', 'essay_id', 'discourse_type', 'discourse_effectiveness']
}

# Multiclass task with 'auc' as the metric name passed to LightAutoML.
task = Task('multiclass', metric='auc')

In [None]:
# Preset for the BERT experiment: the only algorithm requested is 'nn'.
# NOTE(review): the captured training log reports device(type='cpu') even though
# gpu_ids='0' is passed here — verify that the GPU is actually being used.
automl = TabularNLPAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=1,
    gpu_ids='0',
    general_params={
        'nested_cv': False,
        # Nested list — presumably one inner list per stacking level; confirm against the LightAutoML docs.
        'use_algos': [['nn']],
        # NOTE(review): fold count is also set via reader_params['cv'] below — confirm this key is honoured.
        'n_folds': 3
    },
    reader_params={
        # 3 folds; the training log shows folds 0-2 being fitted.
        'cv': 3
    },
    autonlp_params={
        'sent_scaler': 'l2'
    },
    text_params={
        'lang': 'en',
        # Hugging Face checkpoint name; it appears as 'bert_name' in the logged training params.
        'bert_model': 'bert-base-uncased'
    },
    nn_params={
        # These values are echoed in the logged training params (lr, max_length, bs, n_epochs).
        'opt_params': {'lr': 1e-5},
        'max_length': 128,
        'bs': 32,
        'n_epochs': 7,
    },
)

In [None]:
%%time

# Fit the BERT preset with verbose logging; fit_predict returns out-of-fold predictions.
oof_pred = automl.fit_predict(train, roles=roles, verbose=2)

# Keep only rows that have at least one non-NaN prediction.
not_nan = (~np.isnan(oof_pred.data)).any(axis=1)

# Score the retained rows with log loss against the numeric target.
oof_target = train[roles['target']].values[not_nan]
oof_score = log_loss(oof_target, oof_pred.data[not_nan])
print(f'OOF score: {oof_score}')

[20:09:45] Stdout logging level is INFO2.


INFO:lightautoml.automl.presets.base:Stdout logging level is INFO2.
INFO3:lightautoml.automl.presets.text_presets:Model language mode: en


[20:09:45] Task: multiclass



INFO:lightautoml.automl.presets.base:Task: multiclass



[20:09:45] Start automl preset with listed constraints:


INFO:lightautoml.automl.presets.base:Start automl preset with listed constraints:


[20:09:45] - time: 3600.00 seconds


INFO:lightautoml.automl.presets.base:- time: 3600.00 seconds


[20:09:45] - CPU: 1 cores


INFO:lightautoml.automl.presets.base:- CPU: 1 cores


[20:09:45] - memory: 16 GB



INFO:lightautoml.automl.presets.base:- memory: 16 GB



[20:09:45] [1mTrain data shape: (29352, 6)[0m



INFO:lightautoml.reader.base:[1mTrain data shape: (29352, 6)[0m

INFO3:lightautoml.reader.base:Feats was rejected during automatic roles guess: []


[20:09:46] Layer [1m1[0m train process start. Time left 3599.71 secs


INFO:lightautoml.automl.base:Layer [1m1[0m train process start. Time left 3599.71 secs
DEBUG:lightautoml.ml_algo.dl_model:number of text features: 1 
DEBUG:lightautoml.ml_algo.dl_model:number of categorical features: 1 
DEBUG:lightautoml.ml_algo.dl_model:number of continuous features: 0 


[20:09:46] Start fitting [1mLvl_0_Pipe_0_Mod_0_TorchNN__linear_layer_0[0m ...


INFO:lightautoml.ml_algo.base:Start fitting [1mLvl_0_Pipe_0_Mod_0_TorchNN__linear_layer_0[0m ...
DEBUG:lightautoml.ml_algo.base:Training params: {'num_workers': 1, 'pin_memory': False, 'max_length': 128, 'is_snap': False, 'input_bn': False, 'max_emb_size': 50, 'bert_name': 'bert-base-uncased', 'pooling': 'cls', 'device': device(type='cpu'), 'use_cont': True, 'use_cat': True, 'use_text': True, 'lang': 'en', 'deterministic': False, 'multigpu': False, 'random_state': 42, 'model': '_linear_layer', 'model_with_emb': False, 'path_to_save': None, 'verbose_inside': None, 'verbose': 1, 'n_epochs': 7, 'snap_params': {'k': 1, 'early_stopping': True, 'patience': 1, 'swa': False}, 'bs': 32, 'emb_dropout': 0.1, 'emb_ratio': 3, 'opt': <class 'torch.optim.adam.Adam'>, 'opt_params': {'lr': 1e-05}, 'sch': <class 'torch.optim.lr_scheduler.ReduceLROnPlateau'>, 'scheduler_params': {'patience': 5, 'factor': 0.5, 'verbose': True}, 'loss': None, 'loss_params': {}, 'loss_on_logits': True, 'clip_grad': False,

[20:09:46] ===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_0_Mod_0_TorchNN__linear_layer_0[0m =====


INFO2:lightautoml.ml_algo.base:===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_0_Mod_0_TorchNN__linear_layer_0[0m =====
INFO3:lightautoml.text.trainer:Epoch: 0, train loss: 0.973159909248352, val loss: 0.9710264801979065, val metric: 0.8965873409353825
INFO3:lightautoml.text.trainer:Epoch: 1, train loss: 0.9688310027122498, val loss: 0.9661142230033875, val metric: 0.8965873409353825
INFO3:lightautoml.text.trainer:Epoch: 2, train loss: 0.9638194441795349, val loss: 0.9608559012413025, val metric: 0.8965873409353825
INFO3:lightautoml.text.trainer:Epoch: 3, train loss: 0.9589090943336487, val loss: 0.9557074308395386, val metric: 0.8965873409353825
INFO3:lightautoml.text.trainer:Epoch: 4, train loss: 0.9534866213798523, val loss: 0.9505415558815002, val metric: 0.8965873409353825
INFO3:lightautoml.text.trainer:Epoch: 5, train loss: 0.9476910829544067, val loss: 0.9440057277679443, val metric: 0.8965873409353825
INFO3:lightautoml.text.trainer:Epoch: 6, train loss: 0.9418937563

[20:10:23] ===== Start working with [1mfold 1[0m for [1mLvl_0_Pipe_0_Mod_0_TorchNN__linear_layer_0[0m =====


INFO2:lightautoml.ml_algo.base:===== Start working with [1mfold 1[0m for [1mLvl_0_Pipe_0_Mod_0_TorchNN__linear_layer_0[0m =====
INFO3:lightautoml.text.trainer:Epoch: 0, train loss: 0.9731162190437317, val loss: 0.971038818359375, val metric: 0.8966158423205653
INFO3:lightautoml.text.trainer:Epoch: 1, train loss: 0.9688265919685364, val loss: 0.9661915302276611, val metric: 0.8966158423205653
INFO3:lightautoml.text.trainer:Epoch: 2, train loss: 0.9637871980667114, val loss: 0.9609505534172058, val metric: 0.8966158423205653
INFO3:lightautoml.text.trainer:Epoch: 3, train loss: 0.9588790535926819, val loss: 0.9551640152931213, val metric: 0.8966158423205653
INFO3:lightautoml.text.trainer:Epoch: 4, train loss: 0.9533060193061829, val loss: 0.950210690498352, val metric: 0.8966158423205653
INFO3:lightautoml.text.trainer:Epoch: 5, train loss: 0.9477611780166626, val loss: 0.9446508884429932, val metric: 0.8966158423205653
INFO3:lightautoml.text.trainer:Epoch: 6, train loss: 0.94189924001

[20:10:58] ===== Start working with [1mfold 2[0m for [1mLvl_0_Pipe_0_Mod_0_TorchNN__linear_layer_0[0m =====


INFO2:lightautoml.ml_algo.base:===== Start working with [1mfold 2[0m for [1mLvl_0_Pipe_0_Mod_0_TorchNN__linear_layer_0[0m =====
INFO3:lightautoml.text.trainer:Epoch: 0, train loss: 0.9731815457344055, val loss: 0.9710777997970581, val metric: 0.8966158423205653
INFO3:lightautoml.text.trainer:Epoch: 1, train loss: 0.9686049818992615, val loss: 0.9660558700561523, val metric: 0.8966158423205653
INFO3:lightautoml.text.trainer:Epoch: 2, train loss: 0.9638374447822571, val loss: 0.9609413743019104, val metric: 0.8966158423205653
INFO3:lightautoml.text.trainer:Epoch: 3, train loss: 0.9587775468826294, val loss: 0.9557225108146667, val metric: 0.8966158423205653
INFO3:lightautoml.text.trainer:Epoch: 4, train loss: 0.9531343579292297, val loss: 0.9505216479301453, val metric: 0.8966158423205653
INFO3:lightautoml.text.trainer:Epoch: 5, train loss: 0.947954535484314, val loss: 0.9446249604225159, val metric: 0.8966158423205653
INFO3:lightautoml.text.trainer:Epoch: 6, train loss: 0.9421055316

[20:11:34] Fitting [1mLvl_0_Pipe_0_Mod_0_TorchNN__linear_layer_0[0m finished. score = [1m0.8966063410737405[0m


INFO:lightautoml.ml_algo.base:Fitting [1mLvl_0_Pipe_0_Mod_0_TorchNN__linear_layer_0[0m finished. score = [1m0.8966063410737405[0m


[20:11:34] [1mLvl_0_Pipe_0_Mod_0_TorchNN__linear_layer_0[0m fitting and predicting completed


INFO:lightautoml.ml_algo.base:[1mLvl_0_Pipe_0_Mod_0_TorchNN__linear_layer_0[0m fitting and predicting completed


[20:11:34] Time left 3491.48 secs



INFO:lightautoml.automl.base:Time left 3491.48 secs



[20:11:34] [1mLayer 1 training completed.[0m



INFO:lightautoml.automl.base:[1mLayer 1 training completed.[0m



[20:11:34] [1mAutoml preset training completed in 108.53 seconds[0m



INFO:lightautoml.automl.presets.base:[1mAutoml preset training completed in 108.53 seconds[0m



[20:11:34] Model description:
Final prediction for new objects (level 0) = 
	 1.00000 * (3 averaged models Lvl_0_Pipe_0_Mod_0_TorchNN__linear_layer_0) 



INFO:lightautoml.automl.presets.base:Model description:
Final prediction for new objects (level 0) = 
	 1.00000 * (3 averaged models Lvl_0_Pipe_0_Mod_0_TorchNN__linear_layer_0) 



OOF score: 0.9388695507554953
CPU times: user 1min 7s, sys: 11.9 s, total: 1min 19s
Wall time: 1min 48s


In [None]:
# Predict on the hold-out split and report its log loss.
test_pred = automl.predict(test)
test_target = test[roles['target']].values
test_score = log_loss(test_target, test_pred.data)
print(f'TEST score: {test_score}')

TEST score: 0.934655221674981
