# Train

In [1]:
import wandb
from paperswithtopic.config import load_config
from paperswithtopic.run import run

In [2]:
cfg = load_config()
cfg.use_saved = False
cfg.pre_embed = 'word2vec'
cfg.embed_dim = 512
cfg.hidden_dim = 512
cfg.use_bert_embed = False

cfg.model_name = 'albert'

In [3]:
wandb.login()
wandb.init(project='paperswithtopic', name='embeddim_512_albert_Test', tags=['word2vec-512'])

wandb: Currently logged in as: 1pha (use `wandb login --relogin` to force relogin)
wandb: wandb version 0.10.33 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


In [None]:
run(cfg)

# Hyperparameter grid search with wandb.sweep

In [6]:
sweep_config = {
    'name': 'word2vec-rnn',
    'method': 'grid',
    'metric': {
        'name': 'valid_auc',
        'goal': 'maximize',
    },
    'parameters': {
        'model_name': {
            'values': [
                'rnn',
                'lstm
                'gru',

                'bert',
                'albert',
                'electra,

                'bertclassification',
                'albertclassification',
                'electraclassification',
            ]
        },
        'embed_dim': {
            'values': [256, 512]
        }
    }
}

sweep_id = wandb.sweep(sweep_config, project='paperswithtopic')

Create sweep with ID: gcoriitu
Sweep URL: https://wandb.ai/1pha/paperswithtopic/sweeps/gcoriitu


In [7]:
from IPython.display import clear_output
def run_sweep():
    
    with wandb.init(tags=['word2vec-rnn']):
        
        cfg = load_config()
        
        _cfg = wandb.config
        __cfg = dict(); __cfg.update(_cfg); cfg.update(__cfg)
        
        cfg.use_saved = False
        cfg.pre_embed = 'word2vec'
        cfg.hidden_dim = cfg.embed_dim
        cfg.use_bert_embed = False
        
        name = f'SWEEP_MD{cfg.model_name}_NL{cfg.n_layers}_EM{cfg.embed_dim}'
        
        wandb.run.name = name
        wandb.config.update(cfg)
        
        run(cfg)
        
        clear_output()

In [8]:
wandb.agent(sweep_id, function=run_sweep)

wandb: Sweep Agent: Waiting for job.
wandb: Sweep Agent: Exiting.


In [None]:
sweep_config = {
    'name': 'fasttext- ',
    'method': 'grid',
    'metric': {
        'name': 'valid_auc',
        'goal': 'maximize',
    },
    'parameters': {
        'embed_dim': {
            'values': [
                256, 512
            ]
        },
        
    }
}

sweep_id = wandb.sweep(sweep_config, project='paperswithtopic')

In [None]:
from IPython.display import clear_output
def run_sweep():
    
    with wandb.init(tags=['fasttext-512']):
        
        cfg = load_config()
        
        _cfg = wandb.config
        __cfg = dict(); __cfg.update(_cfg); cfg.update(__cfg)
        
        cfg.use_saved = False
        cfg.pre_embed = 'fasttext'
        cfg.embed_dim = 512
        cfg.hidden_dim = 512
        cfg.use_bert_embed = False
        
        name = f'SWEEP_PRE{cfg.model_name}'
        
        wandb.run.name = name
        wandb.config.update(cfg)
        
        run(cfg)
        
        clear_output()

In [None]:
wandb.agent(sweep_id, function=run_sweep)

In [2]:
sweep_config = {
    'name': 'word2vec-256',
    'method': 'grid',
    'metric': {
        'name': 'valid_auc',
        'goal': 'maximize',
    },
    'parameters': {
        'model_name': {
            'values': [
#                 'rnn',
#                 'lstm',
#                 'gru',

#                 'bert',
                'albert',
#                 'electra',

#                 'bertclassification',
                'albertclassification',
                'electraclassification',
            ]
        },
        
    }
}

sweep_id = wandb.sweep(sweep_config, project='paperswithtopic')

Create sweep with ID: rrdpg0x3
Sweep URL: https://wandb.ai/1pha/paperswithtopic/sweeps/rrdpg0x3


In [3]:
def run_sweep():
    
    with wandb.init(tags=['word2vec-256']):
        
        cfg = load_config()
        
        _cfg = wandb.config
        __cfg = dict(); __cfg.update(_cfg); cfg.update(__cfg)
        
        cfg.use_saved = False
        cfg.pre_embed = 'word2vec'
        cfg.embed_dim = 256
        cfg.hidden_dim = 256
        cfg.use_bert_embed = False
        
        name = f'SWEEP_PRE{cfg.model_name}'
        
        wandb.run.name = name
        wandb.config.update(cfg)
        
        run(cfg)
        
        clear_output()

In [4]:
wandb.agent(sweep_id, function=run_sweep)

wandb: Agent Starting Run: tiz0emz7 with config:
wandb: 	model_name: albert
wandb: Currently logged in as: 1pha (use `wandb login --relogin` to force relogin)
wandb: wandb version 0.10.33 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade




Preprocess from zero-base.
[load_data] 0.2 sec 
There are 49980 papers.
[retrieve_raw_papers] 0.0 sec 
[remove_unknown] 0.3 sec 
[build_idx2word] 20.2 sec 
Use word2vec as embedding
[train_embed] 2.1 sec 
[embed_gensim] 11.7 sec 
[pp_pipeline] 34.6 sec 
NUM TRAIN 44982 | NUM VALID 4998
Use cuda:0 as a device.
Load Albert as model.
Epoch 1 / 100, BEST AUC 0.000
[train] 143.0 sec [valid] 5.3 sec 
TRAIN:: AUC 0.562 | LOSS 0.210
VALID:: AUC 0.616 | LOSS 0.195
saving model ...
Epoch 2 / 100, BEST AUC 0.616
[train] 143.0 sec [valid] 5.4 sec 
TRAIN:: AUC 0.631 | LOSS 0.192
VALID:: AUC 0.628 | LOSS 0.188
saving model ...
Epoch 3 / 100, BEST AUC 0.628
[train] 145.7 sec [valid] 5.4 sec 
TRAIN:: AUC 0.631 | LOSS 0.191
VALID:: AUC 0.624 | LOSS 0.190
saving model ...
Epoch 4 / 100, BEST AUC 0.628
[train] 143.3 sec [valid] 5.4 sec 
TRAIN:: AUC 0.639 | LOSS 0.191
VALID:: AUC 0.645 | LOSS 0.187
saving model ...
Epoch 5 / 100, BEST AUC 0.645
[train] 155.6 sec [valid] 5.9 sec 
TRAIN:: AUC 0.638 | LOSS 0



EarlyStopping counter: 10 out of 10


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_auc,0.68781
valid_auc,0.69024
train_loss,0.17444
valid_loss,0.17252
_runtime,4062.0
_timestamp,1625214174.0
_step,25.0


0,1
train_auc,▁▄▄▄▄▅▅▅▅▅▅▅▇█▇██▇▇▇▇▇▆▆▆▇
valid_auc,▁▂▁▃▂▄▄▅▄▃▃▅▆█▇██▆▇█▇▇▆▅▇▆
train_loss,█▅▅▅▅▅▅▄▅▄▅▄▂▁▁▁▁▂▂▂▂▂▃▃▃▂
valid_loss,█▇▇▆▆▆▆▆▆▆▆▂▂▁▁▁▂▂▂▂▂▃▃▄▃▃
_runtime,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▆▆▆▇▇▇▇██
_timestamp,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▆▆▆▇▇▇▇██
_step,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▆▆▆▇▇▇▇██


Run tiz0emz7 errored: NameError("name 'clear_output' is not defined")
wandb: ERROR Run tiz0emz7 errored: NameError("name 'clear_output' is not defined")
wandb: Agent Starting Run: xp3aiok7 with config:
wandb: 	model_name: albertclassification
wandb: wandb version 0.10.33 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade




Preprocess from zero-base.
[load_data] 0.2 sec 
There are 49980 papers.
[retrieve_raw_papers] 0.0 sec 
[remove_unknown] 0.3 sec 
[build_idx2word] 23.1 sec 
Use word2vec as embedding
[train_embed] 2.3 sec 
[embed_gensim] 12.7 sec 
[pp_pipeline] 38.7 sec 
NUM TRAIN 44982 | NUM VALID 4998
Use cuda:0 as a device.
Load Albertclassification as model.
Epoch 1 / 100, BEST AUC 0.000
[train] 154.4 sec [valid] 6.3 sec 
TRAIN:: AUC 0.763 | LOSS 0.169
VALID:: AUC 0.835 | LOSS 0.149
saving model ...
Epoch 2 / 100, BEST AUC 0.835
[train] 152.2 sec [valid] 6.1 sec 
TRAIN:: AUC 0.836 | LOSS 0.148
VALID:: AUC 0.857 | LOSS 0.141
saving model ...
Epoch 3 / 100, BEST AUC 0.857
[train] 150.7 sec [valid] 6.1 sec 
TRAIN:: AUC 0.854 | LOSS 0.142
VALID:: AUC 0.868 | LOSS 0.136
saving model ...
Epoch 4 / 100, BEST AUC 0.868
[train] 150.4 sec [valid] 6.1 sec 
TRAIN:: AUC 0.855 | LOSS 0.141
VALID:: AUC 0.863 | LOSS 0.136
saving model ...
Epoch 5 / 100, BEST AUC 0.868
[train] 149.9 sec [valid] 5.9 sec 
TRAIN:: AUC 

[train] 151.9 sec [valid] 5.6 sec 
TRAIN:: AUC 0.925 | LOSS 0.111
VALID:: AUC 0.899 | LOSS 0.122
saving model ...




EarlyStopping counter: 10 out of 10


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_auc,0.92508
valid_auc,0.8987
train_loss,0.11085
valid_loss,0.1219
_runtime,8654.0
_timestamp,1625222834.0
_step,54.0


0,1
train_auc,▁▄▅▅▅▆▆▆▆▆▆▆▆▆▆▆▆▆▇▇▆▇▇▇▇▇▇▇▇▇▇▇█▇██████
valid_auc,▁▃▄▅▅▅▆▆▆▆▆▇▆▆▆▇▇▇▇▆▇▇██▇█▇▇█████▇▇█▇▇▇█
train_loss,█▅▅▄▄▄▄▄▃▃▃▃▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▂▂▂▁▁▁▁
valid_loss,█▆▅▅▄▄▃▃▂▃▃▂▃▃▃▂▂▂▂▃▂▂▂▁▁▁▂▂▁▁▁▁▁▁▂▁▁▂▁▂
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███


Run xp3aiok7 errored: NameError("name 'clear_output' is not defined")
wandb: ERROR Run xp3aiok7 errored: NameError("name 'clear_output' is not defined")
wandb: Agent Starting Run: tcff6f5i with config:
wandb: 	model_name: electraclassification
wandb: wandb version 0.10.33 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade




Preprocess from zero-base.
[load_data] 0.2 sec 
There are 49980 papers.
[retrieve_raw_papers] 0.0 sec 
[remove_unknown] 0.3 sec 
[build_idx2word] 21.2 sec 
Use word2vec as embedding
[train_embed] 2.4 sec 
[embed_gensim] 14.0 sec 
[pp_pipeline] 38.2 sec 
NUM TRAIN 44982 | NUM VALID 4998
Use cuda:0 as a device.
Load Electraclassification as model.
Epoch 1 / 100, BEST AUC 0.000
[train] 46.8 sec [valid] 3.8 sec 
TRAIN:: AUC 0.789 | LOSS 0.162
VALID:: AUC 0.864 | LOSS 0.141
saving model ...
Epoch 2 / 100, BEST AUC 0.864
[train] 48.1 sec [valid] 3.8 sec 
TRAIN:: AUC 0.854 | LOSS 0.142
VALID:: AUC 0.877 | LOSS 0.132
saving model ...
Epoch 3 / 100, BEST AUC 0.877
[train] 46.0 sec [valid] 3.9 sec 
TRAIN:: AUC 0.868 | LOSS 0.137
VALID:: AUC 0.886 | LOSS 0.128
saving model ...
Epoch 4 / 100, BEST AUC 0.886
[train] 46.0 sec [valid] 3.9 sec 
TRAIN:: AUC 0.874 | LOSS 0.134
VALID:: AUC 0.891 | LOSS 0.126
saving model ...
Epoch 5 / 100, BEST AUC 0.891
[train] 42.0 sec [valid] 3.3 sec 
TRAIN:: AUC 0.88



[valid] 2.6 sec 
TRAIN:: AUC 0.934 | LOSS 0.106
VALID:: AUC 0.910 | LOSS 0.116
saving model ...
EarlyStopping counter: 10 out of 10


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train_auc,0.93398
valid_auc,0.91016
train_loss,0.10619
valid_loss,0.11586
_runtime,1444.0
_timestamp,1625224283.0
_step,34.0


0,1
train_auc,▁▄▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████
valid_auc,▁▃▄▅▅▆▆▆▇▇▇▇▇▇▇▇▇██▇███▇██▇▇██▇█▇▇█
train_loss,█▆▅▅▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁
valid_loss,█▆▅▄▃▄▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▂▂
_runtime,▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_timestamp,▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███


Run tcff6f5i errored: NameError("name 'clear_output' is not defined")
wandb: ERROR Run tcff6f5i errored: NameError("name 'clear_output' is not defined")
wandb: Sweep Agent: Waiting for job.
wandb: Sweep Agent: Exiting.


# Manual Grid search without sweep

In [2]:
models = [
    'rnn',
    'electraclassification',
]

In [3]:
import os
from IPython.display import clear_output
# os.environ["WANDB_API_KEY"] = 'ba0fb28501407ce88345f540a04e9f47cfc57bda'
# os.environ["WANDB_MODE"] = "dryrun"

In [4]:
for model_name in models:
    
    cfg = load_config()

    cfg.use_saved = False
    cfg.pre_embed = 'word2vec'
    cfg.use_bert_embed = False

    cfg.model_name = model_name

    name = f'SWEEP_MD{cfg.model_name}'
    
    wandb.init(project='paperswithtopic', tags=['word2vec'])
    wandb.run.name = name    

    run(cfg)
    clear_output()