In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
import os
import random
import re
import time
from collections import Counter
from itertools import chain
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.utils import shuffle
from torch import optim
from torch.utils.data import Dataset, Sampler, DataLoader
from tqdm import tqdm

In [3]:
# constants
embedding_glove = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
embedding_fasttext = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
embedding_para = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
embedding_w2v = '../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
train_path = '../input/train.csv'
test_path = '../input/test.csv'

mispell_dict = {"ain't": "is not", "aren't": "are not", "can't": "cannot", "'cause": "because",
                "could've": "could have", "couldn't": "could not", "didn't": "did not", "doesn't": "does not",
                "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                "he'd": "he would", "he'll": "he will", "he's": "he is", "how'd": "how did",
                "how'd'y": "how do you", "how'll": "how will", "how's": "how is", "i'd": "i would",
                "i'd've": "i would have", "i'll": "i will", "i'll've": "I will have", "i'm": "i am",
                "i've": "I have", "isn't": "is not", "it'd": "it would",
                "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have", "it's": "it is",
                "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have",
                "mightn't": "might not", "mightn't've": "might not have", "must've": "must have",
                "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not",
                "needn't've": "need not have", "o'clock": "of the clock", "oughtn't": "ought not",
                "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not",
                "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have",
                "she'll": "she will", "she'll've": "she will have", "she's": "she is",
                "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have",
                "so've": "so have", "so's": "so as", "this's": "this is", "that'd": "that would",
                "that'd've": "that would have", "that's": "that is", "there'd": "there would",
                "there'd've": "there would have", "there's": "there is", "here's": "here is",
                "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
                "they'll've": "they will have", "they're": "they are", "they've": "they have",
                "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have",
                "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have",
                "weren't": "were not", "what'll": "what will", "what'll've": "what will have",
                "what're": "what are", "what's": "what is", "what've": "what have", "when's": "when is",
                "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have",
                "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
                "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not",
                "won't've": "will not have", "would've": "would have", "wouldn't": "would not",
                "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
                "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have",
                "you'd": "you would", "you'd've": "you would have", "you'll": "you will",
                "you'll've": "you will have", "you're": "you are", "you've": "you have", 'colour': 'color',
                'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling',
                'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor',
                'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize',
                'youtu ': 'youtube ', 'qoura': 'quora', 'sallary': 'salary', 'whta': 'what',
                'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can',
                'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doi': 'do I',
                'thebest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation',
                'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis',
                'etherium': 'ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017',
                '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess',
                "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization',
                'demonitization': 'demonetization', 'demonetisation': 'demonetization'}

puncts = '\'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", "²": "2",
                 "—": "-", "–": "-", "’": "'", "_": "-", "`": "'", '”': '"', '“': '"', "£": "e",
                 '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-', 'β': 'beta',
                 '∅': '', '³': '3', 'π': 'pi', '\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''}
for p in puncts:
    punct_mapping[p] = ' %s ' % p

p = re.compile('(\[ math \]).+(\[ / math \])')
p_space = re.compile(r'[^\x20-\x7e]')

In [4]:
#  seeding functions
def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed + 1)
    if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed + 2)
    random.seed(seed + 4)

In [5]:
#data loading & pre-processing

def clean_text(text):
    # clean latex maths
    text = p.sub(' [ math ] ', text)
    # clean invisible chars
    text = p_space.sub(r'', text)
    # clean punctuations
    for punct in punct_mapping:
        if punct in text:
            text = text.replace(punct, punct_mapping[punct])
    tokens = []
    for token in text.split():
        # replace contractions & correct misspells
        token = mispell_dict.get(token.lower(), token)
        tokens.append(token)
    text = ' '.join(tokens)
    return text

def load_data(train_path=train_path, test_path=test_path, debug=False):
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    if debug:
        train_df = train_df[:10000]
        test_df = test_df[:10000]
    s = time.time()
    train_df['question_text'] = train_df['question_text'].apply(clean_text)
    test_df['question_text'] = test_df['question_text'].apply(clean_text)
    print('preprocssing {}s'.format(time.time() - s))
    return train_df, test_df

In [6]:
# vocabulary functions
def build_counter(sents, splited=False):
    counter = Counter()
    for sent in tqdm(sents, ascii=True, desc='building conuter'):
        if splited:
            counter.update(sent)
        else:
            counter.update(sent.split())
    return counter


def build_vocab(counter, max_vocab_size):
    vocab = {'token2id': {'<PAD>': 0, '<UNK>': max_vocab_size + 1}}
    vocab['token2id'].update(
        {token: _id + 1 for _id, (token, count) in
         tqdm(enumerate(counter.most_common(max_vocab_size)), desc='building vocab')})
    vocab['id2token'] = {v: k for k, v in vocab['token2id'].items()}
    return vocab

def tokens2ids(tokens, token2id):
    seq = []
    for token in tokens:
        token_id = token2id.get(token, len(token2id) - 1)
        seq.append(token_id)
    return seq

#  data set
class TextDataset(Dataset):
    def __init__(self, df, vocab=None, num_max=None, max_seq_len=100,
                 max_vocab_size=95000):
        if num_max is not None:
            df = df[:num_max]

        self.src_sents = df['question_text'].tolist()
        self.qids = df['qid'].values
        if vocab is None:
            src_counter = build_counter(self.src_sents)
            vocab = build_vocab(src_counter, max_vocab_size)
        self.vocab = vocab
        if 'src_seqs' not in df.columns:
            self.src_seqs = []
            for sent in tqdm(self.src_sents, desc='tokenize'):
                seq = tokens2ids(sent.split()[:max_seq_len], vocab['token2id'])
                self.src_seqs.append(seq)
        else:
            self.src_seqs = df['src_seqs'].tolist()
        if 'target' in df.columns:
            self.targets = df['target'].values
        else:
            self.targets = np.random.randint(2, size=(len(self.src_sents),))
        self.max_seq_len = max_seq_len

    def __len__(self):
        return len(self.src_sents)

    # for bucket iterator
    def get_keys(self):
        lens = np.fromiter(
            tqdm(((min(self.max_seq_len, len(c.split()))) for c in self.src_sents), desc='generate lens'),
            dtype=np.int32)
        return lens

    def __getitem__(self, index):
        return self.qids[index], self.src_sents[index], self.src_seqs[index], self.targets[index]

In [7]:
#  dynamic padding
def _pad_sequences(seqs):
    lens = [len(seq) for seq in seqs]
    max_len = max(lens)

    padded_seqs = torch.zeros(len(seqs), max_len).long()
    for i, seq in enumerate(seqs):
        end = lens[i]
        padded_seqs[i, :end] = torch.LongTensor(seq)
    return padded_seqs, lens


def collate_fn(data):
    qids, src_sents, src_seqs, targets, = zip(*data)
    src_seqs, src_lens = _pad_sequences(src_seqs)
    return qids, src_sents, src_seqs, src_lens, torch.FloatTensor(targets)


#  bucket iterator
def divide_chunks(l, n):
    if n == len(l):
        yield np.arange(len(l), dtype=np.int32), l
    else:
        # looping till length l
        for i in range(0, len(l), n):
            data = l[i:i + n]
            yield np.arange(i, i + len(data), dtype=np.int32), data


def prepare_buckets(lens, bucket_size, batch_size, shuffle_data=True, indices=None):
    lens = -lens
    assert bucket_size % batch_size == 0 or bucket_size == len(lens)
    if indices is None:
        if shuffle_data:
            indices = shuffle(np.arange(len(lens), dtype=np.int32))
            lens = lens[indices]
        else:
            indices = np.arange(len(lens), dtype=np.int32)
    new_indices = []
    extra_batch = None
    for chunk_index, chunk in (divide_chunks(lens, bucket_size)):
        # sort indices in bucket by descending order of length
        indices_sorted = chunk_index[np.argsort(chunk, axis=-1)]
        batches = []
        for _, batch in divide_chunks(indices_sorted, batch_size):
            if len(batch) == batch_size:
                batches.append(batch.tolist())
            else:
                assert extra_batch is None
                assert batch is not None
                extra_batch = batch
        # shuffling batches within buckets
        if shuffle_data:
            batches = shuffle(batches)
        for batch in batches:
            new_indices.extend(batch)

    if extra_batch is not None:
        new_indices.extend(extra_batch)
    return indices[new_indices]


class BucketSampler(Sampler):

    def __init__(self, data_source, sort_keys, bucket_size=None, batch_size=1536, shuffle_data=True):
        super().__init__(data_source)
        self.shuffle = shuffle_data
        self.batch_size = batch_size
        self.sort_keys = sort_keys
        self.bucket_size = bucket_size if bucket_size is not None else len(sort_keys)
        if not shuffle_data:
            self.index = prepare_buckets(self.sort_keys, bucket_size=self.bucket_size, batch_size=self.batch_size,
                                         shuffle_data=self.shuffle)
        else:
            self.index = None
        self.weights = None

    def set_weights(self, w):
        assert w >= 0
        total = np.sum(w)
        if total != 1:
            w = w / total
        self.weights = w

    def __iter__(self):
        indices = None
        if self.weights is not None:
            total = len(self.sort_keys)

            indices = np.random.choice(total, (total,), p=self.weights)
        if self.shuffle:
            self.index = prepare_buckets(self.sort_keys, bucket_size=self.bucket_size, batch_size=self.batch_size,
                                         shuffle_data=self.shuffle, indices=indices)
        return iter(self.index)

    def get_reverse_indexes(self):
        indexes = np.zeros((len(self.index),), dtype=np.int32)
        for i, j in enumerate(self.index):
            indexes[j] = i
        return indexes

    def __len__(self):
        return len(self.sort_keys)

In [8]:
# embedding stuffs
def read_embedding(embedding_file):
    """
    read embedding file into a dictionary
    each line of the embedding file should in the format like  word 0.13 0.22 ... 0.44
    :param embedding_file: path of the embedding.
    :return: a dictionary of word to its embedding (numpy array)
    """
    if os.path.basename(embedding_file) != 'wiki-news-300d-1M.vec':
        skip_head = None
    else:
        skip_head = 0
    if os.path.basename(embedding_file) == 'paragram_300_sl999.txt':
        encoding = 'latin'
    else:
        encoding = 'utf-8'
    embeddings_index = {}
    t_chunks = pd.read_csv(embedding_file, index_col=0, skiprows=skip_head, encoding=encoding, sep=' ', header=None,
                           quoting=3,
                           doublequote=False, quotechar=None, engine='c', na_filter=False, low_memory=True,
                           chunksize=10000)
    for t in t_chunks:
        for k, v in zip(t.index.values, t.values):
            embeddings_index[k] = v.astype(np.float32)
    return embeddings_index


def get_emb(embedding_index, word, word_raw):
    if word == word_raw:
        return None
    else:
        return embedding_index.get(word, None)


def embedding2numpy(embedding_path, word_index, num_words, embed_size, emb_mean=0., emb_std=0.5,
                    report_stats=False):
    embedding_index = read_embedding(embedding_path)
    num_words = min(num_words + 2, len(word_index))
    if report_stats:
        all_coefs = []
        for v in embedding_index.values():
            all_coefs.append(v.reshape([-1, 1]))
        all_coefs = np.concatenate(all_coefs)
        print(all_coefs.mean(), all_coefs.std(), np.linalg.norm(all_coefs, axis=-1).mean())
    embedding_matrix = np.zeros((num_words, embed_size), dtype=np.float32)
    oov = 0
    oov_cap = 0
    oov_upper = 0
    oov_lower = 0
    for word, i in word_index.items():
        if i == 0:  # padding
            continue
        if i >= num_words:
            continue
        embedding_vector = embedding_index.get(word, None)
        if embedding_vector is None:
            embedding_vector = get_emb(embedding_index, word.lower(), word)
            if embedding_vector is None:
                embedding_vector = get_emb(embedding_index, word.upper(), word)
                if embedding_vector is None:
                    embedding_vector = get_emb(embedding_index, word.capitalize(), word)
                    if embedding_vector is None:
                        oov += 1
                        # embedding_vector = (np.zeros((1, embed_size)))
                        embedding_vector = np.random.normal(emb_mean, emb_std, size=(1, embed_size))
                    else:
                        oov_lower += 1
                else:
                    oov_upper += 1
            else:
                oov_cap += 1

        embedding_matrix[i] = embedding_vector

    print('oov %d/%d/%d/%d/%d' % (oov, oov_cap, oov_upper, oov_lower, len(word_index)))
    return embedding_matrix


def load_embedding(vocab, max_vocab_size, embed_size):
    # load embedding
    embedding_matrix1 = embedding2numpy(embedding_glove, vocab['token2id'], max_vocab_size, embed_size,
                                        emb_mean=-0.005838499, emb_std=0.48782197, report_stats=False)
    # -0.005838499 0.48782197 0.37823704
    # oov 9196
    # embedding_matrix2 = embedding2numpy(embedding_fasttext, vocab.token2id, max_vocab_size, embed_size,
    #                                    report_stats=False, emb_mean=-0.0033469985, emb_std=0.109855495, )
    # -0.0033469985 0.109855495 0.07475414
    # oov 12885
    embedding_matrix2 = embedding2numpy(embedding_para, vocab['token2id'], max_vocab_size, embed_size,
                                        emb_mean=-0.0053247833, emb_std=0.49346462, report_stats=False)
    # -0.0053247833 0.49346462 0.3828983
    # oov 9061
    # embedding_w2v
    # -0.003527845 0.13315111 0.09407869
    # oov 18927
    return [embedding_matrix1, embedding_matrix2]

In [9]:
# cyclic learning rate
def set_lr(optimizer, lr):
    for g in optimizer.param_groups:
        g['lr'] = lr


class CyclicLR:
    def __init__(self, optimizer, base_lr=0.001, max_lr=0.002, step_size=300., mode='triangular',
                 gamma=0.99994, scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()
        self.optimizer = optimizer
        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1 / (2. ** (x - 1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma ** x
                self.scale_mode = 'iterations'
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}
        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        if new_base_lr is not None:
            self.base_lr = new_base_lr
        if new_max_lr is not None:
            self.max_lr = new_max_lr
        if new_step_size is not None:
            self.step_size = new_step_size
        self.clr_iterations = 0.

    def clr(self):
        cycle = np.floor(1 + self.clr_iterations / (2 * self.step_size))
        x = np.abs(self.clr_iterations / self.step_size - 2 * cycle + 1)
        if self.scale_mode == 'cycle':
            return self.base_lr + (self.max_lr - self.base_lr) * np.maximum(0, (1 - x)) * self.scale_fn(cycle)
        else:
            return self.base_lr + (self.max_lr - self.base_lr) * np.maximum(0, (1 - x)) * self.scale_fn(
                self.clr_iterations)

    def on_train_begin(self):
        if self.clr_iterations == 0:
            set_lr(self.optimizer, self.base_lr)
        else:
            set_lr(self.optimizer, self.clr())

    def on_batch_end(self):
        self.trn_iterations += 1
        self.clr_iterations += 1
        set_lr(self.optimizer, self.clr())

In [10]:
# model

class Capsule(nn.Module):
    def __init__(self, input_dim_capsule=1024, num_capsule=5, dim_capsule=5, routings=4):
        super(Capsule, self).__init__()
        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings
        self.activation = self.squash
        self.W = nn.Parameter(
            nn.init.xavier_normal_(torch.empty(1, input_dim_capsule, self.num_capsule * self.dim_capsule)))

    def forward(self, x):
        u_hat_vecs = torch.matmul(x, self.W)
        batch_size = x.size(0)
        input_num_capsule = x.size(1)
        u_hat_vecs = u_hat_vecs.view((batch_size, input_num_capsule,
                                      self.num_capsule, self.dim_capsule))
        u_hat_vecs = u_hat_vecs.permute(0, 2, 1,
                                        3).contiguous()  # (batch_size,num_capsule,input_num_capsule,dim_capsule)
        with torch.no_grad():
            b = torch.zeros_like(u_hat_vecs[:, :, :, 0])
        for i in range(self.routings):
            c = torch.nn.functional.softmax(b, dim=1)  # (batch_size,num_capsule,input_num_capsule)
            outputs = self.activation(torch.sum(c.unsqueeze(-1) * u_hat_vecs, dim=2))  # bij,bijk->bik
            if i < self.routings - 1:
                b = (torch.sum(outputs.unsqueeze(2) * u_hat_vecs, dim=-1))  # bik,bijk->bij
        return outputs  # (batch_size, num_capsule, dim_capsule)

    def squash(self, x, axis=-1):
        s_squared_norm = (x ** 2).sum(axis, keepdim=True)
        scale = torch.sqrt(s_squared_norm + 1e-7)
        return x / scale


#  model
class Attention(nn.Module):
    def __init__(self, feature_dim, max_seq_len=70):
        super().__init__()
        self.attention_fc = nn.Linear(feature_dim, 1)
        self.bias = nn.Parameter(torch.zeros(1, max_seq_len, 1, requires_grad=True))

    def forward(self, rnn_output):
        """
        forward attention scores and attended vectors
        :param rnn_output: (#batch,#seq_len,#feature)
        :return: attended_outputs (#batch,#feature)
        """
        attention_weights = self.attention_fc(rnn_output)
        seq_len = rnn_output.size(1)
        attention_weights = self.bias[:, :seq_len, :] + attention_weights
        attention_weights = torch.tanh(attention_weights)
        attention_weights = torch.exp(attention_weights)
        attention_weights_sum = torch.sum(attention_weights, dim=1, keepdim=True) + 1e-7
        attention_weights = attention_weights / attention_weights_sum
        attended = torch.sum(attention_weights * rnn_output, dim=1)
        return attended


class InsincereModel(nn.Module):
    def __init__(self, device, hidden_dim, hidden_dim_fc, embedding_matrixs, vocab_size=None, embedding_dim=None,
                 dropout=0.1, num_capsule=5, dim_capsule=5, capsule_out_dim=1, alpha=0.8, beta=0.8,
                 finetuning_vocab_size=120002,
                 embedding_mode='mixup', max_seq_len=70):
        super(InsincereModel, self).__init__()
        self.beta = beta
        self.embedding_mode = embedding_mode
        self.finetuning_vocab_size = finetuning_vocab_size
        self.alpha = alpha
        vocab_size, embedding_dim = embedding_matrixs[0].shape
        self.raw_embedding_weights = embedding_matrixs
        self.embedding_0 = nn.Embedding(vocab_size, embedding_dim, padding_idx=0).from_pretrained(
            torch.from_numpy(embedding_matrixs[0]))
        self.embedding_1 = nn.Embedding(vocab_size, embedding_dim, padding_idx=0).from_pretrained(
            torch.from_numpy(embedding_matrixs[1]))
        self.embedding_mean = nn.Embedding(vocab_size, embedding_dim, padding_idx=0).from_pretrained(
            torch.from_numpy((embedding_matrixs[0] + embedding_matrixs[1]) / 2))
        self.learnable_embedding = nn.Embedding(finetuning_vocab_size, embedding_dim, padding_idx=0)
        nn.init.constant_(self.learnable_embedding.weight, 0)
        self.learn_embedding = False
        self.spatial_dropout = nn.Dropout2d(p=0.2)
        self.device = device
        self.hidden_dim = hidden_dim
        self.rnn0 = nn.LSTM(embedding_dim, int(hidden_dim / 2), num_layers=1, bidirectional=True, batch_first=True)
        self.rnn1 = nn.GRU(hidden_dim, int(hidden_dim / 2), num_layers=1, bidirectional=True, batch_first=True)
        self.capsule = Capsule(input_dim_capsule=self.hidden_dim, num_capsule=num_capsule, dim_capsule=dim_capsule)
        self.dropout2 = nn.Dropout(0.3)
        self.lincaps = nn.Linear(num_capsule * dim_capsule, capsule_out_dim)
        self.attention1 = Attention(self.hidden_dim, max_seq_len=max_seq_len)
        self.attention2 = Attention(self.hidden_dim, max_seq_len=max_seq_len)
        self.fc = nn.Linear(hidden_dim * 4 + capsule_out_dim, hidden_dim_fc)
        self.norm = torch.nn.LayerNorm(hidden_dim * 4 + capsule_out_dim)
        self.dropout1 = nn.Dropout(0.2)
        self.dropout_linear = nn.Dropout(p=dropout)
        self.hidden2out = nn.Linear(hidden_dim_fc, 1)

    def set_embedding_mode(self, embedding_mode):
        self.embedding_mode = embedding_mode

    def enable_learning_embedding(self):
        self.learn_embedding = True

    def init_weights(self):
        ih = (param.data for name, param in self.named_parameters() if 'weight_ih' in name)
        hh = (param.data for name, param in self.named_parameters() if 'weight_hh' in name)
        b = (param.data for name, param in self.named_parameters() if 'bias' in name)
        for k in ih:
            nn.init.xavier_uniform_(k)
        for k in hh:
            nn.init.orthogonal_(k)
        for k in b:
            nn.init.constant_(k, 0)

    def apply_spatial_dropout(self, emb):
        emb = emb.permute(0, 2, 1).unsqueeze(-1)
        emb = self.spatial_dropout(emb).squeeze(-1).permute(0, 2, 1)
        return emb

    def forward(self, seqs, lens, return_logits=True):
        # forward embeddings
        if self.embedding_mode == 'mixup':
            emb0 = self.embedding_0(seqs)  # batch_size x seq_len x embedding_dim
            emb1 = self.embedding_1(seqs)
            prob = np.random.beta(self.alpha, self.beta, size=(seqs.size(0), 1, 1)).astype(np.float32)
            prob = torch.from_numpy(prob).to(self.device)
            emb = emb0 * prob + emb1 * (1 - prob)
        elif self.embedding_mode == 'emb0':
            emb = self.embedding_0(seqs)
        elif self.embedding_mode == 'emb1':
            emb = self.embedding_1(seqs)
        elif self.embedding_mode == 'mean':
            emb = self.embedding_mean(seqs)
        else:
            assert False
        if self.learn_embedding:
            seq_clamped = torch.clamp(seqs, 0, self.finetuning_vocab_size - 1)
            emb_learned = self.learnable_embedding(seq_clamped)
            emb = emb + emb_learned
        emb = self.apply_spatial_dropout(emb)
        # forward rnn encoder
        lstm_output0, _ = self.rnn0(emb)
        lstm_output1, _ = self.rnn1(lstm_output0)
        # forward capsule
        content3 = self.capsule(lstm_output1)
        batch_size = content3.size(0)
        content3 = content3.view(batch_size, -1)
        content3 = self.dropout2(content3)
        content3 = torch.relu(self.lincaps(content3))
        # forward feature extractor
        feature_att1 = self.attention1(lstm_output0)
        feature_att2 = self.attention2(lstm_output1)
        feature_avg2 = torch.mean(lstm_output1, dim=1)
        feature_max2, _ = torch.max(lstm_output1, dim=1)
        feature = torch.cat((feature_att1, feature_att2, feature_avg2, feature_max2, content3), dim=-1)
        feature = self.norm(feature)
        feature = self.dropout1(feature)
        feature = torch.relu(feature)
        # forward dense layer
        out = self.fc(feature)
        out = self.dropout_linear(out)
        out = self.hidden2out(out)  # batch_size x 1
        if not return_logits:
            out = torch.sigmoid(out)
        return out

In [11]:
#  util functions
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def margin_score(targets, predictions):
    return ((targets == 1) * (1 - predictions) + (targets == 0) * (predictions)).mean()


def report_perf(valid_dataset, predictions_va, threshold, idx, epoch_cur, desc='val set'):
    val_f1 = f1_score(valid_dataset.targets, predictions_va > threshold)
    val_auc = roc_auc_score(valid_dataset.targets, predictions_va)
    val_margin = margin_score(valid_dataset.targets, predictions_va)
    print('idx {} epoch {} {} f1 : {:.4f} auc : {:.4f} margin : {:.4f}'.format(
        idx,
        epoch_cur,
        desc,
        val_f1,
        val_auc,
        val_margin))


def get_gpu_memory_usage(device_id):
    return round(torch.cuda.max_memory_allocated(device_id) / 1000 / 1000)


def avg(loss_list):
    if len(loss_list) == 0:
        return 0
    else:
        return sum(loss_list) / len(loss_list)

In [12]:
# evaluation
def eval_model(model, data_iter, device, order_index=None):
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch_data in data_iter:
            qid_batch, src_sents, src_seqs, src_lens, tgts = batch_data
            src_seqs = src_seqs.to(device)
            out = model(src_seqs, src_lens, return_logits=False)
            predictions.append(out)
    predictions = torch.cat(predictions, dim=0)
    if order_index is not None:
        predictions = predictions[order_index]
    predictions = predictions.to('cpu').numpy().ravel()
    return predictions

In [13]:
# cross validation

def cv(train_df, test_df, device=None, n_folds=10, shared_resources=None, share=True, **kwargs):
    if device is None:
        device = torch.device("cuda:{}".format(0) if torch.cuda.is_available() else "cpu")
    max_vocab_size = kwargs['max_vocab_size']
    embed_size = kwargs['embed_size']
    threshold = kwargs['threshold']
    max_seq_len = kwargs['max_seq_len']
    if shared_resources is None:
        shared_resources = {}
    if share:
        if 'vocab' not in shared_resources:
            # also include the test set

            counter = build_counter(chain(train_df['question_text'], test_df['question_text']))
            vocab = build_vocab(counter, max_vocab_size=max_vocab_size)
            shared_resources['vocab'] = vocab
            # tokenize sentences
            seqs = []
            for sent in tqdm(train_df['question_text'], desc='tokenize'):
                seq = tokens2ids(sent.split()[:max_seq_len], vocab['token2id'])
                seqs.append(seq)
            train_df['src_seqs'] = seqs
            seqs = []
            for sent in tqdm(test_df['question_text'], desc='tokenize'):
                seq = tokens2ids(sent.split()[:max_seq_len], vocab['token2id'])
                seqs.append(seq)
            test_df['src_seqs'] = seqs
    if 'embedding_matrix' not in shared_resources:
        embedding_matrix = load_embedding(shared_resources['vocab'], max_vocab_size, embed_size)
        shared_resources['embedding_matrix'] = embedding_matrix
    splits = list(
        StratifiedKFold(n_splits=n_folds, shuffle=True).split(train_df['target'], train_df['target']))
    scores = []
    best_threshold = []
    best_threshold_global = None
    best_score = -1
    predictions_train_reduced = []
    targets_train = []
    predictions_tes_reduced = np.zeros((len(test_df), n_folds))
    predictions_te =  np.zeros((len(test_df),))
    for idx, (train_idx, valid_idx) in enumerate(splits):
        grow_df = train_df.iloc[train_idx].reset_index(drop=True)
        dev_df = train_df.iloc[valid_idx].reset_index(drop=True)
        predictions_te_i, predictions_va, targets_va, best_threshold_i = main(grow_df, dev_df, test_df, device,
                                                                              **kwargs,
                                                                              idx=idx,
                                                                              shared_resources=shared_resources,
                                                                              return_reduced=True)
        # predictions_va_raw shape (#len_va,n_models)
        predictions_tes_reduced[:, idx] = predictions_te_i
        scores.append([f1_score(targets_va, predictions_va > threshold), roc_auc_score(targets_va, predictions_va)])
        best_threshold.append(best_threshold_i)
        predictions_te += predictions_te_i / n_folds
        predictions_train_reduced.append(predictions_va)
        targets_train.append(targets_va)
    # calculate model coefficient
    coeff = (np.corrcoef(predictions_tes_reduced, rowvar=False).sum() - n_folds) / n_folds / (n_folds - 1)
    # create data set for stacking
    predictions_train_reduced = np.concatenate(predictions_train_reduced)
    targets_train = np.concatenate(targets_train)  # len_train
    # train optimal combining weights

    # simple average
    for t in np.arange(0, 1, 0.01):
        score = f1_score(targets_train, predictions_train_reduced > t)
        if score > best_score:
            best_score = score
            best_threshold_global = t
    print('avg of best threshold {} macro-f1 best threshold {} best score {}'.format(best_threshold,
                                                                                     best_threshold_global, best_score))
    return predictions_te, predictions_te, scores, best_threshold_global, coeff

In [26]:
#main routine
def main(train_df, valid_df, test_df, device=None, epochs=3, fine_tuning_epochs=3, batch_size=512, learning_rate=0.001,
         learning_rate_max_offset=0.001, dropout=0.1,
         threshold=None,
         max_vocab_size=95000, embed_size=300, max_seq_len=70, print_every_step=500, idx=0, shared_resources=None,
         return_reduced=True):
    if device is None:
        device = torch.device("cuda:{}".format(0) if torch.cuda.is_available() else "cpu")

    if shared_resources is None:
        shared_resources = {}
    batch_time = AverageMeter()
    data_time = AverageMeter()
    mean_len = AverageMeter()
    # build vocab of raw df

    if 'vocab' not in shared_resources:
        counter = build_counter(chain(train_df['question_text'], test_df['question_text']))
        vocab = build_vocab(counter, max_vocab_size=max_vocab_size)
    else:
        vocab = shared_resources['vocab']
    if 'embedding_matrix' not in shared_resources:
        embedding_matrix = load_embedding(vocab, max_vocab_size, embed_size)
    else:
        embedding_matrix = shared_resources['embedding_matrix']
    # create test dataset
    test_dataset = TextDataset(test_df, vocab=vocab, max_seq_len=max_seq_len)
    tb = BucketSampler(test_dataset, test_dataset.get_keys(), batch_size=batch_size,
                       shuffle_data=False)
    test_iter = DataLoader(dataset=test_dataset,
                           batch_size=batch_size,
                           sampler=tb,
                           # shuffle=False,
                           num_workers=0,
                           collate_fn=collate_fn)

    train_dataset = TextDataset(train_df, vocab=vocab, max_seq_len=max_seq_len)
    # keys = train_dataset.get_keys()  # for bucket sorting
    valid_dataset = TextDataset(valid_df, vocab=vocab, max_seq_len=max_seq_len)
    vb = BucketSampler(valid_dataset, valid_dataset.get_keys(), batch_size=batch_size,
                       shuffle_data=False)
    valid_index_reverse = vb.get_reverse_indexes()
    # init model and optimizers
    model = InsincereModel(device, hidden_dim=256, hidden_dim_fc=16, dropout=dropout,
                           embedding_matrixs=embedding_matrix,
                           vocab_size=len(vocab['token2id']),
                           embedding_dim=embed_size, max_seq_len=max_seq_len)
    if idx == 0:
        print(model)
        print('total trainable {}'.format(count_parameters(model)))
    model = model.to(device)
    optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad], lr=learning_rate)

    # init iterator
    train_iter = DataLoader(dataset=train_dataset,
                            batch_size=batch_size,
                            # shuffle=True,
                            # sampler=NegativeSubSampler(train_dataset, train_dataset.targets),
                            sampler=BucketSampler(train_dataset, train_dataset.get_keys(), bucket_size=batch_size * 20,
                                                  batch_size=batch_size),
                            num_workers=0,
                            collate_fn=collate_fn)

    valid_iter = DataLoader(dataset=valid_dataset,
                            batch_size=batch_size,
                            sampler=vb,
                            # shuffle=False,
                            collate_fn=collate_fn)

    # train model

    loss_list = []
    global_steps = 0
    total_steps = epochs * len(train_iter)
    loss_fn = torch.nn.BCEWithLogitsLoss()
    end = time.time()
    predictions_tes = []
    predictions_vas = []
    n_fge = 0
    clr = CyclicLR(optimizer, base_lr=learning_rate, max_lr=learning_rate + learning_rate_max_offset,
                   step_size=300, mode='exp_range')
    clr.on_train_begin()
    fine_tuning_epochs = epochs - fine_tuning_epochs
    predictions_te = None
    for epoch in tqdm(range(epochs)):

        fine_tuning = epoch >= fine_tuning_epochs
        start_fine_tuning = fine_tuning_epochs == epoch
        if start_fine_tuning:
            model.enable_learning_embedding()
            optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad], lr=learning_rate)
            # fine tuning embedding layer
            global_steps = 0
            total_steps = (epochs - fine_tuning_epochs) * len(train_iter)
            clr = CyclicLR(optimizer, base_lr=learning_rate, max_lr=learning_rate + learning_rate_max_offset,
                           step_size=int(len(train_iter) / 8))
            clr.on_train_begin()
            predictions_te = np.zeros((len(test_df),))
            predictions_va = np.zeros((len(valid_dataset.targets),))
        for batch_data in train_iter:
            data_time.update(time.time() - end)
            qids, src_sents, src_seqs, src_lens, tgts = batch_data
            mean_len.update(sum(src_lens))
            src_seqs = src_seqs.to(device)
            tgts = tgts.to(device)
            model.train()
            optimizer.zero_grad()

            out = model(src_seqs, src_lens, return_logits=True).view(-1)
            loss = loss_fn(out, tgts)
            loss.backward()
            optimizer.step()

            loss_list.append(loss.detach().to('cpu').item())

            global_steps += 1
            batch_time.update(time.time() - end)
            end = time.time()
            if global_steps % print_every_step == 0:
                curr_gpu_memory_usage = get_gpu_memory_usage(device_id=torch.cuda.current_device())
                print('Global step: {}/{} Total loss: {:.4f}  Current GPU memory '
                      'usage: {} maxlen {} '.format(global_steps, total_steps, avg(loss_list), curr_gpu_memory_usage,
                                                    mean_len.avg))
                loss_list = []

                # print(f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                #      f'Data {data_time.val:.3f} ({data_time.avg:.3f})\t')
            if fine_tuning and global_steps % (2 * clr.step_size) == 0:
                predictions_te_tmp2 = eval_model(model, test_iter, device)
                predictions_va_tmp2 = eval_model(model, valid_iter, device, valid_index_reverse)
                report_perf(valid_dataset, predictions_va_tmp2, threshold, idx, epoch,
                            desc='val set mean')
                predictions_te = predictions_te * n_fge + (
                    predictions_te_tmp2)
                predictions_va = predictions_va * n_fge + (
                    predictions_va_tmp2)
                predictions_te /= n_fge + 1
                predictions_va /= n_fge + 1
                report_perf(valid_dataset, predictions_va, threshold, idx, epoch
                            , desc='val set (fge)')
                predictions_tes.append(predictions_te_tmp2.reshape([-1, 1]))
                predictions_vas.append(predictions_va_tmp2.reshape([-1, 1]))
                n_fge += 1

            clr.on_batch_end()
        if not fine_tuning:
            predictions_va = eval_model(model, valid_iter, device, valid_index_reverse)
            report_perf(valid_dataset, predictions_va, threshold, idx, epoch)
    # pprint(model.attention1.bias.data.to('cpu'))
    # pprint(model.attention2.bias.data.to('cpu'))
    # reorder index
    if predictions_te is not None:
        predictions_te = predictions_te[tb.get_reverse_indexes()]
    else:
        predictions_te = eval_model(model, test_iter, device, tb.get_reverse_indexes())
    best_score = -1
    best_threshold = None
    for t in np.arange(0, 1, 0.01):
        score = f1_score(valid_dataset.targets, predictions_va > t)
        if score > best_score:
            best_score = score
            best_threshold = t
    print('best threshold on validation set: {:.2f} score {:.4f}'.format(best_threshold, best_score))
    if not return_reduced and len(predictions_vas) > 0:
        predictions_te = np.concatenate(predictions_tes, axis=1)
        predictions_te = predictions_te[tb.get_reverse_indexes(), :]
        predictions_va = np.concatenate(predictions_vas, axis=1)
    # # Save the model
    # model_path = 'trained_model.pth'
    # save_model(model, model_path)
    # print("Trained model saved to:", model_path)

    # # Make predictions
    # predictions_te, _, _, _ = main(train_df, test_df, test_df, **args)

    # # Calculate accuracy
    # accuracy = calculate_accuracy(test_df['target'].values, predictions_te)
    # print(f"Accuracy: {accuracy:.4f}")
    # make predictions
    return predictions_te, predictions_va, valid_dataset.targets, best_threshold

In [15]:
import torch
print(torch.__version__)
print(torch.version.cuda)

2.1.0.dev20230722+cu121
12.1


In [16]:
# seeding
set_seed(233)
epochs = 8
batch_size = 512
learning_rate = 0.001
learning_rate_max_offset = 0.002
fine_tuning_epochs = 2
threshold = 0.31
max_vocab_size = 120000
embed_size = 300
print_every_step = 500
max_seq_len = 70
share = True
dropout = 0.1
sub = pd.read_csv('../input/sample_submission.csv')
train_df, test_df = load_data()
# shuffling
trn_idx = np.random.permutation(len(train_df))
train_df = train_df.iloc[trn_idx].reset_index(drop=True)
n_folds = 5
n_repeats = 1
args = {'epochs': epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'threshold': threshold,
        'max_vocab_size': max_vocab_size,
        'embed_size': embed_size, 'print_every_step': print_every_step, 'dropout': dropout,
        'learning_rate_max_offset': learning_rate_max_offset,
        'fine_tuning_epochs': fine_tuning_epochs, 'max_seq_len': max_seq_len}
predictions_te_all = np.zeros((len(test_df),))
for _ in range(n_repeats):
    if n_folds > 1:
        _, predictions_te, _, threshold, coeffs = cv(train_df, test_df, n_folds=n_folds, share=share, **args)
        print('coeff between predictions {}'.format(coeffs))
    else:
        predictions_te, _, _, _ = main(train_df, test_df, test_df, **args)
    predictions_te_all += predictions_te / n_repeats
sub.prediction = predictions_te_all > threshold
sub.to_csv("submission.csv", index=False)

preprocssing 12.032110214233398s


building conuter: 1681928it [00:06, 261139.29it/s]
building vocab: 120000it [00:00, 1344579.60it/s]
tokenize: 100%|██████████| 1306122/1306122 [00:09<00:00, 141646.16it/s]
tokenize: 100%|██████████| 375806/375806 [00:02<00:00, 158546.41it/s]


oov 6264/308/297/761/120002
oov 6162/50165/0/0/120002


generate lens: 375806it [00:00, 1047527.24it/s]
generate lens: 261225it [00:00, 873327.87it/s]


InsincereModel(
  (embedding_0): Embedding(120002, 300)
  (embedding_1): Embedding(120002, 300)
  (embedding_mean): Embedding(120002, 300)
  (learnable_embedding): Embedding(120002, 300, padding_idx=0)
  (spatial_dropout): Dropout2d(p=0.2, inplace=False)
  (rnn0): LSTM(300, 128, batch_first=True, bidirectional=True)
  (rnn1): GRU(256, 128, batch_first=True, bidirectional=True)
  (capsule): Capsule()
  (dropout2): Dropout(p=0.3, inplace=False)
  (lincaps): Linear(in_features=25, out_features=1, bias=True)
  (attention1): Attention(
    (attention_fc): Linear(in_features=256, out_features=1, bias=True)
  )
  (attention2): Attention(
    (attention_fc): Linear(in_features=256, out_features=1, bias=True)
  )
  (fc): Linear(in_features=1025, out_features=16, bias=True)
  (norm): LayerNorm((1025,), eps=1e-05, elementwise_affine=True)
  (dropout1): Dropout(p=0.2, inplace=False)
  (dropout_linear): Dropout(p=0.1, inplace=False)
  (hidden2out): Linear(in_features=16, out_features=1, bias=True)


generate lens: 1044897it [00:01, 861464.05it/s]
  0%|          | 0/8 [00:00<?, ?it/s]

Global step: 500/16328 Total loss: 0.1391  Current GPU memory usage: 1735 maxlen 7561.488 
Global step: 1000/16328 Total loss: 0.1131  Current GPU memory usage: 1735 maxlen 7559.982 
Global step: 1500/16328 Total loss: 0.1104  Current GPU memory usage: 1735 maxlen 7562.858666666667 
Global step: 2000/16328 Total loss: 0.1080  Current GPU memory usage: 1735 maxlen 7558.7305 


 12%|█▎        | 1/8 [02:06<14:48, 126.95s/it]

idx 0 epoch 0 val set f1 : 0.6689 auc : 0.9639 margin : 0.0598
Global step: 2500/16328 Total loss: 0.1037  Current GPU memory usage: 1735 maxlen 7560.096 
Global step: 3000/16328 Total loss: 0.1029  Current GPU memory usage: 1735 maxlen 7559.910333333333 
Global step: 3500/16328 Total loss: 0.1015  Current GPU memory usage: 1735 maxlen 7559.757142857143 
Global step: 4000/16328 Total loss: 0.1009  Current GPU memory usage: 1735 maxlen 7559.26375 


 25%|██▌       | 2/8 [04:03<12:03, 120.60s/it]

idx 0 epoch 1 val set f1 : 0.6745 auc : 0.9674 margin : 0.0593
Global step: 4500/16328 Total loss: 0.0956  Current GPU memory usage: 1735 maxlen 7555.108 
Global step: 5000/16328 Total loss: 0.0966  Current GPU memory usage: 1735 maxlen 7558.5134 
Global step: 5500/16328 Total loss: 0.0962  Current GPU memory usage: 1735 maxlen 7558.641636363636 
Global step: 6000/16328 Total loss: 0.0968  Current GPU memory usage: 1735 maxlen 7559.131 


 38%|███▊      | 3/8 [05:58<09:51, 118.25s/it]

idx 0 epoch 2 val set f1 : 0.6886 auc : 0.9698 margin : 0.0551
Global step: 6500/16328 Total loss: 0.0913  Current GPU memory usage: 1735 maxlen 7557.107692307693 
Global step: 7000/16328 Total loss: 0.0900  Current GPU memory usage: 1735 maxlen 7557.493 
Global step: 7500/16328 Total loss: 0.0923  Current GPU memory usage: 1735 maxlen 7557.6736 
Global step: 8000/16328 Total loss: 0.0927  Current GPU memory usage: 1735 maxlen 7557.80225 


 50%|█████     | 4/8 [07:53<07:48, 117.03s/it]

idx 0 epoch 3 val set f1 : 0.6874 auc : 0.9695 margin : 0.0592
Global step: 8500/16328 Total loss: 0.0877  Current GPU memory usage: 1735 maxlen 7559.228588235294 
Global step: 9000/16328 Total loss: 0.0867  Current GPU memory usage: 1735 maxlen 7557.801111111111 
Global step: 9500/16328 Total loss: 0.0875  Current GPU memory usage: 1735 maxlen 7557.525368421053 
Global step: 10000/16328 Total loss: 0.0874  Current GPU memory usage: 1735 maxlen 7558.337 


 62%|██████▎   | 5/8 [09:49<05:49, 116.42s/it]

idx 0 epoch 4 val set f1 : 0.6912 auc : 0.9702 margin : 0.0507
Global step: 10500/16328 Total loss: 0.0836  Current GPU memory usage: 1735 maxlen 7558.371904761905 
Global step: 11000/16328 Total loss: 0.0821  Current GPU memory usage: 1735 maxlen 7556.566181818182 
Global step: 11500/16328 Total loss: 0.0843  Current GPU memory usage: 1735 maxlen 7559.022434782609 
Global step: 12000/16328 Total loss: 0.0835  Current GPU memory usage: 1735 maxlen 7558.644083333334 


 75%|███████▌  | 6/8 [11:44<03:52, 116.03s/it]

idx 0 epoch 5 val set f1 : 0.6861 auc : 0.9693 margin : 0.0508
Global step: 500/4082 Total loss: 0.0836  Current GPU memory usage: 2024 maxlen 7558.218107641613 
idx 0 epoch 6 val set mean f1 : 0.6867 auc : 0.9685 margin : 0.0496
idx 0 epoch 6 val set (fge) f1 : 0.6867 auc : 0.9685 margin : 0.0496
Global step: 1000/4082 Total loss: 0.0831  Current GPU memory usage: 2024 maxlen 7558.353616186018 
idx 0 epoch 6 val set mean f1 : 0.6885 auc : 0.9685 margin : 0.0539
idx 0 epoch 6 val set (fge) f1 : 0.6942 auc : 0.9700 margin : 0.0517
Global step: 1500/4082 Total loss: 0.0859  Current GPU memory usage: 2024 maxlen 7558.596318929143 
idx 0 epoch 6 val set mean f1 : 0.6873 auc : 0.9693 margin : 0.0511
idx 0 epoch 6 val set (fge) f1 : 0.6961 auc : 0.9709 margin : 0.0515
Global step: 2000/4082 Total loss: 0.0862  Current GPU memory usage: 2024 maxlen 7558.420749684122 
idx 0 epoch 6 val set mean f1 : 0.6930 auc : 0.9698 margin : 0.0509
idx 0 epoch 6 val set (fge) f1 : 0.6998 auc : 0.9715 margin

 88%|████████▊ | 7/8 [15:47<02:37, 157.43s/it]

Global step: 2500/4082 Total loss: 0.0699  Current GPU memory usage: 2024 maxlen 7558.167842126679 
idx 0 epoch 7 val set mean f1 : 0.6837 auc : 0.9672 margin : 0.0500
idx 0 epoch 7 val set (fge) f1 : 0.7007 auc : 0.9717 margin : 0.0511
Global step: 3000/4082 Total loss: 0.0708  Current GPU memory usage: 2024 maxlen 7558.195133149678 
idx 0 epoch 7 val set mean f1 : 0.6812 auc : 0.9673 margin : 0.0516
idx 0 epoch 7 val set (fge) f1 : 0.7015 auc : 0.9717 margin : 0.0512
Global step: 3500/4082 Total loss: 0.0729  Current GPU memory usage: 2024 maxlen 7558.180236250476 
idx 0 epoch 7 val set mean f1 : 0.6812 auc : 0.9671 margin : 0.0506
idx 0 epoch 7 val set (fge) f1 : 0.7016 auc : 0.9717 margin : 0.0511
Global step: 4000/4082 Total loss: 0.0752  Current GPU memory usage: 2024 maxlen 7558.4679305675245 
idx 0 epoch 7 val set mean f1 : 0.6818 auc : 0.9676 margin : 0.0500
idx 0 epoch 7 val set (fge) f1 : 0.7005 auc : 0.9718 margin : 0.0510


100%|██████████| 8/8 [19:49<00:00, 148.70s/it]


best threshold on validation set: 0.29 score 0.7008


generate lens: 375806it [00:00, 1077460.25it/s]
generate lens: 261225it [00:00, 938752.95it/s]
generate lens: 1044897it [00:01, 969572.85it/s]
  0%|          | 0/8 [00:00<?, ?it/s]

Global step: 500/16328 Total loss: 0.1400  Current GPU memory usage: 2024 maxlen 7570.062 
Global step: 1000/16328 Total loss: 0.1153  Current GPU memory usage: 2024 maxlen 7555.91 
Global step: 1500/16328 Total loss: 0.1092  Current GPU memory usage: 2024 maxlen 7553.864 
Global step: 2000/16328 Total loss: 0.1069  Current GPU memory usage: 2024 maxlen 7554.3625 


 12%|█▎        | 1/8 [01:54<13:24, 114.94s/it]

idx 1 epoch 0 val set f1 : 0.6607 auc : 0.9629 margin : 0.0647
Global step: 2500/16328 Total loss: 0.1037  Current GPU memory usage: 2024 maxlen 7552.5432 
Global step: 3000/16328 Total loss: 0.1020  Current GPU memory usage: 2024 maxlen 7555.788666666666 
Global step: 3500/16328 Total loss: 0.1019  Current GPU memory usage: 2024 maxlen 7552.6197142857145 
Global step: 4000/16328 Total loss: 0.1000  Current GPU memory usage: 2024 maxlen 7554.69325 


 25%|██▌       | 2/8 [03:49<11:29, 114.99s/it]

idx 1 epoch 1 val set f1 : 0.6811 auc : 0.9672 margin : 0.0568
Global step: 4500/16328 Total loss: 0.0958  Current GPU memory usage: 2024 maxlen 7554.091111111111 
Global step: 5000/16328 Total loss: 0.0952  Current GPU memory usage: 2024 maxlen 7555.4988 
Global step: 5500/16328 Total loss: 0.0962  Current GPU memory usage: 2024 maxlen 7553.9729090909095 
Global step: 6000/16328 Total loss: 0.0949  Current GPU memory usage: 2024 maxlen 7555.149833333333 


 38%|███▊      | 3/8 [05:45<09:35, 115.04s/it]

idx 1 epoch 2 val set f1 : 0.6873 auc : 0.9686 margin : 0.0549
Global step: 6500/16328 Total loss: 0.0909  Current GPU memory usage: 2024 maxlen 7552.566769230769 
Global step: 7000/16328 Total loss: 0.0912  Current GPU memory usage: 2024 maxlen 7554.703 
Global step: 7500/16328 Total loss: 0.0913  Current GPU memory usage: 2024 maxlen 7554.7116 
Global step: 8000/16328 Total loss: 0.0910  Current GPU memory usage: 2024 maxlen 7555.710875 


 50%|█████     | 4/8 [07:40<07:40, 115.03s/it]

idx 1 epoch 3 val set f1 : 0.6857 auc : 0.9690 margin : 0.0556
Global step: 8500/16328 Total loss: 0.0865  Current GPU memory usage: 2024 maxlen 7553.9396470588235 
Global step: 9000/16328 Total loss: 0.0868  Current GPU memory usage: 2024 maxlen 7554.9835555555555 
Global step: 9500/16328 Total loss: 0.0872  Current GPU memory usage: 2024 maxlen 7552.796105263158 
Global step: 10000/16328 Total loss: 0.0876  Current GPU memory usage: 2024 maxlen 7554.7513 


 62%|██████▎   | 5/8 [09:35<05:45, 115.03s/it]

idx 1 epoch 4 val set f1 : 0.6886 auc : 0.9696 margin : 0.0539
Global step: 10500/16328 Total loss: 0.0824  Current GPU memory usage: 2024 maxlen 7551.9489523809525 
Global step: 11000/16328 Total loss: 0.0824  Current GPU memory usage: 2024 maxlen 7553.819181818182 
Global step: 11500/16328 Total loss: 0.0826  Current GPU memory usage: 2024 maxlen 7553.825739130435 
Global step: 12000/16328 Total loss: 0.0838  Current GPU memory usage: 2024 maxlen 7554.24225 


 75%|███████▌  | 6/8 [11:29<03:49, 114.88s/it]

idx 1 epoch 5 val set f1 : 0.6831 auc : 0.9688 margin : 0.0556
Global step: 500/4082 Total loss: 0.0817  Current GPU memory usage: 2024 maxlen 7554.310999529264 
idx 1 epoch 6 val set mean f1 : 0.6841 auc : 0.9679 margin : 0.0521
idx 1 epoch 6 val set (fge) f1 : 0.6841 auc : 0.9679 margin : 0.0521
Global step: 1000/4082 Total loss: 0.0844  Current GPU memory usage: 2024 maxlen 7554.293371583874 
idx 1 epoch 6 val set mean f1 : 0.6806 auc : 0.9684 margin : 0.0547
idx 1 epoch 6 val set (fge) f1 : 0.6892 auc : 0.9697 margin : 0.0534
Global step: 1500/4082 Total loss: 0.0853  Current GPU memory usage: 2024 maxlen 7553.960424850866 
idx 1 epoch 6 val set mean f1 : 0.6839 auc : 0.9686 margin : 0.0518
idx 1 epoch 6 val set (fge) f1 : 0.6933 auc : 0.9704 margin : 0.0529
Global step: 2000/4082 Total loss: 0.0870  Current GPU memory usage: 2024 maxlen 7554.1293696476205 
idx 1 epoch 6 val set mean f1 : 0.6893 auc : 0.9693 margin : 0.0506
idx 1 epoch 6 val set (fge) f1 : 0.6961 auc : 0.9710 margi

 88%|████████▊ | 7/8 [15:31<02:36, 156.41s/it]

Global step: 2500/4082 Total loss: 0.0688  Current GPU memory usage: 2024 maxlen 7553.7648853926485 
idx 1 epoch 7 val set mean f1 : 0.6795 auc : 0.9668 margin : 0.0497
idx 1 epoch 7 val set (fge) f1 : 0.6971 auc : 0.9710 margin : 0.0518
Global step: 3000/4082 Total loss: 0.0706  Current GPU memory usage: 2024 maxlen 7554.164370982553 
idx 1 epoch 7 val set mean f1 : 0.6790 auc : 0.9673 margin : 0.0501
idx 1 epoch 7 val set (fge) f1 : 0.6962 auc : 0.9711 margin : 0.0515
Global step: 3500/4082 Total loss: 0.0717  Current GPU memory usage: 2024 maxlen 7553.862568271307 
idx 1 epoch 7 val set mean f1 : 0.6796 auc : 0.9671 margin : 0.0488
idx 1 epoch 7 val set (fge) f1 : 0.6968 auc : 0.9711 margin : 0.0511
Global step: 4000/4082 Total loss: 0.0740  Current GPU memory usage: 2024 maxlen 7553.648036439739 
idx 1 epoch 7 val set mean f1 : 0.6802 auc : 0.9667 margin : 0.0501
idx 1 epoch 7 val set (fge) f1 : 0.6966 auc : 0.9712 margin : 0.0510


100%|██████████| 8/8 [19:33<00:00, 146.71s/it]


best threshold on validation set: 0.31 score 0.6966


generate lens: 375806it [00:00, 1111130.81it/s]
generate lens: 261224it [00:00, 922607.65it/s]
generate lens: 1044898it [00:01, 965544.68it/s]
  0%|          | 0/8 [00:00<?, ?it/s]

Global step: 500/16328 Total loss: 0.1380  Current GPU memory usage: 2024 maxlen 7563.98 
Global step: 1000/16328 Total loss: 0.1155  Current GPU memory usage: 2024 maxlen 7559.483 
Global step: 1500/16328 Total loss: 0.1090  Current GPU memory usage: 2024 maxlen 7554.685333333333 
Global step: 2000/16328 Total loss: 0.1072  Current GPU memory usage: 2024 maxlen 7555.57 


 12%|█▎        | 1/8 [01:57<13:43, 117.62s/it]

idx 2 epoch 0 val set f1 : 0.6553 auc : 0.9636 margin : 0.0714
Global step: 2500/16328 Total loss: 0.1050  Current GPU memory usage: 2024 maxlen 7556.1948 
Global step: 3000/16328 Total loss: 0.1023  Current GPU memory usage: 2024 maxlen 7556.0453333333335 
Global step: 3500/16328 Total loss: 0.1007  Current GPU memory usage: 2024 maxlen 7556.563428571429 
Global step: 4000/16328 Total loss: 0.0998  Current GPU memory usage: 2024 maxlen 7552.63525 


 25%|██▌       | 2/8 [05:17<16:34, 165.81s/it]

idx 2 epoch 1 val set f1 : 0.6815 auc : 0.9678 margin : 0.0571
Global step: 4500/16328 Total loss: 0.0956  Current GPU memory usage: 2024 maxlen 7553.695555555556 
Global step: 5000/16328 Total loss: 0.0953  Current GPU memory usage: 2024 maxlen 7556.2428 
Global step: 5500/16328 Total loss: 0.0970  Current GPU memory usage: 2024 maxlen 7556.235272727273 
Global step: 6000/16328 Total loss: 0.0955  Current GPU memory usage: 2024 maxlen 7556.494 


 38%|███▊      | 3/8 [08:35<15:03, 180.65s/it]

idx 2 epoch 2 val set f1 : 0.6896 auc : 0.9696 margin : 0.0529
Global step: 6500/16328 Total loss: 0.0916  Current GPU memory usage: 2024 maxlen 7556.647384615385 
Global step: 7000/16328 Total loss: 0.0905  Current GPU memory usage: 2024 maxlen 7556.623285714286 
Global step: 7500/16328 Total loss: 0.0915  Current GPU memory usage: 2024 maxlen 7556.6252 
Global step: 8000/16328 Total loss: 0.0914  Current GPU memory usage: 2024 maxlen 7557.058 


 50%|█████     | 4/8 [11:53<12:30, 187.64s/it]

idx 2 epoch 3 val set f1 : 0.6922 auc : 0.9698 margin : 0.0517
Global step: 8500/16328 Total loss: 0.0872  Current GPU memory usage: 2024 maxlen 7555.6085882352945 
Global step: 9000/16328 Total loss: 0.0871  Current GPU memory usage: 2024 maxlen 7556.728555555555 
Global step: 9500/16328 Total loss: 0.0868  Current GPU memory usage: 2024 maxlen 7555.690315789474 
Global step: 10000/16328 Total loss: 0.0867  Current GPU memory usage: 2024 maxlen 7556.4155 


 62%|██████▎   | 5/8 [15:12<09:34, 191.52s/it]

idx 2 epoch 4 val set f1 : 0.6942 auc : 0.9704 margin : 0.0541
Global step: 10500/16328 Total loss: 0.0831  Current GPU memory usage: 2024 maxlen 7554.667142857143 
Global step: 11000/16328 Total loss: 0.0819  Current GPU memory usage: 2024 maxlen 7555.7292727272725 
Global step: 11500/16328 Total loss: 0.0824  Current GPU memory usage: 2024 maxlen 7555.761217391304 
Global step: 12000/16328 Total loss: 0.0841  Current GPU memory usage: 2024 maxlen 7554.2981666666665 


 75%|███████▌  | 6/8 [18:30<06:27, 193.87s/it]

idx 2 epoch 5 val set f1 : 0.6918 auc : 0.9696 margin : 0.0522
Global step: 500/4082 Total loss: 0.0817  Current GPU memory usage: 2024 maxlen 7555.063784716774 
idx 2 epoch 6 val set mean f1 : 0.6862 auc : 0.9686 margin : 0.0551
idx 2 epoch 6 val set (fge) f1 : 0.6862 auc : 0.9686 margin : 0.0551
Global step: 1000/4082 Total loss: 0.0841  Current GPU memory usage: 2024 maxlen 7555.071342292013 
idx 2 epoch 6 val set mean f1 : 0.6904 auc : 0.9694 margin : 0.0517
idx 2 epoch 6 val set (fge) f1 : 0.6952 auc : 0.9705 margin : 0.0534
Global step: 1500/4082 Total loss: 0.0861  Current GPU memory usage: 2024 maxlen 7555.7518550851155 
idx 2 epoch 6 val set mean f1 : 0.6901 auc : 0.9700 margin : 0.0510
idx 2 epoch 6 val set (fge) f1 : 0.6994 auc : 0.9714 margin : 0.0526
Global step: 2000/4082 Total loss: 0.0873  Current GPU memory usage: 2024 maxlen 7555.538817913801 
idx 2 epoch 6 val set mean f1 : 0.6924 auc : 0.9697 margin : 0.0506
idx 2 epoch 6 val set (fge) f1 : 0.7016 auc : 0.9718 margi

 88%|████████▊ | 7/8 [25:25<04:26, 266.25s/it]

Global step: 2500/4082 Total loss: 0.0691  Current GPU memory usage: 2024 maxlen 7555.50501831005 
idx 2 epoch 7 val set mean f1 : 0.6835 auc : 0.9669 margin : 0.0490
idx 2 epoch 7 val set (fge) f1 : 0.7011 auc : 0.9718 margin : 0.0515
Global step: 3000/4082 Total loss: 0.0706  Current GPU memory usage: 2024 maxlen 7555.640955004591 
idx 2 epoch 7 val set mean f1 : 0.6824 auc : 0.9668 margin : 0.0510
idx 2 epoch 7 val set (fge) f1 : 0.7018 auc : 0.9718 margin : 0.0514
Global step: 3500/4082 Total loss: 0.0727  Current GPU memory usage: 2024 maxlen 7555.300076209831 
idx 2 epoch 7 val set mean f1 : 0.6869 auc : 0.9682 margin : 0.0513
idx 2 epoch 7 val set (fge) f1 : 0.7015 auc : 0.9719 margin : 0.0514
Global step: 4000/4082 Total loss: 0.0760  Current GPU memory usage: 2024 maxlen 7555.317555090483 
idx 2 epoch 7 val set mean f1 : 0.6879 auc : 0.9679 margin : 0.0505
idx 2 epoch 7 val set (fge) f1 : 0.7018 auc : 0.9720 margin : 0.0513


100%|██████████| 8/8 [32:21<00:00, 242.65s/it]


best threshold on validation set: 0.34 score 0.7021


generate lens: 375806it [00:00, 1065349.75it/s]
generate lens: 261224it [00:00, 986941.29it/s]
generate lens: 1044898it [00:01, 1000400.80it/s]
  0%|          | 0/8 [00:00<?, ?it/s]

Global step: 500/16328 Total loss: 0.1383  Current GPU memory usage: 2024 maxlen 7553.494 
Global step: 1000/16328 Total loss: 0.1148  Current GPU memory usage: 2024 maxlen 7549.947 
Global step: 1500/16328 Total loss: 0.1091  Current GPU memory usage: 2024 maxlen 7552.907333333334 
Global step: 2000/16328 Total loss: 0.1078  Current GPU memory usage: 2024 maxlen 7553.713 


 12%|█▎        | 1/8 [03:18<23:07, 198.19s/it]

idx 3 epoch 0 val set f1 : 0.6603 auc : 0.9617 margin : 0.0571
Global step: 2500/16328 Total loss: 0.1039  Current GPU memory usage: 2024 maxlen 7554.2844 
Global step: 3000/16328 Total loss: 0.1011  Current GPU memory usage: 2024 maxlen 7551.841666666666 
Global step: 3500/16328 Total loss: 0.1012  Current GPU memory usage: 2024 maxlen 7552.491714285714 
Global step: 4000/16328 Total loss: 0.1001  Current GPU memory usage: 2024 maxlen 7552.5765 


 25%|██▌       | 2/8 [06:36<19:49, 198.29s/it]

idx 3 epoch 1 val set f1 : 0.6743 auc : 0.9660 margin : 0.0586
Global step: 4500/16328 Total loss: 0.0970  Current GPU memory usage: 2024 maxlen 7554.174666666667 
Global step: 5000/16328 Total loss: 0.0946  Current GPU memory usage: 2024 maxlen 7553.951 
Global step: 5500/16328 Total loss: 0.0943  Current GPU memory usage: 2024 maxlen 7553.288 
Global step: 6000/16328 Total loss: 0.0970  Current GPU memory usage: 2024 maxlen 7552.526666666667 


 38%|███▊      | 3/8 [09:54<16:31, 198.34s/it]

idx 3 epoch 2 val set f1 : 0.6752 auc : 0.9674 margin : 0.0574
Global step: 6500/16328 Total loss: 0.0900  Current GPU memory usage: 2024 maxlen 7551.606615384615 
Global step: 7000/16328 Total loss: 0.0913  Current GPU memory usage: 2024 maxlen 7552.415 
Global step: 7500/16328 Total loss: 0.0900  Current GPU memory usage: 2024 maxlen 7551.210666666667 
Global step: 8000/16328 Total loss: 0.0918  Current GPU memory usage: 2024 maxlen 7552.979375 


 50%|█████     | 4/8 [13:13<13:13, 198.36s/it]

idx 3 epoch 3 val set f1 : 0.6820 auc : 0.9681 margin : 0.0544
Global step: 8500/16328 Total loss: 0.0868  Current GPU memory usage: 2024 maxlen 7551.794 
Global step: 9000/16328 Total loss: 0.0861  Current GPU memory usage: 2024 maxlen 7551.571666666667 
Global step: 9500/16328 Total loss: 0.0872  Current GPU memory usage: 2024 maxlen 7551.742736842105 
Global step: 10000/16328 Total loss: 0.0862  Current GPU memory usage: 2024 maxlen 7550.9046 


 62%|██████▎   | 5/8 [16:31<09:55, 198.36s/it]

idx 3 epoch 4 val set f1 : 0.6836 auc : 0.9686 margin : 0.0534
Global step: 10500/16328 Total loss: 0.0828  Current GPU memory usage: 2024 maxlen 7551.987904761905 
Global step: 11000/16328 Total loss: 0.0827  Current GPU memory usage: 2024 maxlen 7552.528727272727 
Global step: 11500/16328 Total loss: 0.0823  Current GPU memory usage: 2024 maxlen 7552.089565217391 
Global step: 12000/16328 Total loss: 0.0830  Current GPU memory usage: 2024 maxlen 7551.485 


 75%|███████▌  | 6/8 [19:50<06:36, 198.41s/it]

idx 3 epoch 5 val set f1 : 0.6834 auc : 0.9683 margin : 0.0531
Global step: 500/4082 Total loss: 0.0818  Current GPU memory usage: 2024 maxlen 7551.546838223757 
idx 3 epoch 6 val set mean f1 : 0.6811 auc : 0.9668 margin : 0.0513
idx 3 epoch 6 val set (fge) f1 : 0.6811 auc : 0.9668 margin : 0.0513
Global step: 1000/4082 Total loss: 0.0836  Current GPU memory usage: 2024 maxlen 7551.764683678091 
idx 3 epoch 6 val set mean f1 : 0.6819 auc : 0.9677 margin : 0.0524
idx 3 epoch 6 val set (fge) f1 : 0.6891 auc : 0.9689 margin : 0.0519
Global step: 1500/4082 Total loss: 0.0855  Current GPU memory usage: 2024 maxlen 7551.950967554198 
idx 3 epoch 6 val set mean f1 : 0.6853 auc : 0.9682 margin : 0.0517
idx 3 epoch 6 val set (fge) f1 : 0.6923 auc : 0.9697 margin : 0.0518
Global step: 2000/4082 Total loss: 0.0865  Current GPU memory usage: 2024 maxlen 7552.051944405447 
idx 3 epoch 6 val set mean f1 : 0.6843 auc : 0.9684 margin : 0.0494
idx 3 epoch 6 val set (fge) f1 : 0.6933 auc : 0.9702 margin

 88%|████████▊ | 7/8 [26:45<04:29, 269.28s/it]

Global step: 2500/4082 Total loss: 0.0692  Current GPU memory usage: 2024 maxlen 7551.969211989692 
idx 3 epoch 7 val set mean f1 : 0.6742 auc : 0.9653 margin : 0.0497
idx 3 epoch 7 val set (fge) f1 : 0.6922 auc : 0.9702 margin : 0.0509
Global step: 3000/4082 Total loss: 0.0694  Current GPU memory usage: 2024 maxlen 7552.0008526826705 
idx 3 epoch 7 val set mean f1 : 0.6755 auc : 0.9663 margin : 0.0507
idx 3 epoch 7 val set (fge) f1 : 0.6921 auc : 0.9703 margin : 0.0509
Global step: 3500/4082 Total loss: 0.0725  Current GPU memory usage: 2024 maxlen 7551.957830560143 
idx 3 epoch 7 val set mean f1 : 0.6746 auc : 0.9665 margin : 0.0516
idx 3 epoch 7 val set (fge) f1 : 0.6931 auc : 0.9704 margin : 0.0510
Global step: 4000/4082 Total loss: 0.0754  Current GPU memory usage: 2024 maxlen 7552.315646928475 
idx 3 epoch 7 val set mean f1 : 0.6795 auc : 0.9665 margin : 0.0506
idx 3 epoch 7 val set (fge) f1 : 0.6929 auc : 0.9705 margin : 0.0509


100%|██████████| 8/8 [33:40<00:00, 252.59s/it]


best threshold on validation set: 0.32 score 0.6931


generate lens: 375806it [00:00, 1110195.60it/s]
generate lens: 261224it [00:00, 1011171.51it/s]
generate lens: 1044898it [00:01, 1013476.23it/s]
  0%|          | 0/8 [00:00<?, ?it/s]

Global step: 500/16328 Total loss: 0.1387  Current GPU memory usage: 2024 maxlen 7556.674 
Global step: 1000/16328 Total loss: 0.1147  Current GPU memory usage: 2024 maxlen 7555.84 
Global step: 1500/16328 Total loss: 0.1097  Current GPU memory usage: 2024 maxlen 7554.315333333333 
Global step: 2000/16328 Total loss: 0.1080  Current GPU memory usage: 2024 maxlen 7556.012 


 12%|█▎        | 1/8 [03:18<23:07, 198.25s/it]

idx 4 epoch 0 val set f1 : 0.6697 auc : 0.9644 margin : 0.0601
Global step: 2500/16328 Total loss: 0.1058  Current GPU memory usage: 2024 maxlen 7554.188 
Global step: 3000/16328 Total loss: 0.1026  Current GPU memory usage: 2024 maxlen 7556.005666666667 
Global step: 3500/16328 Total loss: 0.1015  Current GPU memory usage: 2024 maxlen 7556.542285714286 
Global step: 4000/16328 Total loss: 0.0989  Current GPU memory usage: 2024 maxlen 7555.0055 


 25%|██▌       | 2/8 [06:36<19:50, 198.37s/it]

idx 4 epoch 1 val set f1 : 0.6836 auc : 0.9674 margin : 0.0574
Global step: 4500/16328 Total loss: 0.0971  Current GPU memory usage: 2024 maxlen 7556.230222222222 
Global step: 5000/16328 Total loss: 0.0959  Current GPU memory usage: 2024 maxlen 7554.7964 
Global step: 5500/16328 Total loss: 0.0961  Current GPU memory usage: 2024 maxlen 7558.110363636363 
Global step: 6000/16328 Total loss: 0.0959  Current GPU memory usage: 2024 maxlen 7556.116333333333 


 38%|███▊      | 3/8 [09:55<16:32, 198.41s/it]

idx 4 epoch 2 val set f1 : 0.6907 auc : 0.9694 margin : 0.0542
Global step: 6500/16328 Total loss: 0.0924  Current GPU memory usage: 2024 maxlen 7556.4478461538465 
Global step: 7000/16328 Total loss: 0.0900  Current GPU memory usage: 2024 maxlen 7556.024285714286 
Global step: 7500/16328 Total loss: 0.0918  Current GPU memory usage: 2024 maxlen 7555.7996 
Global step: 8000/16328 Total loss: 0.0922  Current GPU memory usage: 2024 maxlen 7556.50275 


 50%|█████     | 4/8 [13:13<13:13, 198.38s/it]

idx 4 epoch 3 val set f1 : 0.6886 auc : 0.9692 margin : 0.0554
Global step: 8500/16328 Total loss: 0.0856  Current GPU memory usage: 2024 maxlen 7554.821764705883 
Global step: 9000/16328 Total loss: 0.0869  Current GPU memory usage: 2024 maxlen 7554.077777777778 
Global step: 9500/16328 Total loss: 0.0865  Current GPU memory usage: 2024 maxlen 7554.29852631579 
Global step: 10000/16328 Total loss: 0.0862  Current GPU memory usage: 2024 maxlen 7553.5382 


 62%|██████▎   | 5/8 [16:31<09:55, 198.36s/it]

idx 4 epoch 4 val set f1 : 0.6929 auc : 0.9698 margin : 0.0517
Global step: 10500/16328 Total loss: 0.0848  Current GPU memory usage: 2024 maxlen 7556.075047619048 
Global step: 11000/16328 Total loss: 0.0819  Current GPU memory usage: 2024 maxlen 7555.729727272727 
Global step: 11500/16328 Total loss: 0.0830  Current GPU memory usage: 2024 maxlen 7554.329565217391 
Global step: 12000/16328 Total loss: 0.0842  Current GPU memory usage: 2024 maxlen 7556.31375 


 75%|███████▌  | 6/8 [19:50<06:36, 198.36s/it]

idx 4 epoch 5 val set f1 : 0.6912 auc : 0.9698 margin : 0.0548
Global step: 500/4082 Total loss: 0.0822  Current GPU memory usage: 2024 maxlen 7555.285422877766 
idx 4 epoch 6 val set mean f1 : 0.6880 auc : 0.9686 margin : 0.0500
idx 4 epoch 6 val set (fge) f1 : 0.6880 auc : 0.9686 margin : 0.0500
Global step: 1000/4082 Total loss: 0.0837  Current GPU memory usage: 2024 maxlen 7555.093311188283 
idx 4 epoch 6 val set mean f1 : 0.6903 auc : 0.9688 margin : 0.0504
idx 4 epoch 6 val set (fge) f1 : 0.6940 auc : 0.9702 margin : 0.0502
Global step: 1500/4082 Total loss: 0.0858  Current GPU memory usage: 2024 maxlen 7555.098937872835 
idx 4 epoch 6 val set mean f1 : 0.6848 auc : 0.9694 margin : 0.0534
idx 4 epoch 6 val set (fge) f1 : 0.6968 auc : 0.9710 margin : 0.0513
Global step: 2000/4082 Total loss: 0.0862  Current GPU memory usage: 2024 maxlen 7555.298399550751 
idx 4 epoch 6 val set mean f1 : 0.6880 auc : 0.9697 margin : 0.0525
idx 4 epoch 6 val set (fge) f1 : 0.6973 auc : 0.9716 margin

 88%|████████▊ | 7/8 [26:47<04:30, 270.04s/it]

Global step: 2500/4082 Total loss: 0.0691  Current GPU memory usage: 2024 maxlen 7555.40600840906 
idx 4 epoch 7 val set mean f1 : 0.6837 auc : 0.9679 margin : 0.0488
idx 4 epoch 7 val set (fge) f1 : 0.6988 auc : 0.9717 margin : 0.0510
Global step: 3000/4082 Total loss: 0.0705  Current GPU memory usage: 2024 maxlen 7555.167519349337 
idx 4 epoch 7 val set mean f1 : 0.6820 auc : 0.9675 margin : 0.0482
idx 4 epoch 7 val set (fge) f1 : 0.6979 auc : 0.9718 margin : 0.0505
Global step: 3500/4082 Total loss: 0.0736  Current GPU memory usage: 2024 maxlen 7555.5393750793855 
idx 4 epoch 7 val set mean f1 : 0.6838 auc : 0.9675 margin : 0.0504
idx 4 epoch 7 val set (fge) f1 : 0.6988 auc : 0.9719 margin : 0.0505
Global step: 4000/4082 Total loss: 0.0733  Current GPU memory usage: 2024 maxlen 7555.439492798228 
idx 4 epoch 7 val set mean f1 : 0.6820 auc : 0.9677 margin : 0.0507
idx 4 epoch 7 val set (fge) f1 : 0.6994 auc : 0.9719 margin : 0.0505


100%|██████████| 8/8 [33:46<00:00, 253.28s/it]


best threshold on validation set: 0.36 score 0.7005
avg of best threshold [0.29, 0.31, 0.34, 0.32, 0.36] macro-f1 best threshold 0.31 best score 0.6982523831139354
coeff between predictions 0.9619804722195002


In [None]:
# Save the trained model
model_path = 'trained_model.pth'
model_to_save = InsincereModel(
    device=None,
    hidden_dim=256,
    hidden_dim_fc=16,
    dropout=dropout,
    embedding_matrixs=embedding_matrix,
    vocab_size=len(vocab['token2id']),
    embedding_dim=embed_size,
    max_seq_len=max_seq_len
)
torch.save(model_to_save.state_dict(), model_path)
print("Trained model saved to:", model_path)


In [35]:
# Save the trained model
def save_model(model, model_path):
    torch.save(model.state_dict(), model_path)

# Load the model
def load_model(model_architecture, model_path, device, hidden_dim, hidden_dim_fc, embedding_matrixs,
               vocab_size, embedding_dim, max_seq_len):
    model = model_architecture(device, hidden_dim=hidden_dim, hidden_dim_fc=hidden_dim_fc, dropout=dropout,
                               embedding_matrixs=embedding_matrixs,
                               vocab_size=vocab_size, embedding_dim=embedding_dim, max_seq_len=max_seq_len)
    model.load_state_dict(torch.load(model_path))
    return model

In [29]:
# Save the trained model
model_path = 'trained_model.pth'
save_model(sub, model_path)
print("Trained model saved to:", model_path)

AttributeError: 'DataFrame' object has no attribute 'state_dict'

In [23]:
# Load the trained model
loaded_model = load_model(InsincereModel, model_path)

TypeError: InsincereModel.__init__() missing 4 required positional arguments: 'device', 'hidden_dim', 'hidden_dim_fc', and 'embedding_matrixs'

In [20]:
# Make predictions on the test set using the loaded model
predictions_te = eval_model(loaded_model, test_iter, device)

In [39]:
# Define the InsincereModel class here
# ...

# Save the trained model
def save_model(model, model_path):
    torch.save(model.state_dict(), model_path)

# Load the model
def load_model(model_architecture, model_path, device, hidden_dim, hidden_dim_fc, embedding_matrixs, vocab_size, embedding_dim, max_seq_len):
    model = model_architecture(
        device=device,
        hidden_dim=hidden_dim,
        hidden_dim_fc=hidden_dim_fc,
        dropout=dropout,
        embedding_matrixs=embedding_matrixs,
        vocab_size=vocab_size,
        embedding_dim=embedding_dim,
        max_seq_len=max_seq_len
    )
    model.load_state_dict(torch.load(model_path))
    return model

# Load the trained model
model_path = 'trained_model.pth'
loaded_model = load_model(InsincereModel, model_path, None, hidden_dim, hidden_dim_fc, embedding_matrixs, vocab_size, embed_size, max_seq_len)

# Save the loaded model (Optional)
# model_path = 'loaded_model.pth'
# save_model(loaded_model, model_path)
# print("Loaded model saved to:", model_path)

# Create the test dataset and DataLoader
test_dataset = TextDataset(test_df, vocab=shared_resources['vocab'], max_seq_len=max_seq_len)
tb = BucketSampler(test_dataset, test_dataset.get_keys(), batch_size=batch_size, shuffle_data=False)
test_iter = DataLoader(dataset=test_dataset, batch_size=batch_size, sampler=tb, num_workers=0, collate_fn=collate_fn)

# Make predictions on the test set using the loaded model
predictions_te = eval_model(loaded_model, test_iter, device)

# Calculate accuracy of the model
def calculate_accuracy(targets, predictions):
    return (targets == (predictions > threshold)).mean()

# Assuming you have the ground truth labels for the test set in a variable named 'test_labels'
test_labels = test_df['target'].values

accuracy = calculate_accuracy(test_labels, predictions_te)
print(f"Accuracy: {accuracy:.4f}")


NameError: name 'hidden_dim' is not defined