In [None]:
#libs import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import re
warnings.filterwarnings('ignore')

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Load the training split of the Jigsaw toxic-comment challenge from its zipped CSV.
data = pd.read_csv(r'../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')
#test_labels = pd.read_csv(r'/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip')

In [None]:
#data = data.sample(frac=1).reset_index(drop=True)

In [None]:
# data_targets = data.drop('id', axis = 1).drop('comment_text', axis = 1)
# for target in data_targets:
#     print(data[target].value_counts())

The dataset is highly unbalanced.
Now we want to see if one comment can belong to more than one class.

In [None]:
# data.corr().style.background_gradient(cmap='coolwarm', low=0.15)

We see that all the classes overlap to some degree. The biggest overlaps are between "insult" and "obscene", "toxic" and "obscene", and "toxic" and "insult". "Threat" has the lowest correlation with the other labels. Probably people tend to threaten each other in a very polite way. So this is a multi-label classification task: the classes overlap, and a single comment can carry several labels at once.

In [None]:
# import seaborn as sns

# multitagging = data_targets.sum(axis = 1).value_counts()
# sns.barplot(multitagging.index, multitagging.values, alpha=0.8,color='green')

Most comments are clean, and the maximum number of tags for one post is 6, so it is possible for one post to be in all of the groups at the same time. But those are a minority; in addition, we must take into consideration the fact that the gathered data is flawed because it was based on human reports.

Now to get some features we'll look at the average character count, word count and punctuation for every type of comment, take a look at the most frequently used words by comment type, and engineer some n-grams.

In [None]:
# toxic_comments = data[data['toxic'] == 1]
# severe_toxic_comments = data[data['severe_toxic'] == 1]
# obscene_comments = data[data['obscene'] == 1]
# threat_comments = data[data['threat'] == 1]
# insult_comments = data[data['insult'] == 1]
# identity_hate_comments = data[data['identity_hate'] == 1]

# clean_comments = data[(data['toxic'] == 0) &
#                                 (data['severe_toxic'] == 0) &
#                                 (data['obscene'] == 0) &
#                                 (data['threat'] == 0) &
#                                 (data['insult'] == 0) &
#                                 (data['identity_hate'] == 0)]

# comment_types = [toxic_comments, severe_toxic_comments, obscene_comments, threat_comments, insult_comments, identity_hate_comments, clean_comments]
# comment_types_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate', 'clean']
# print()

In [None]:
# lengthByCommentType = []

# length_index = 0
# for comment_type in comment_types:
#     print
#     lengthByCommentType.append(comment_type['comment_text'].str.len().mean())        

# print(lengthByCommentType)
# plt.plot(lengthByCommentType)
# #commentLengths.sort()
# #plt.plot(commentLengths)

As we can see, the total length of a comment can probably be a useful feature, as the mean character counts differ a lot. Toxic comments on average have a much lower character count than clean ones, but especially passionate shitstorms exceed even clean comments in their character counts. However, comment lengths themselves are not balanced at all, with some extremely short and extremely long ones in every group.

In [None]:
# import collections
# from heapq import nlargest

# i = 0
# for comment_type in comment_types:
#     amountByCommentType = {}
#     print("Dictionary for " + comment_types_names[i] + ":")
#     i += 1
    
#     for comment in comment_type:
#         commentString = comment_type['comment_text']
#         commentString_text = commentString.str.split()        
#         for stringArray in commentString_text:
#             #commentLengths.append(len(stringArray))
#             for word in stringArray:
#                 if word in amountByCommentType:
#                     amountByCommentType[word] += 1
#                 else:
#                     amountByCommentType[word] = 1
#     TwentyHighest = nlargest(20, amountByCommentType, key = amountByCommentType.get) 
#     for val in TwentyHighest: 
#         print(val, ":", amountByCommentType.get(val)) 
                
#     #amountByCommentType.append(i/len(comment_type))
    
# #print(amountByCommentType)
# #plt.plot(amountByCommentType)

Now we preprocess the data: stripping HTML tags and punctuation from all the comments, and then stemming the words.

In [None]:
def count_regex(regexp = "", comment = None):
    """Return the number of non-overlapping matches of ``regexp`` in ``comment``.

    Args:
        regexp: Regular-expression pattern passed to ``re.findall``.
        comment: Text to search. ``None`` (the default) is treated as "no
            text" and yields 0 instead of raising inside ``re.findall``.

    Returns:
        int: Number of matches found.
    """
    if comment is None:
        return 0
    return len(re.findall(regexp, comment))

In [None]:
#creating new columns of features for the dataset
def PrepareText(data):
    """Add hand-crafted numeric feature columns to ``data`` in place.

    Every feature is computed from ``data['comment_text']`` as it is at call
    time, so call this before lower-casing / cleaning the text (several
    features depend on upper-case letters and punctuation).

    Calling the function again recomputes every column from scratch; counts
    never accumulate across calls.

    Args:
        data: pandas DataFrame with a string column ``comment_text``.

    Returns:
        None. The new columns are written into ``data``.
    """
    text = data['comment_text']
    lowered = text.str.lower()

    def _char_total(chars):
        # Per comment: total occurrences of any single character in `chars`.
        return text.apply(lambda comment: sum(comment.count(ch) for ch in chars))

    def _pattern_total(patterns):
        # Per comment: summed regex match counts of all `patterns`, matched
        # against the lower-cased text.
        return sum(lowered.str.count(pattern) for pattern in patterns)

    #Comment's total length
    data['total_length_count'] = text.apply(len)
    #Amount of symbols
    data['symbols_count'] = _char_total('*&$%')
    #Exclamation mark count
    data['exclamation_count'] = text.apply(lambda comment: comment.count('!'))
    #Question mark count
    data['question_count'] = text.apply(lambda comment: comment.count('?'))
    #Punctuation count
    data['punctuation_count'] = _char_total('.,;:')
    #Amount of upper case letters
    data['uppercase_amount'] = text.apply(lambda comment: sum(1 for c in comment if c.isupper()))
    #Amount of upper case letters compared to the text's length
    #(an empty comment gives 0/0, i.e. NaN)
    data['FULLCAPS_COUNT'] = data['uppercase_amount']/data['total_length_count']
    #Amount of unique whitespace-separated words
    data['total_unique_words_count'] = text.apply(lambda comment: len(set(comment.split())))
    #Amount of ". [upper_case_letter]" constructions. This has to be a regex
    #match; counting the individual characters of the pattern string is wrong.
    data['polite_sentence_count'] = text.str.count(r'\.\s[A-Z]')
    #Amount of f??k
    data['fuck_count'] = text.str.count(r'[Ff]\S{2}[Kk]')
    #Amount of 'you'
    data['you_count'] = _pattern_total([r'you', r' u '])
    #Amount of 'we' and 'I'
    data['wei_count'] = lowered.apply(lambda comment: comment.count(' we ') + comment.count(' i '))

    #Amount of racial slurs and the like
    racial_slurs = ['gyp', 'slav', 'jew', 'yid', 'kike',  'goy', 'gentile',
        'skinhead', 'anti', 'na+zi', 'kurd', 'turk', 'nationalis',
        r'fa+t', r'whi+te', 'cracker', 'racis', r'spi+c', r'beane+r',
        r'coo+n', 'fasc', 'homo', 'negr', 'akba+r', r'alla+h', r'chi+nk',
        r'goo+k', 'nigg', r'kaf+', 'kebab', r'ni+p', 'islam', 'muslim',
        'raghead', 'towelhead']
    data['racial_slur_count'] = _pattern_total(racial_slurs)

    #Amount of slurs by gender or sexual orientation
    LGBT_slurs = [r'ga+y', r'f[a-z]gg[a-z]t', r'fa+g', 'trans', 'lgbt', 'bugg',
                            'fudgep', 'siss', 'marimach', 'nancy', 'batty',
                            'twink', 'dyke', 'lesb', 'trann', 'shemal', 'quean', 'breeder']
    data['LGBT_slur_count'] = _pattern_total(LGBT_slurs)

    #Amount of possibly obscene words
    obscene_words = ['sex', 'whore', 'shit', 'piss', 'bastard', 'rape', 'sodom', 'cock', 'dick']
    data['obscene_word_count'] = _pattern_total(obscene_words)

    #Amount of threatening words
    threat_words = ['kill', 'murder', 'hate', 'die']
    data['threat_words_count'] = _pattern_total(threat_words)

In [None]:
# Feature extraction has to see the raw text: the cleaning cell further down
# lower-cases `comment_text`, after which the upper-case based features would be zero.
PrepareText(data)

In [None]:
# Hand-crafted numeric columns produced by PrepareText.
numerical_features = [
    'total_length_count',
    'symbols_count',
    'exclamation_count',
    'question_count',
    'punctuation_count',
    'uppercase_amount',
    'FULLCAPS_COUNT',
    'total_unique_words_count',
    'polite_sentence_count',
    'fuck_count',
    'you_count',
    'wei_count',
    'racial_slur_count',
    'LGBT_slur_count',
    'obscene_word_count',
    'threat_words_count',
]

# The raw text column followed by every numeric feature.
feature_columns = ['comment_text'] + numerical_features

# Target label columns of the dataset.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [None]:
def clean_text(comment):
    """Expand common contractions and reduce ``comment`` to space-separated word characters.

    The patterns are written in lower case, so the caller is expected to
    lower-case the text first.

    Args:
        comment: Text to normalise.

    Returns:
        str: The text with contractions expanded, every non-word character
        replaced by a space, whitespace runs collapsed, and surrounding
        spaces stripped.
    """
    comment = re.sub(r" yourselfgo ", " yourself go ", comment)
    comment = re.sub(r" fucksex ", " fuck sex ", comment)
    comment = re.sub(r" u ", " you ", comment)
    comment = re.sub(r"what's", "what is ", comment)
    # "'scuse" has to be handled before the generic "'s" rule; otherwise the
    # "'s" rule rewrites it to " cuse" and this substitution never fires.
    comment = re.sub(r"\'scuse", " excuse ", comment)
    comment = re.sub(r"\'s", " ", comment)
    comment = re.sub(r"\'ve", " have ", comment)
    comment = re.sub(r"can't", "can not ", comment)
    comment = re.sub(r"n't", " not ", comment)
    comment = re.sub(r"i'm", "i am ", comment)
    comment = re.sub(r"\'re", " are ", comment)
    comment = re.sub(r"\'d", " would ", comment)
    comment = re.sub(r"\'ll", " will ", comment)
    # Raw strings avoid the invalid-escape warning for \W and \s.
    comment = re.sub(r'\W', ' ', comment)
    comment = re.sub(r'\s+', ' ', comment)
    comment_clean = comment.strip(' ')
    return comment_clean

def cleanHtml(comment):
    """Replace each ``<...>`` tag in ``comment`` with a single space.

    ``comment`` is converted with ``str()`` first, so non-string values are accepted.
    """
    return re.sub(r'<.*?>', ' ', str(comment))

def cleanPunc(comment):
    """Strip punctuation from ``comment``.

    ``? ! ' " #`` are deleted; ``. , ) ( | /`` are replaced by a space. The
    result is stripped of surrounding whitespace and remaining newlines are
    turned into spaces.

    Args:
        comment: Text to clean.

    Returns:
        str: The text without the punctuation listed above.
    """
    # Inside a character class '|' is a literal pipe, not an alternation, so
    # the classes simply list the characters to match.
    no_punctuation = re.sub(r'[?!\'"#]', r'', comment)
    # Continue from the previous result. Substituting on `comment` again would
    # throw away the deletions made by the first substitution.
    no_punctuation = re.sub(r'[.,)(|/]', r' ', no_punctuation)
    no_punctuation = no_punctuation.strip()
    no_punctuation = no_punctuation.replace("\n", " ")
    return no_punctuation

def keepAlpha(comment):
    """Replace every run of non-letter characters inside each word with one space.

    The text is split on whitespace, each word is processed on its own, and
    the words are joined back with single spaces. Surrounding whitespace is
    stripped from the final string.
    """
    letters_only = [re.sub('[^a-z A-Z]+', ' ', token) for token in comment.split()]
    return ' '.join(letters_only).strip()

# Run the cleaning pipeline over the training comments, overwriting `comment_text`.
# NOTE(review): clean_text already replaces every non-word character
# (including '<' and '>') with a space, so cleanHtml and cleanPunc have nothing
# left to match when they run after it. Confirm whether cleanHtml was meant
# to run first.
data['comment_text'] = data['comment_text'].str.lower()
data['comment_text'] = data['comment_text'].apply(clean_text)
data['comment_text'] = data['comment_text'].apply(cleanHtml)
data['comment_text'] = data['comment_text'].apply(cleanPunc)
data['comment_text'] = data['comment_text'].apply(keepAlpha)

In [None]:
!pip install transformers

In [None]:
import torch
import transformers as ppb
# Bundle the DistilBERT config / model / tokenizer classes with the name of the pretrained checkpoint to load.
config_class, model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertConfig, ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

In [None]:
# Pick the compute device once: the GPU when CUDA is usable, otherwise the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint named in `pretrained_weights`.
print('Initializing tokenizer')
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)

In [None]:
# Comment strings as currently stored in `data`, as a NumPy array.
comments = data.comment_text.values
# Float matrix of the six target label columns, one row per comment.
labels_l = data[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values.astype(float)

In [None]:
def encode(comments):
    """Tokenize each comment with the module-level ``tokenizer``.

    Every comment is encoded on its own with special tokens added, a maximum
    length of 512 and padding to that length requested.

    Args:
        comments: Iterable of comment strings.

    Returns:
        tuple: ``(input_ids_l, attention_masks_l)``, two lists holding one
        ``input_ids`` tensor and one ``attention_mask`` tensor per comment.
    """
    print('Encoding comments')
    ids_per_comment, masks_per_comment = [], []
    for text in comments:
        encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=512,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        ids_per_comment.append(encoding['input_ids'])
        masks_per_comment.append(encoding['attention_mask'])
    return ids_per_comment, masks_per_comment

def data_preparation(input_ids_l, attention_masks_l, device=None):
    """Stack per-comment tensors and move them to the compute device.

    Args:
        input_ids_l: List of tensors to concatenate along dim 0.
        attention_masks_l: List of tensors to concatenate along dim 0.
        device: Target device (``torch.device`` or string). When omitted,
            CUDA is used if available and the CPU otherwise, so the function
            no longer fails on machines without a GPU.

    Returns:
        tuple: ``(input_ids, attention_masks)`` as single tensors on ``device``.
    """
    print('Data preparation')
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    input_ids = torch.cat(input_ids_l, dim=0).to(device)
    attention_masks = torch.cat(attention_masks_l, dim=0).to(device)
    return input_ids, attention_masks

def get_lhs(input_ids, attention_masks, model, batch_size=50):
    """Run ``model`` over the inputs in batches and collect its outputs as NumPy arrays.

    Every row is processed exactly once, whatever the input length: inputs
    shorter than one batch are no longer skipped, and the tail is never
    processed twice.

    Args:
        input_ids: 2-D tensor, one row per sample.
        attention_masks: Tensor with the same leading dimension as ``input_ids``.
        model: Callable invoked as ``model(ids, attention_mask=mask)``.
        batch_size: Number of rows per forward pass (default 50).

    Returns:
        list: One NumPy array per batch, in input order.
    """
    print('Getting last hidden states')
    last_hidden_states_l = []
    total = len(input_ids)
    for start in range(0, total, batch_size):
        stop = min(start + batch_size, total)
        print('Calculating samples: ' + str(start) + ' - ' + str(stop - 1))
        with torch.no_grad():
            output = model(input_ids[start:stop, :], attention_mask=attention_masks[start:stop, :])
        if not isinstance(output, torch.Tensor):
            # A non-tensor output is assumed to come from a bare encoder whose
            # first element is (batch, seq, hidden); keep the first token's
            # vector. TODO confirm for any other model type passed in.
            output = output[0][:, 0, :]
        last_hidden_states_l.append(output.cpu().numpy())
    return last_hidden_states_l

In [None]:
# Tokenize every training comment, then stack the per-comment tensors into two tensors.
input_ids_l, attention_masks_l = encode(comments)
input_ids, attention_masks = data_preparation(input_ids_l, attention_masks_l)
# Tensor built from the label matrix (torch.from_numpy shares memory with labels_l).
labels = torch.from_numpy(labels_l)

In [None]:
# from torch.utils.data import TensorDataset, random_split

# dataset = TensorDataset(input_ids, attention_masks, labels)

# train_size = int(0.9 * len(dataset))
# valid_size = len(dataset) - train_size

# train_dataset, valid_dataset = random_split(dataset, [train_size, valid_size])

# print('{:>5,} training samples'.format(train_size))
# print('{:>5,} validation samples'.format(valid_size))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the samples for validation; the fixed random_state makes the split repeatable.
# NOTE(review): only input_ids are split here. attention_masks are not carried
# along, and the fine-tuning loop further down calls the model without an
# attention mask. Confirm this is intended.
X_train, X_valid, y_train, y_valid = train_test_split(input_ids, labels_l, test_size=0.1, random_state=1488)

In [None]:
from torch.utils.data import Dataset, DataLoader
class text_dataset(Dataset):
    """Dataset pairing each row of ``x`` with the matching row of ``y``.

    ``y`` rows are NumPy arrays and are converted to tensors on access.
    ``transform`` is stored but not applied anywhere in this class.
    """

    def __init__(self,x,y, transform=None):
        self.x, self.y, self.transform = x, y, transform

    def __len__(self):
        return len(self.x)

    def __getitem__(self,index):
        # Returns (input row, label row as a tensor).
        return self.x[index], torch.from_numpy(self.y[index])

In [None]:
training_dataset = text_dataset(X_train,y_train)
valid_dataset = text_dataset(X_valid,y_valid)

batch_size = 16

# Training batches are reshuffled every epoch so the optimizer does not see the
# samples in the same order each time. Validation order does not affect the
# reported metrics, so it stays fixed.
dataloaders_dict = {'train': DataLoader(training_dataset, batch_size=batch_size, shuffle=True),
                   'val': DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
                   }
# Sample counts per phase, used to average the running loss and accuracy.
dataset_sizes = {'train':len(X_train),
                'val':len(X_valid)}

In [None]:
import torch.nn as nn
class DistilBertClassifier(torch.nn.Module):
    """DistilBERT encoder followed by a two-layer classification head.

    The encoder is loaded from the module-level ``model_class`` and
    ``pretrained_weights``; ``config`` supplies the head sizes and the
    dropout rate.
    """

    def __init__(self, config):
        super().__init__()
        self.distilbert = model_class.from_pretrained(pretrained_weights)
        self.num_labels = config.num_labels
        self.pre_classifier = nn.Linear(config.hidden_size, config.hidden_size)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.dropout = nn.Dropout(config.seq_classif_dropout)
        self.output_hidden_states = config.output_hidden_states

        nn.init.xavier_normal_(self.classifier.weight)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Return the classifier logits; ``labels`` is accepted but not used."""
        encoder_output = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the encoder output, position 0 of every sequence.
        first_token = encoder_output[0][:, 0]
        projected = torch.relu(self.pre_classifier(first_token))
        return self.classifier(self.dropout(projected))

print('Initializing model')
config = config_class(vocab_size_or_config_json_file=32000, dropout=0.1, num_labels=6, intermediate_size=3072)
model = DistilBertClassifier(config)
# Move the model to the device chosen earlier instead of calling .cuda()
# unconditionally, so the CPU fallback keeps working.
model.to(device)

In [None]:
# from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
# batch_size = 16

# train_dataloader = DataLoader(
#             train_dataset,
#             sampler = RandomSampler(train_dataset),
#             batch_size = batch_size
#         )

# valid_dataloader = DataLoader(
#             valid_dataset,
#             sampler = SequentialSampler(valid_dataset),
#             batch_size = batch_size
#         )


In [None]:
from transformers import AdamW
from torch.optim import lr_scheduler

epochs = 3  # NOTE(review): not referenced in the visible cells; train_model is called with num_epochs=2
lrlast = .001  # NOTE(review): not referenced in the visible cells
lrmain = 3e-5  # NOTE(review): not referenced in the visible cells

# One optimizer over all model parameters (encoder and classification head) with a single learning rate.
optimizer_ft = AdamW(model.parameters(),
                  lr = 5e-5,
                  eps = 1e-8
                  )

# Loss applied to raw logits, with one independent binary target per label.
criterion = nn.BCEWithLogitsLoss()

# Multiplies the learning rate by 0.1 every 3 scheduler steps; train_model steps the scheduler once per epoch.
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=3, gamma=0.1)

In [None]:
def accuracy_thresh(y_pred, y_true, thresh:float=0.4, sigmoid:bool=True):
    """Sum over samples of the per-sample fraction of correctly thresholded labels.

    Args:
        y_pred: 2-D tensor of predictions (logits when ``sigmoid`` is True).
        y_true: 2-D tensor of 0/1 targets with the same shape.
        thresh: Predictions strictly above this value count as positive.
        sigmoid: Apply a sigmoid to ``y_pred`` before thresholding.

    Returns:
        The per-row mean label accuracy, summed over rows (a NumPy scalar).
        Divide by the number of samples to get the mean accuracy.
    """
    scores = y_pred.sigmoid() if sigmoid else y_pred
    predicted = (scores > thresh).float()
    matches = (predicted == y_true.float()).float()
    per_sample = matches.cpu().numpy().mean(axis=1)
    return per_sample.sum()

In [None]:
import time
import datetime

In [None]:
import gc
gc.collect()

In [None]:
import copy
def train_model(model, criterion, optimizer, scheduler, num_epochs=2):
    """Fine-tune ``model`` and return it loaded with its best-validation-loss weights.

    Each epoch runs a 'train' pass and then a 'val' pass over the
    module-level ``dataloaders_dict``. Losses and accuracies are averaged
    with the module-level ``dataset_sizes``. Whenever the validation loss
    improves, the weights are kept in memory and also written to
    ``distilbert_model_weights.pth``.

    Args:
        model: Module called as ``model(inputs)`` that returns logits.
        criterion: Loss called as ``criterion(outputs, targets.float())``.
        optimizer: Optimizer over the model's parameters.
        scheduler: Learning-rate scheduler, stepped once per epoch after the
            training pass (i.e. after the optimizer has stepped).
        num_epochs: Number of train+val epochs to run.

    Returns:
        The same ``model`` object, loaded with the best weights seen.
    """
    # NOTE(review): the model is called without an attention mask, so padded
    # positions are not masked during fine-tuning. Confirm this is intended.
    since = time.time()
    print('starting')
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = float('inf')

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('-' * 10)

        for phase in ['train', 'val']:
            is_training = phase == 'train'
            # train(True) enables dropout; train(False) is equivalent to eval().
            model.train(is_training)

            running_loss = 0.0
            running_accuracy = 0.0

            for inputs, hcc in dataloaders_dict[phase]:
                inputs = inputs.to(device)
                hcc = hcc.to(device)

                if is_training:
                    optimizer.zero_grad()

                # Gradients are only tracked during the training pass.
                with torch.set_grad_enabled(is_training):
                    outputs = model(inputs)
                    loss = criterion(outputs, hcc.float())

                    if is_training:
                        loss.backward()
                        optimizer.step()

                # criterion returns the batch mean; weight it by batch size so
                # the epoch average is per sample.
                running_loss += loss.item() * inputs.size(0)

                n_labels = outputs.size(-1)
                running_accuracy += accuracy_thresh(outputs.view(-1, n_labels), hcc.view(-1, n_labels))

            if is_training:
                # Step the schedule only after the optimizer has stepped, as
                # PyTorch expects; stepping first skips the initial LR value.
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_accuracy = running_accuracy / dataset_sizes[phase]

            print('{} total loss: {:.4f} '.format(phase, epoch_loss))
            # This is thresholded label accuracy (see accuracy_thresh), not ROC AUC.
            print('{} thresholded accuracy: {:.4f}'.format(phase, epoch_accuracy))

            if phase == 'val' and epoch_loss < best_loss:
                print('saving with loss of {}'.format(epoch_loss),
                      'improved over previous {}'.format(best_loss))
                best_loss = epoch_loss
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(model.state_dict(), 'distilbert_model_weights.pth')

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    # The tracked quantity is the validation loss, so label it as such.
    print('Best val loss: {:4f}'.format(float(best_loss)))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model
 
print('done')

In [None]:
model_ft1 = train_model(model, criterion, optimizer_ft, exp_lr_scheduler,num_epochs=2)

In [None]:
# Extract training features with the fine-tuned model returned by train_model.
# The name `model_ft1111` used here before is not defined anywhere in the notebook.
last_hidden_states_l = get_lhs(input_ids, attention_masks, model_ft1)

In [None]:
# Concatenate the per-batch arrays along the first axis.
features = np.concatenate(last_hidden_states_l)
# Targets for the gradient-boosting stage: one column per class.
labels_lgbm = data[class_names]
# Append the hand-crafted numeric features as extra columns (axis=1 needs equal row counts).
features_r = np.append(features, data[numerical_features].to_numpy(), axis=1)

In [None]:
import lightgbm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, train_test_split

def Classification(xtrainlgbmc, ytrainlgbmc, params):
    """Train one LightGBM booster per entry of the module-level ``class_names``.

    Args:
        xtrainlgbmc: Feature matrix, one row per sample.
        ytrainlgbmc: Label table indexable by class name (``ytrainlgbmc[target]``).
        params: LightGBM parameter dict passed to ``lightgbm.train``.

    Returns:
        dict: Maps each class name to its trained booster.
    """
    # The split uses a fixed random_state, so it is identical for every
    # target; compute it once instead of once per class.
    train_matrix, valid_matrix, y_train, y_valid = train_test_split(
        xtrainlgbmc, ytrainlgbmc, test_size = 0.1, random_state = 42)

    model_dict = {}

    for target in class_names:
        print('Predicting {0} comments:'.format(target))

        d_train = lightgbm.Dataset(train_matrix, label = y_train[target])
        d_valid = lightgbm.Dataset(valid_matrix, label = y_valid[target])
        valid = [d_train, d_valid]

        lgbmc = lightgbm.train(params = params, train_set = d_train, valid_sets = valid, verbose_eval = 10, num_boost_round = 10000, early_stopping_rounds = 150)

        # This score is computed on the full input, which includes the 10%
        # hold-out, so it is not a pure training score nor a validation score.
        y_predlgbmc_train = lgbmc.predict(xtrainlgbmc)
        print('Roc_auc for training set is: %.4f' % roc_auc_score(ytrainlgbmc[target], y_predlgbmc_train))
        model_dict[target] = lgbmc

    return model_dict

In [None]:
# Parameters shared by the per-class LightGBM boosters.
LGBMparameters = {
                  'learning_rate': 0.05,
                  'application': 'binary',
                  'max_depth' : 6,
                  'num_leaves' : 15,
                  'verbosity': -1,
                  # 'n_thread' is not a recognised LightGBM alias and was
                  # ignored; 'num_threads' is the documented parameter name.
                  'num_threads' : 2,
                  'metric': 'auc',
                  'lambda_l1': 5,
                  'lambda_l2': 5
                 }

modellos = Classification(features_r, labels_lgbm, LGBMparameters)

In [None]:
print(modellos)

In [None]:
# Save every booster to 'model<class>.txt'. The loop variable is deliberately
# not called `model`, so the global DistilBERT `model` is not overwritten.
for name, booster in modellos.items():
  booster.save_model('model{0}.txt'.format(name))

In [None]:
# Load the test split from its zipped CSV.
test_data = pd.read_csv(r'../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip')
# NOTE(review): this array is taken before the cleaning cell below reassigns
# test_data['comment_text'], so it keeps the raw, uncleaned text.
test_comments = test_data.comment_text.values

In [None]:
PrepareText(test_data)

In [None]:
# Same cleaning steps, in the same order, as applied to the training comments.
test_data['comment_text'] = test_data['comment_text'].str.lower()
test_data['comment_text'] = test_data['comment_text'].apply(clean_text)
test_data['comment_text'] = test_data['comment_text'].apply(cleanHtml)
test_data['comment_text'] = test_data['comment_text'].apply(cleanPunc)
test_data['comment_text'] = test_data['comment_text'].apply(keepAlpha)

In [None]:
# Encode the cleaned text currently stored in the DataFrame. `test_comments`
# was captured before cleaning and still holds the raw text, whereas the
# training comments were encoded after cleaning.
test_input_ids_l, test_attention_masks_l = encode(test_data.comment_text.values)
test_input_ids, test_attention_masks = data_preparation(test_input_ids_l, test_attention_masks_l)
#test_last_hidden_states_l = get_lhs(test_input_ids, test_attention_masks, model_ft1)

In [None]:
def get_lhs_dbert(input_ids, attention_masks, model, batch_size=5):
    """Run a bare encoder over the inputs in batches and keep the first-token vectors.

    Every row is processed exactly once, whatever the input length: inputs
    shorter than one batch are no longer skipped, and the tail is never
    processed twice.

    Args:
        input_ids: 2-D tensor, one row per sample.
        attention_masks: Tensor with the same leading dimension as ``input_ids``.
        model: Callable invoked as ``model(ids, attention_mask=mask)`` whose
            result is indexable, with element 0 a 3-D tensor.
        batch_size: Number of rows per forward pass (default 5).

    Returns:
        list: One NumPy array per batch (``output[0][:, 0, :]``), in input order.
    """
    print('Getting last hidden states')
    last_hidden_states_l = []
    total = len(input_ids)
    for start in range(0, total, batch_size):
        stop = min(start + batch_size, total)
        print('Calculating samples: ' + str(start) + ' - ' + str(stop - 1))
        with torch.no_grad():
            last_hidden_states = model(input_ids[start:stop, :], attention_mask=attention_masks[start:stop, :])

        # Element 0 of the output, position 0 of every sequence.
        last_hidden_states_l.append(last_hidden_states[0][:,0,:].cpu().numpy())

    return last_hidden_states_l

In [None]:
# Drop the `model` name and run a garbage-collection pass. `gc.collect` has to
# be called; the bare attribute reference did nothing.
del model
gc.collect()

In [None]:
# Extract the test features with `get_lhs` and the fine-tuned classifier, and
# store them under the name the concatenation cell below reads. The gradient-
# boosting models were fitted on `get_lhs` output, so the test features have to
# come from the same extractor to have the same number of columns.
test_last_hidden_states_l = get_lhs(test_input_ids, test_attention_masks, model_ft1)

In [None]:
torch.cuda.empty_cache()

In [None]:
import gc
# Call the collector; the bare `gc.collect` reference did nothing.
gc.collect()

In [None]:
# Concatenate the per-batch arrays along the first axis.
test_features = np.concatenate(test_last_hidden_states_l)
# Append the hand-crafted numeric features as extra columns (axis=1 needs equal row counts).
test_features_r = np.append(test_features, test_data[numerical_features].to_numpy(), axis=1)

In [None]:
submission = pd.DataFrame.from_dict({'id': test_data['id']})

In [None]:
# Fill one submission column per class with the prediction of that class's booster.
for target in class_names:
    print('Predicting {0} comments:'.format(target))
    modello = modellos[target]
    y_predlgbmc = modello.predict(test_features_r)
    submission[target] = y_predlgbmc

In [None]:
print(submission)

In [None]:
submission.to_csv('lgb_bert_submission1.csv', index=False)

Источники:
* https://mccormickml.com/2019/07/22/BERT-fine-tuning/#33-tokenize-dataset
* https://arxiv.org/abs/1706.03762
* https://medium.com/@armandj.olivares/using-bert-for-classifying-documents-with-long-texts-5c3e7b04573d
* http://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/
* ноутбуки по той же задаче

(ГПУ отлетело в момент предыдущего ковыряния бука, потому возможны косяки при получении last hidden states)