In [None]:
%load_ext autotime
%matplotlib inline

In [None]:
# --- Experiment identity ----------------------------------------------------
version = 'v1_train_v1_test_v1_clickhouse_ru_repeat'
description = 'Without delimeters. Train is unbalanced.'

# Dataset to run on. Alternatives kept for quick switching:
#   'ru_python_2018-2019.labelled.combined'
#   'kinota1k.labelled.combined'
dataset_name = 'clickhouse_ru.labelled.combined'

# --- Directory layout -------------------------------------------------------
original_dataset_dir = './datasets/'

# Everything produced by this experiment lives under BASE_PATH.
BASE_PATH = f'./models/bertForSeqClassification/{version}/'
model_path = f'{BASE_PATH}model/'
cross_validation_path = f'{BASE_PATH}cross_validation/'

# Datasets are read in place from the original directory instead of from a
# per-experiment copy (BASE_PATH + 'dataset/').
dataset_path = original_dataset_dir

# Sub-directories (relative to original_dataset_dir) that hold each split;
# empty means the files sit directly in original_dataset_dir.
train_directory = ""
test_directory = ""

# --- Input files ------------------------------------------------------------
train_file_name = dataset_name + ".restore_dialogs.bert_train_v1.csv"
test_file_name = dataset_name + ".restore_dialogs.test.csv"

train_path = f'{dataset_path}{train_file_name}'
test_path = f'{dataset_path}{test_file_name}'
# Cache file for the tokenized train set: same name with the 4-character
# '.csv' suffix replaced by '.tokenized.csv'.
train_tokenized_path = train_path[:-len('.csv')] + '.tokenized.csv'

# True  -> preprocess + tokenize the train set and write the cache file.
# False -> load the previously written cache file instead.
should_tokenize_train = True

In [None]:
! mkdir $BASE_PATH
! mkdir $model_path
! mkdir $dataset_path
! mkdir $cross_validation_path
! cp $original_dataset_dir$train_directory$train_file_name $train_path
! cp $original_dataset_dir$test_directory$test_file_name $test_path
! ls -lah $dataset_path
! echo $description > $BASE_PATH"readme.txt"

In [None]:
! nvidia-smi

In [None]:
import pandas as pd
import os
os.environ["CUDA_VISIBLE_DEVICES"]="7"
import matplotlib.pyplot as plt

import tqdm
tqdm.tqdm_notebook().pandas()

import torch
from transformers import *

from sklearn.utils import shuffle
import ast

from metrics import *
from prepare import *
from utils import *

In [None]:
def balance(df, random_state=None):
    """Down-sample the majority class so labels 0 and 1 are equally frequent.

    Rows whose ``label`` is neither 0 nor 1 are dropped. The result is
    shuffled and re-indexed from 0.

    Args:
        df: DataFrame with a ``label`` column.
        random_state: Optional seed (or random state object accepted by
            ``DataFrame.sample``) used for both the per-class sampling and
            the final shuffle. ``None`` (default) keeps the previous
            non-deterministic behaviour.

    Returns:
        A new DataFrame with ``2 * min(#label==0, #label==1)`` rows. The
        previous row index is kept as a leading ``index`` column (behaviour
        of ``reset_index()`` preserved for existing callers).
    """
    negatives = df[df['label'] == 0]
    positives = df[df['label'] == 1]
    min_size = min(negatives.shape[0], positives.shape[0])

    sampled = pd.concat([
        negatives.sample(min_size, random_state=random_state),
        positives.sample(min_size, random_state=random_state),
    ])
    # sample(frac=1) returns all rows in random order, i.e. a shuffle.
    return sampled.sample(frac=1, random_state=random_state).reset_index()

In [None]:
if should_tokenize_train:
    train_df = pd.read_csv(train_path)
    print("Size before balancing", train_df.shape[0])
    train_df = balance(train_df)
    print("Size after balancing", train_df.shape[0])

    # Both message columns get the same treatment: missing values become
    # empty strings, text is lower-cased, then prepare_special_tokens is applied.
    for message_column in ('message1', 'message2'):
        lowered = train_df[message_column].fillna('').apply(lambda text: text.lower())
        train_df[message_column] = lowered.progress_apply(prepare_special_tokens)

    if 'new_text_with_SEP_tag' in train_df.columns:
        # The joined text already exists in the file: normalise it in place.
        lowered = train_df['new_text_with_SEP_tag'].apply(lambda text: text.lower())
        train_df['new_text_with_SEP_tag'] = lowered.progress_apply(prepare_special_tokens)
    else:
        # Build the joined text from the two (already normalised) messages.
        train_df['new_text_with_SEP_tag'] = train_df.progress_apply(
            lambda row: join_sep(row['message1'], row['message2']), axis=1)

    train_df = shuffle(train_df)
    verify_dataset(train_df)

    print("Speakers:", get_speakers_number(train_df))
    train_df.head()

In [None]:
# Load the test split from test_path and show which columns it provides.
test_df = pd.read_csv(test_path)
test_df.columns

In [None]:
test_df.head(10)

In [None]:
if 'new_text_with_SEP_tag' not in test_df.columns:
    # No pre-joined text in the file: lower-case both messages (missing values
    # become empty strings) and join them.
    for message_column in ('message1', 'message2'):
        test_df[message_column] = test_df[message_column]\
            .fillna('')\
            .apply(lambda text: text.lower())
    test_df['new_text_with_SEP_tag'] = test_df.progress_apply(
        lambda row: join_sep(row['message1'], row['message2']), axis=1)

# Whether loaded or freshly built, the joined text is lower-cased and passed
# through prepare_special_tokens.
lowered_text = test_df['new_text_with_SEP_tag'].apply(lambda text: text.lower())
test_df['new_text_with_SEP_tag'] = lowered_text.progress_apply(prepare_special_tokens)

print('Speakers', get_speakers_number(test_df))
test_df.head()

In [None]:
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-multilingual-uncased',
    do_basic_tokenize=False,
)

# Sanity check: each placeholder must encode to exactly 3 ids
# (presumably [CLS] + one vocabulary id + [SEP] -- confirm), i.e. it is not
# split into several word pieces.
for reserved_token in ('[unused98]', '[unused99]'):
    assert len(tokenizer.encode(reserved_token)) == 3

In [None]:
pd.set_option('display.max_colwidth', None)

In [None]:
train_df.tail(10)

In [None]:
test_df.head(10)
# [['new_text_with_SEP_tag']]

In [None]:
# tokenizer.encode('[JOIN]')
# tokenizer.encode('[join]')

In [None]:
if not should_tokenize_train:
    # Reuse the cached tokenization; the id lists were written to CSV as
    # their string repr, so parse them back into Python lists.
    train_df = pd.read_csv(train_tokenized_path)
    train_df['tokenized_text'] = train_df['tokenized_text'].progress_apply(ast.literal_eval)
    train_df.head()
else:
    # Tokenize the joined text and cache the result for later runs.
    train_df['tokenized_text'] = train_df['new_text_with_SEP_tag'].progress_apply(tokenizer.encode)
    train_df.to_csv(train_tokenized_path, index=False)

In [None]:
# How many tokenized message pairs fit into 512 tokens (True) vs. exceed it (False)
(train_df['tokenized_text'].apply(len) <= 512).value_counts()

In [None]:
# Number of positive and negative examples in the training set
train_df.label.value_counts()

In [None]:
# Tokenize the joined test text into lists of token ids (with a progress bar).
test_df['tokenized_text'] = test_df['new_text_with_SEP_tag'].progress_apply(tokenizer.encode)

In [None]:
# Number of positive and negative examples in the test set
test_df.label.value_counts()

In [None]:
train_df[['new_text_with_SEP_tag', 'tokenized_text']].head(5)

In [None]:
test_df[['new_text_with_SEP_tag', 'tokenized_text']].head(5)

In [None]:
tokenizer.encode(list(test_df['new_text_with_SEP_tag'])[0])

In [None]:
list(test_df['new_text_with_SEP_tag'])[0]

In [None]:
arr = tokenizer.encode(list(test_df['new_text_with_SEP_tag'])[0].lower())

[tokenizer.convert_ids_to_tokens(x) for x in arr]

In [None]:
def get_metrics(df_test):
    """Sweep decision thresholds 0.01 .. 0.99 and compute metrics for each.

    A row is predicted positive when ``probs > threshold``. Thresholds at
    which there are no true positives are skipped, because precision, recall
    or F1 would require a division by zero there.

    Args:
        df_test: DataFrame with a ``label`` column (1 = positive) and a
            ``probs`` column holding the predicted probability of class 1.

    Returns:
        DataFrame indexed by ``border_of_prob`` with columns
        ``border_of_prob``, ``recall``, ``precision``, ``f1``, ``accuracy``
        and ``percents_of_positive_preds`` (fraction of rows predicted
        positive). Empty when no threshold yields a true positive.

    Raises:
        KeyError: if ``label`` or ``probs`` is missing. (Previously any error
            was silently swallowed and an empty frame returned.)

    Side effects:
        ``df_test['pred_label']`` is (re)created and always left at 0, as
        callers that persist the frame have come to expect.
    """
    labels = df_test['label']
    probs = df_test['probs']
    total = df_test.shape[0]
    is_positive = labels == 1
    num_pos = int(is_positive.sum())

    _dict = {
        'border_of_prob': [],
        'recall': [],
        'precision': [],
        'f1': [],
        'accuracy': [],
        'percents_of_positive_preds': []
    }
    for i in range(1, 100):
        b = 0.01*i
        # Predictions are recomputed from scratch for every threshold, so no
        # state can leak from one threshold to the next.
        pred_label = (probs > b).astype(int)
        is_pred_positive = pred_label == 1
        num_pred_pos = int(is_pred_positive.sum())
        num_tp = int((is_positive & is_pred_positive).sum())

        # No true positives covers every undefined case: no positives at all
        # (recall), nothing predicted positive (precision), or
        # precision + recall == 0 (F1).
        if num_tp == 0:
            continue

        recall = num_tp / num_pos
        precision = num_tp / num_pred_pos
        accuracy = int((labels == pred_label).sum()) / total
        f1 = 2*recall*precision/(precision + recall)

        _dict['border_of_prob'].append(b)
        _dict['recall'].append(recall)
        _dict['precision'].append(precision)
        _dict['f1'].append(f1)
        _dict['accuracy'].append(accuracy)
        _dict['percents_of_positive_preds'].append(num_pred_pos / total)

    df_test['pred_label'] = 0

    df = pd.DataFrame(_dict)
    df.index = df['border_of_prob']

    return df

In [None]:
def make_prediction(model, df, batch_size, max_length=512):
    """Run ``model`` over ``df`` and store class-1 probabilities in ``df['probs']``.

    Every row is scored, including a trailing batch smaller than
    ``batch_size`` (previously those rows were skipped and kept ``probs == 0``).

    Args:
        model: torch module called as ``model(input_ids)`` whose first output
            is a ``(batch, n_classes)`` logits tensor.
        df: DataFrame with a ``tokenized_text`` column of token-id lists.
        batch_size: Number of rows per forward pass.
        max_length: Fixed sequence length. Shorter rows are right-padded
            with id 0; longer rows keep only their LAST ``max_length`` ids.

    Returns:
        The same ``df`` object, with a float ``probs`` column added or
        overwritten (softmax probability of class 1).

    Notes:
        * Only ``input_ids`` are passed to the model -- no attention mask --
          so padding ids are fed in like ordinary tokens.
        * The model is switched to eval mode for the duration of the call
          and then restored to whatever mode it was in before, so calling
          this from a training loop no longer leaves dropout disabled.
    """
    # Run on whatever device the model lives on instead of assuming CUDA.
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Softmax turns the logits into probabilities.
    sm = torch.nn.Softmax(dim=1)
    # Ceiling division so the last, possibly partial, batch is included.
    n_batches = (df.shape[0] + batch_size - 1) // batch_size
    prob_chunks = []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for i in tqdm.notebook.tqdm(range(n_batches)):
                batch_df = df.iloc[i * batch_size: (i + 1) * batch_size]
                rows = batch_df['tokenized_text'].tolist()
                input_ids = [row + [0] * (max_length - len(row)) if len(row) < max_length
                             else row[len(row) - max_length:]
                             for row in rows]
                input_ids = torch.tensor(input_ids).to(device)
                outputs = model(input_ids)
                prob_chunks.append(sm(outputs[0].cpu())[:, 1])
    finally:
        model.train(was_training)

    # Positional assignment: independent of df's index labels, so it also
    # works for shuffled or non-unique indexes.
    if prob_chunks:
        df['probs'] = torch.cat(prob_chunks).double().numpy()
    else:
        df['probs'] = 0.0
    return df


def make_train(model, train_df, val_df, directory, batch_size=80,
               n_epochs=2, lr=1e-5, test_every_N_steps=200, max_length=512):
    """Fine-tune ``model`` on ``train_df`` and checkpoint the best validation F1.

    Args:
        model: torch module called as ``model(input_ids)`` whose first output
            is the logits tensor.
        train_df: DataFrame with ``tokenized_text`` (token-id lists) and
            ``label`` columns. Rows are consumed in order; a trailing chunk
            smaller than ``batch_size`` is not used.
        val_df: Validation DataFrame, scored with ``make_prediction`` and
            evaluated with ``get_metrics``.
        directory: Path prefix; the checkpoint is written to
            ``directory + "model.pt"``.
        batch_size: Rows per optimisation step (also used for validation).
        n_epochs: Number of passes over ``train_df``.
        lr: Adam learning rate.
        test_every_N_steps: Validate whenever the step index within an epoch
            is a multiple of this value (this includes step 0).
        max_length: Fixed sequence length. Shorter rows are right-padded
            with id 0; longer rows keep only their last ``max_length`` ids.

    Returns:
        ``(loss_list, max_f1_list)``: the training loss of every step and the
        best-over-thresholds validation F1 of every validation run.
    """
    n_batches = train_df.shape[0] // batch_size

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss = torch.nn.CrossEntropyLoss()
    # Run on whatever device the model lives on instead of assuming CUDA.
    device = next(model.parameters()).device

    torch.cuda.empty_cache()
    loss_list = []
    max_f1_list = []
    # Tracked separately from max_f1_list so a NaN F1 (no valid threshold)
    # cannot poison later "is this better?" comparisons.
    best_f1 = float('-inf')

    for _ in range(n_epochs):
        for i in tqdm.notebook.tqdm(range(n_batches)):
            # from_pretrained returns the model in eval mode and validation
            # switches to eval as well, so train mode (dropout on) is
            # enforced explicitly before every optimisation step.
            model.train()
            optimizer.zero_grad()
            batch = train_df.iloc[i * batch_size: (i + 1) * batch_size]
            input_ids = [row + [0] * (max_length - len(row)) if len(row) < max_length
                         else row[len(row) - max_length:]
                         for row in batch['tokenized_text'].tolist()]
            input_ids = torch.tensor(input_ids).to(device)

            outputs = model(input_ids)
            labels = torch.tensor(batch['label'].tolist()).to(device)
            losses = loss(outputs[0], labels)
            loss_list.append(losses.item())

            losses.backward()
            optimizer.step()

            # Validation and saving
            if i % test_every_N_steps == 0:
                val_df = make_prediction(model, val_df, batch_size)
                current_f1 = get_metrics(val_df).f1.max()
                print(current_f1)
                # The first validation always saves; later ones only on a
                # strict improvement. A NaN F1 never counts as one.
                if not max_f1_list or current_f1 > best_f1:
                    torch.save(model.state_dict(), directory + "model.pt")
                    print('Saved')
                if current_f1 > best_f1:
                    best_f1 = current_f1
                max_f1_list.append(current_f1)
    # No validation ran at all (e.g. fewer rows than batch_size): still leave
    # a checkpoint behind so callers can load it.
    if not max_f1_list:
        torch.save(model.state_dict(), directory + "model.pt")
        print('Saved')
    return loss_list, max_f1_list

### cross_val_predict

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

In [None]:
from transformers import DistilBertTokenizer, DistilBertModel

In [None]:
# Number of GPUs exposed to this process: count of comma-separated entries in
# CUDA_VISIBLE_DEVICES (set in the imports cell).
DEVICES_NUMBER = len(os.environ["CUDA_VISIBLE_DEVICES"].split(','))

In [None]:
def cross_val_predict(global_train_df, global_test_df, n_splits=5):
    """Train and evaluate one model per stratified fold of ``global_train_df``.

    For fold number k (1-based), under ``cross_validation_path + 'k/'``:

    * the fold's training part is split 80/20 into train/validation; the
      held-out fold part is the fold test set; all three are saved as CSV;
    * a fresh multilingual BERT classifier is trained with ``make_train`` and
      the checkpoint it wrote (``model.pt``) is loaded back;
    * validation, fold-test and ``global_test_df`` are scored with
      ``make_prediction``, passed through ``calculate_maxprobs`` and saved as
      ``*.probs.csv``;
    * ``calculate_all_cv_metrics`` is called on the three scored frames.

    Args:
        global_train_df: Labelled data to cross-validate on; needs ``label``
            and ``tokenized_text`` columns.
        global_test_df: External test set scored by every fold's model.
        n_splits: Number of stratified folds.

    Returns:
        None. Results are written to disk and printed.
    """
    folds = StratifiedKFold(n_splits=n_splits).split(global_train_df, global_train_df['label'])
    for split_no, (train_index, test_index) in enumerate(folds, start=1):
        print(f'---------------- SPLIT {split_no} ----------------')
        directory = f'{cross_validation_path}{split_no}/'
        create_directory_if_not_exist(directory)

        # Fold data: train/val come from the training indices, test is the
        # held-out part of the fold.
        train_df, val_df = train_test_split(global_train_df.iloc[train_index], test_size=0.2)
        test_df = global_train_df.iloc[test_index]
        for frame, file_name in ((train_df, 'train.csv'),
                                 (val_df, 'val.csv'),
                                 (test_df, 'test.csv')):
            frame.to_csv(directory + file_name, index=False)
        print('Train shape:', train_df.shape,
              'Val shape:', val_df.shape,
              'Test shape:', test_df.shape)

        # Fresh model per fold, wrapped for all visible GPUs.
        model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-uncased')
        model.to('cuda')
        model = torch.nn.DataParallel(model, device_ids=list(range(DEVICES_NUMBER)))

        _loss_list, _max_f1_list = make_train(model, train_df, val_df, directory, batch_size=30)

        # Evaluate with the checkpoint written during training rather than
        # with the weights from the final step.
        model.load_state_dict(torch.load(directory + "model.pt"))

        val_df = calculate_maxprobs(make_prediction(model, val_df, batch_size=128))
        val_df.to_csv(directory + 'val.probs.csv', index=False)

        test_df = calculate_maxprobs(make_prediction(model, test_df, batch_size=128))
        test_df.to_csv(directory + 'test.probs.csv', index=False)

        global_test_df = calculate_maxprobs(make_prediction(model, global_test_df, batch_size=128))
        global_test_df.to_csv(directory + 'global_test.probs.csv', index=False)

        del model

        calculate_all_cv_metrics(directory, val_df, test_df, global_test_df)

        print()
        print()

In [None]:
! nvidia-smi

In [None]:
# train_df['label'].value_counts()
test_df['label'].value_counts()

In [None]:
cross_val_predict(train_df, test_df)

In [None]:
# Report the 'metrics_by_all.unbalanced' results over the cross-validation
# folds: first for each fold's own test part, then for the global test set.
# averaged_metrics_cv is a project helper not defined in this notebook;
# presumably it averages the per-fold metric files -- confirm.
_ = averaged_metrics_cv(cross_validation_path, 'test.metrics_by_all.unbalanced')
print()
print()
print()
_ = averaged_metrics_cv(cross_validation_path, 'global_test.metrics_by_all.unbalanced')

In [None]:
# Same report for the 'metrics_by_all.balanced' results: fold test part
# first, then the global test set.
_ = averaged_metrics_cv(cross_validation_path, 'test.metrics_by_all.balanced')
print()
print()
print()
_ = averaged_metrics_cv(cross_validation_path, 'global_test.metrics_by_all.balanced')

In [None]:
# Same report for the 'metrics_by_max.unbalanced' results: fold test part
# first, then the global test set.
_ = averaged_metrics_cv(cross_validation_path, 'test.metrics_by_max.unbalanced')
print()
print()
print()
_ = averaged_metrics_cv(cross_validation_path, 'global_test.metrics_by_max.unbalanced')

In [None]:
# Same report for the 'metrics_by_max.balanced' results: fold test part
# first, then the global test set.
_ = averaged_metrics_cv(cross_validation_path, 'test.metrics_by_max.balanced')
print()
print()
print()
_ = averaged_metrics_cv(cross_validation_path, 'global_test.metrics_by_max.balanced')