In [None]:
import numpy as np 
import pandas as pd 
import os
import torch
import time
import plotly.express as px


from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

# py files for ru bert
from bert_dataset import CustomDataset
from bert_classifier import BertClassifier

# kaggle dirs
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Root directory of the Kaggle input dataset; the training CSV is loaded from it
# via os.path.join(DATA_DIR, ...).
DATA_DIR = "/kaggle/input/sber-text-dataset/"

In [None]:
# Load the preprocessed training data; the first CSV column becomes the index.
df = pd.read_csv(os.path.join(DATA_DIR, "train_data_preprocessed.csv"), index_col=0)
# NOTE(review): the separate validation file is NOT loaded (the line below is disabled),
# so `df_val` only comes into existence at the train/validation split further down.
# df_val = pd.read_csv(os.path.join(DATA_DIR, "val_data_preprocessed.csv"), index_col=0) # val 50/50 label split

In [None]:
# Optionally merge the separately stored validation file back into `df` to
# recreate the original, unsplit dataset.
#
# The load of `val_data_preprocessed.csv` is commented out earlier in the notebook,
# so concatenating `df_val` here raised NameError on a fresh kernel (and, on a
# re-run, silently picked up the `df_val` produced by the later train/validation
# split). Reading the file explicitly behind a flag keeps this cell safe under
# Restart & Run All.
MERGE_VAL_FILE = False  # set True only if train_data_preprocessed.csv excludes the validation rows

if MERGE_VAL_FILE:
    df_val_file = pd.read_csv(os.path.join(DATA_DIR, "val_data_preprocessed.csv"), index_col=0)
    # Appends the validation rows to `df`; run this cell only once per load of `df`.
    df = pd.concat([df, df_val_file])

In [None]:
# Hold out 20% of the rows as the test set; random_state=42 fixes the shuffle so the
# split is reproducible.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# Carve a validation set (20% of the current training rows) out of df_train.
# No `stratify` argument is passed, so label proportions are not forced to match.
# NOTE(review): this cell rebinds df_train from itself, so re-running it without
# re-running the train/test split first shrinks the training set again.
df_train, df_val = train_test_split(df_train, test_size=0.2, random_state=42) # split keeps the natural class imbalance

In [None]:
# Bar chart of label counts in the test split, with the count printed on each bar.
fig = (
    px.histogram(df_test[['label']], x='label', text_auto=True)
    .update_layout(
        bargap=0.3,
        xaxis={'tickmode': 'linear'},
        title='Test label distribution',
    )
)
fig.show()

In [None]:
# Bar chart of label counts in the training split, with the count printed on each bar.
fig = (
    px.histogram(df_train[['label']], x='label', text_auto=True)
    .update_layout(
        bargap=0.3,
        xaxis={'tickmode': 'linear'},
        title='Train label distribution',
    )
)
fig.show()

In [None]:
# Bar chart of label counts in the validation split, with the count printed on each bar.
fig = (
    px.histogram(df_val[['label']], x='label', text_auto=True)
    .update_layout(
        bargap=0.3,
        xaxis={'tickmode': 'linear'},
        title='Val label distribution',
    )
)
fig.show()

Possible class weight tuning

In [None]:
# Balanced class weights, derived from the TRAINING labels only.
# The weights are handed to BertClassifier (presumably for loss weighting), so
# computing them from the full frame — as was done before — let the label
# frequencies of the validation and test rows influence training.
y = df_train['label'].values
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
# Convert the NumPy array of weights into a float tensor.
class_weights = torch.tensor(class_weights, dtype=torch.float)

In [None]:
# Optional manual re-weighting (disabled): would scale the two class weights by 1 and 2.
# class_weights = class_weights * torch.tensor((1, 2))
# Display the weights that are passed to the classifier.
class_weights

In [None]:
# Timestamp that gives every training run its own checkpoint file name.
train_start_time = int(time.time())

# Pretrained checkpoint to fine-tune. Known alternatives:
#   lightweight: 'cointegrated/rubert-tiny2'
#   deeppavlov:  'DeepPavlov/rubert-base-cased-sentence'
#
# These variables used to be set to the DeepPavlov checkpoint but were never used:
# the constructor below received hard-coded rubert-tiny2 strings instead. They now
# hold the checkpoint that is actually trained (rubert-tiny2, the same one whose
# tokenizer is loaded in the inference section) and are passed to the constructor,
# so switching models is a one-place change.
model_path = 'cointegrated/rubert-tiny2'
tokenizer_path = 'cointegrated/rubert-tiny2'

classifier = BertClassifier(
        model_path=model_path,
        tokenizer_path=tokenizer_path,
        n_classes=2,
        class_weights=class_weights,
        epochs=10,
        model_save_path=f'./bert_time_{train_start_time}.pt'
)

In [None]:
# Hand the training and validation texts/labels to the classifier as plain Python lists.
classifier.preparation(
    X_train=df_train['sentence_preprocess'].tolist(),
    y_train=df_train['label'].tolist(),
    X_valid=df_val['sentence_preprocess'].tolist(),
    y_valid=df_val['label'].tolist(),
)

In [None]:
# Start training; the procedure itself is implemented in BertClassifier (bert_classifier.py).
classifier.train()

Test

In [None]:
# Score every preprocessed test sentence with the trained classifier, one text at a time.
X_test = df_test['sentence_preprocess']
y_test = df_test['label']
pred = [classifier.predict(text_item) for text_item in X_test]

In [None]:
# Confusion matrix of the test predictions. In scikit-learn's convention rows are the
# true labels and columns are the predicted labels.
baseline_confusion_matrix = metrics.confusion_matrix(y_test, pred)
baseline_confusion_matrix

In [None]:
# Heat-map view of the confusion matrix with the counts written on each tile.
fig = px.imshow(
    baseline_confusion_matrix,
    text_auto=True,
    title='Baseline confusion matrix',
).update_layout(
    xaxis={'tickmode': 'linear'},
    yaxis={'tickmode': 'linear'},
)
fig.show()

In [None]:
# Test-set summary: raw confusion-matrix counts (indexed [true label, predicted label])
# followed by the standard binary classification scores.
classifier_dict = dict(
    test_size=len(X_test),
    TP=baseline_confusion_matrix[1, 1],
    TN=baseline_confusion_matrix[0, 0],
    FP=baseline_confusion_matrix[0, 1],
    FN=baseline_confusion_matrix[1, 0],
    precision=metrics.precision_score(y_test, pred),
    recall=metrics.recall_score(y_test, pred),
    accuracy=metrics.accuracy_score(y_test, pred),
    F1=metrics.f1_score(y_test, pred),
)
classifier_dict

### Possible inference

In [None]:
# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# NOTE(review): this is the checkpoint of an earlier run. To evaluate a freshly
# trained model, point it at the file named by `model_save_path` in the training
# cell (`./bert_time_<train_start_time>.pt`) — presumably written there by BertClassifier.
path_to_model = './bert_time_1668629455.pt'

# map_location remaps the stored tensors onto `device`, so a checkpoint saved on a GPU
# can also be loaded on a CPU-only machine (without it torch.load fails in that case).
# NOTE: torch.load unpickles the file, which can execute arbitrary code — only load
# checkpoints you created yourself or otherwise trust.
model_classifier = torch.load(path_to_model, map_location=device).to(device)
model_classifier.eval()
tokenizer = BertTokenizer.from_pretrained('cointegrated/rubert-tiny2')

In [None]:
def inference_predict(model, tokenizer, text, device, max_length=None):
    """Predict the class label of a single text.

    Args:
        model: Callable classification model. It is called with a tensor of
            token ids built from a one-element batch and must return an
            object exposing ``.logits``.
        tokenizer: Tokenizer providing ``tokenize`` and ``convert_tokens_to_ids``.
        text: Input string to classify.
        device: Torch device the input tensor is moved to (should be the device
            the model lives on).
        max_length: Optional cap on the total sequence length, counting the
            ``[CLS]`` and ``[SEP]`` markers. ``None`` (the default) disables
            truncation, which is the original behaviour.

    Returns:
        Index of the largest logit for the single input, as a NumPy integer scalar.
    """
    tokens = tokenizer.tokenize(text)
    if max_length is not None:
        # Reserve two positions for the special tokens added below.
        tokens = tokens[:max(max_length - 2, 0)]
    tokens = ["[CLS]"] + tokens + ["[SEP]"]
    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Inference only: no autograd bookkeeping is needed.
    with torch.no_grad():
        # Call the `model` argument; the previous version ignored it and silently
        # used the notebook-global `model_classifier` instead.
        model_pred = model(torch.tensor([input_ids]).to(device))
    model_pred_label = torch.argmax(model_pred.logits, dim=1).cpu().numpy()[0]

    return model_pred_label

In [None]:
# Spot-check on one test sentence; per the original note, df_test['label'].values[3] == 1.
# NOTE(review): this feeds the raw 'sentence' column, whereas training and the batch
# prediction use 'sentence_preprocess' — confirm that is intended.
custom_text = df_test['sentence'].iloc[3]
inference_predict(model_classifier, tokenizer, custom_text, device)

In [None]:
# Spot-check on one test sentence; per the original note, df_test['label'].values[0] == 0.
# NOTE(review): this feeds the raw 'sentence' column, whereas training and the batch
# prediction use 'sentence_preprocess' — confirm that is intended.
custom_text = df_test['sentence'].iloc[0]
inference_predict(model_classifier, tokenizer, custom_text, device)

Adding a prediction column to the test DataFrame

In [None]:
# Raw test data that the predictions are attached to; the first CSV column becomes the index.
# The path is built from DATA_DIR (as for the training data) instead of repeating the
# dataset directory, so relocating the dataset is a one-place change.
df_output = pd.read_csv(os.path.join(DATA_DIR, 'test_data.csv'), index_col=0)

In [None]:
# Preprocessed version of the test data; the first CSV column becomes the index.
# The path is built from DATA_DIR (as for the training data) instead of repeating the
# dataset directory, so relocating the dataset is a one-place change.
df_output_preprocessed = pd.read_csv(os.path.join(DATA_DIR, 'test_data_preprocessed.csv'), index_col=0)

In [None]:
# Print a summary (columns, dtypes, non-null counts) of the raw test frame.
df_output.info()

In [None]:
# Predict a label for every preprocessed sentence of the output set.
# NOTE: X_test and pred are rebound here and no longer refer to the evaluation split.
X_test = df_output_preprocessed['sentence_preprocess']
pred = [
    inference_predict(model_classifier, tokenizer, text_item, device)
    for text_item in X_test
]

In [None]:
# Distinct predicted labels together with how often each one occurs.
np.unique(pred, return_counts=True)

In [None]:
# Store the predictions as a new 'prediction' column.
# NOTE(review): pred was computed from df_output_preprocessed, so this assumes both
# frames have the same rows in the same order — TODO confirm.
df_output['prediction'] = pred

In [None]:
# Write the test data together with the predictions to a CSV in the current directory.
df_output.to_csv('./rubert-tiny2_v3_tag_1668632322_predict.csv')

In [None]:
# Titles of all rows that were predicted as class 1.
df_output.loc[df_output['prediction'] == 1, 'title'].values