In [None]:
import numpy as np
import pandas as pd
#from nltk import word_tokenize, sent_tokenize
from collections import defaultdict
import string
import re
import os

In [None]:
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
path = '/content/drive/My Drive/'
path_model='/content/drive/My Drive/rubert'
path_dataset = os.path.join(path, "dataset")
path_data = os.path.join(path, "data")

In [None]:
import re
def preprocess_text_tags(text):
    if isinstance(text, (int, float)):
      return ''
    text = text.lower().replace("ё", "е")
    text = re.sub('((www\.[^\s]+)|(http[s]?://[^\s]+))','url', text)     
    text = re.sub('@[^\s]+','user', text)
    text = re.sub('\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,4}','email', text)   
    text = re.sub('(?:\#+[\w_]+[\w\'_\-]*[\w_]+)','hashtag', text)
    text = re.sub('(?:(?:\d+,?)+(?:\.?\d+)?)','num', text)
    #text = re.sub('[^a-zA-Zа-яА-Я1-9]+', ' ', text)
    text = re.sub(' +',' ', text)
    return ' '.join(text.split()) 

In [None]:
#stops = set(stopwords.words("russian"))

def clean_text(data):
    delstops = True
    simple_filter = True
#    del12gram = True  # True = убираем обрывки слов в 1-2 символа
        
 #   if delstops:
 #       data = " ".join([w for w in data.split() if w not in stops])
    
    if simple_filter:
        data = preprocess_text_tags(data)
    
#    if del12gram:
#        data = " ".join([w for w in data.split() if len(w) > 2])
    return data

In [None]:
def tag2num(tag):
    """
    document convert to vector (sum of token)
    """
    ans=0
    if tag=='PSTV':
        ans=1
    elif tag=='NGTV':
        ans=-1
    return ans

In [None]:
def toxic2num(tag):
    ans=0
    if tag==1.0:
        ans=0
    elif tag==0.0:
        ans=1
    return ans

In [None]:
def toxic_dataset():
    path_no1 = os.path.join(path_dataset, 'toxic.csv')
    df = pd.read_csv(path_no1)      
    df['clear_text'] = df['comment'].map(clean_text)
    df['tone'] = df['toxic'].apply(toxic2num)
    df=df[['clear_text','tone']]
    return df

In [None]:
def tone_dataset():
    path_no1 = os.path.join(path_dataset, 'tone_unfiltered.csv')
    df = pd.read_csv(path_no1)
    df['clear_text'] = df['text'].map(clean_text)
    return df

In [None]:
def marked_dataset():
    path_no1 = os.path.join(path_data, 'marked_tonal.csv')
    df = pd.read_csv(path_no1)
    df['clear_text'] = df['MESSAGE'].map(clean_text)
    df.rename(columns={"Neutral": "tone"},inplace=True)
    df.loc[df['tone']==2,'tone']=1
    #df=df[df.tone<2]
    return df

In [None]:
def check_dataset():
    path_no1 = os.path.join(path_data, 'check_tonal.csv')
    df = pd.read_csv(path_no1)
    df['clear_text'] = df['MESSAGE'].map(clean_text)
    df.rename(columns={"Neutral": "tone"},inplace=True)
    #df=df[df.tone<2]
    return df

In [None]:
def open_dataset():
    #dtype = {'clear_text': str, 'tone': np.int64}
    path_no1 = os.path.join(path_dataset, 'telecom_total.csv')
    df = pd.read_csv(path_no1)
    df['clear_text'] = df['clear_text'].map(clean_text)
    #df.rename(columns={"col": "clear_text", "code": "tone"},inplace=True)
    #df=df[["clear_text","tone"]]
    return df

In [None]:
def predict_dataset():
    path_no1 = os.path.join(path_data, 'to_mark_negative.csv')
    df = pd.read_csv(path_no1)
    df['clear_text'] = df['MESSAGE'].map(clean_text)
    #df=df[df.spam==0]
    return df

In [None]:
def marked_topics():
    path_no1 = os.path.join(path_dataset, 'marked_topics.csv')
    df = pd.read_csv(path_no1)
    df['clear_text'] = df['text'].map(clean_text)
    return df

In [None]:
df=marked_topics()
df['clear_text'].replace('', np.nan, inplace=True)
df.dropna(subset=['clear_text'], inplace=True)

In [None]:
import torch

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla P100-PCIE-16GB


In [None]:
%%time
import torch
import torch.nn as nn
#from transformers import BertModel
from transformers import AutoModel

# Create the BertClassfier class
class BertClassifier(nn.Module):
    """Bert Model for Classification Tasks.
    """
    def __init__(self, freeze_bert=False):
        """
        @param    bert: a BertModel object
        @param    classifier: a torch.nn.Module classifier
        @param    freeze_bert (bool): Set `False` to fine-tune the BERT model
        """
        super(BertClassifier, self).__init__()
        # Specify hidden size of BERT, hidden size of our classifier, and number of labels
        D_in, H, D_out = 768, 100, 2

        # Instantiate BERT model
        #self.bert = BertModel.from_pretrained(path_model)
        self.bert = AutoModel.from_pretrained(path_model)
        # Instantiate an one-layer feed-forward classifier

        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(H, D_out)
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def mean_pooling(self, model_output, attention_mask):
        token_embeddings = model_output[0] #First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask
        
    def forward(self, input_ids, attention_mask):
        """
        Feed input to BERT and the classifier to compute logits.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                      max_length)
        @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                      information with shape (batch_size, max_length)
        @return   logits (torch.Tensor): an output tensor with shape (batch_size,
                      num_labels)
        """
        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        #print("output shape="+str(outputs.shape))
        # Extract the last hidden state of the token `[CLS]` for classification task
        sentence_embeddings = self.mean_pooling(outputs, attention_mask)
        #last_hidden_state_cls = outputs[0][:, 0, :]
        #print("last_hidden shape="+str(last_hidden_state_cls))
        # Feed input to classifier to compute logits
        #logits = self.classifier(last_hidden_state_cls)

        return sentence_embeddings

CPU times: user 1.48 s, sys: 269 ms, total: 1.74 s
Wall time: 1.63 s


In [None]:
    # Instantiate Bert Classifier
model = BertClassifier(freeze_bert=True)

    # Tell PyTorch to run the model on GPU
model.to(device)

In [None]:
MAX_LEN = 512
#from pytorch_transformers import BertTokenizer
#from transformers import BertTokenizer
from transformers import AutoTokenizer

# Load the BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained(path_model)
#tokenizer = BertTokenizer.from_pretrained(path_model)

# Create a function to tokenize a set of texts
def preprocessing_for_bert(data):
    """Perform required preprocessing steps for pretrained BERT.
    @param    data (np.array): Array of texts to be processed.
    @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return   attention_masks (torch.Tensor): Tensor of indices specifying which
                  tokens should be attended to by the model.
    """
    # Create empty lists to store outputs
    input_ids = []
    attention_masks = []

    data1=[sent for sent in data]
    encoded_inputs = tokenizer(data1, padding='max_length' ,return_tensors="pt", max_length=MAX_LEN, truncation=True)
    return encoded_inputs.get('input_ids'),encoded_inputs.get('attention_mask')

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
test_inputs, test_masks = preprocessing_for_bert(df.clear_text)

# Create the DataLoader for our test set
test_dataset = TensorDataset(test_inputs, test_masks)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=1)
model.eval()

all_logits = []
#all_logits=torch.tensor((), dtype=torch.float16)

# For each batch in our test set...
for batch in test_dataloader:
    # Load batch to GPU
    b_input_ids, b_attn_mask = tuple(t.to(device) for t in batch)
    b_input_ids.to(device)
    b_attn_mask.to(device)
    with torch.no_grad():
      logits = model(b_input_ids, b_attn_mask)
    all_logits.append(logits.cpu().numpy()[0])
      #all_logits.append(logits)
    
    # Concatenate logits from each batch
    #

    


In [None]:
from sklearn.cluster import AgglomerativeClustering


In [None]:
num_clusters = 2
clustering_model = AgglomerativeClustering(n_clusters=num_clusters)
clustering_model.fit(all_logits)
cluster_assignment = clustering_model.labels_

In [None]:
clusters = [[] for _ in range(len(cluster_assignment))]
for sent_id, cluster_label in enumerate(cluster_assignment):
    clusters[cluster_label].append(df.clear_text[sent_id])
clusters.sort(key=lambda x:len(x), reverse=True)

# Ouput

cnt_gourps = 0
text = ""
for c in range(len(clusters)):
    if clusters[c]:
        text += "\n" + "-"*50 + "\n"
        text += "Cluster:%d\n"%c
        text += "\n".join(clusters[c])
        if len(clusters[c])>=2:
            cnt_gourps += 1 
print(cnt_gourps)
path_txt = os.path.join(path_data, 'test_cluter.txt')
with open(path_txt, "w") as f:
    f.write(text)

2


In [None]:
df.drop_duplicates(inplace=True)
df2.drop_duplicates(inplace=True)
df3.drop_duplicates(inplace=True)
df4.drop_duplicates(inplace=True)
df5.drop_duplicates(inplace=True)

In [None]:
print(df.clear_text[df.tone  == 0].count())
print(df.clear_text[df.tone  == 1].count())

2413
2606


In [None]:
print(df2.clear_text[df2.tone  == -1].count())
print(df2.clear_text[df2.tone  == 0].count())
print(df2.clear_text[df2.tone  == 1].count())

5232
33213
2440


In [None]:
print(df3.clear_text[df3.tone  == 0].count())
print(df3.clear_text[df3.tone  == 1].count())

4826
9586


In [None]:
print(df4.clear_text[df4.tone  == 0].count())
print(df4.clear_text[df4.tone  == 1].count())

816
2184


In [None]:
print(df5.clear_text[df5.tone  == 0].count())
print(df5.clear_text[df5.tone  == 1].count())

416
1295


In [None]:
df_negative=df.clear_text[df.tone  == 0]
df2_negative=df2.clear_text[df2.tone  == -1]
df3_negative=df3.clear_text[df3.tone  == 0]
df4_negative=df4.clear_text[df4.tone  == 0]

In [None]:
negative=pd.concat([df_negative,df2_negative,df3_negative,df4_negative])

In [None]:
df_neutral=df.clear_text[df.tone  == 1]
df2_neutral=df2.clear_text[df2.tone  == 0]
df3_neutral=df3.clear_text[df3.tone  == 1]
df4_neutral=df4.clear_text[df4.tone  == 1]
neutral=pd.concat([df_neutral,df2_neutral,df3_neutral,df4_neutral])
neutral.drop_duplicates(inplace=True)
#neutral=pd.concat([df_neutral,df3_neutral])

In [None]:
num_instances=len(neutral)

In [None]:
msk = [False]*num_instances
msk = np.random.rand(num_instances)<0.3
bln_neutral=neutral[msk]

In [None]:
neutralx=df4.clear_text[df4.tone  == 2]

In [None]:
bln_neutral=pd.concat([bln_neutral,neutralx])

In [None]:
df_neutral=pd.DataFrame(bln_neutral)
df_negative=pd.DataFrame(negative)

In [None]:
df_neutral['tone']=1
df_negative['tone']=0

In [None]:
dfx=pd.concat([df_neutral,df_negative])
num_instances=len(dfx)

In [None]:
train=dfx
test=df5

In [None]:
DATA_COLUMN = 'clear_text'
LABEL_COLUMN = 'tone'

In [None]:
print(len(train))
print(len(test))

23100
1706


In [None]:
print(train.clear_text[train.tone  == 0].count())
print(train.clear_text[train.tone  == 1].count())

11453
11647


In [None]:
print(test.clear_text[test.tone  == 0].count())
print(test.clear_text[test.tone  == 1].count())

415
1291


In [None]:
X=df.clear_text.values

In [None]:
X_train, X_val=train['clear_text'],test['clear_text']
y_train, y_val=train['tone'],test['tone']

In [None]:
import torch

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla P100-PCIE-16GB


In [None]:
MAX_LEN = 300
#from pytorch_transformers import BertTokenizer
#from transformers import BertTokenizer
from transformers import AutoTokenizer

# Load the BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained(path_model)
#tokenizer = BertTokenizer.from_pretrained(path_model)

# Create a function to tokenize a set of texts
def preprocessing_for_bert(data):
    """Perform required preprocessing steps for pretrained BERT.
    @param    data (np.array): Array of texts to be processed.
    @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return   attention_masks (torch.Tensor): Tensor of indices specifying which
                  tokens should be attended to by the model.
    """
    # Create empty lists to store outputs
    input_ids = []
    attention_masks = []

    data1=[sent for sent in data]
    encoded_inputs = tokenizer(data1, padding='max_length' ,return_tensors="pt", max_length=MAX_LEN, truncation=True)
    return encoded_inputs.get('input_ids'),encoded_inputs.get('attention_mask')

In [None]:
# Concatenate train data and test data
all_tweets = np.concatenate([train.clear_text.values, test.clear_text.values])

# Encode our concatenated data
encoded_tweets = [tokenizer.encode(sent, add_special_tokens=True) for sent in all_tweets]

# Find the maximum length
max_len = max([len(sent) for sent in encoded_tweets])
print('Max length: ', max_len)

Max length:  2927


In [None]:
# Specify `MAX_LEN`


# Print sentence 0 and its encoded token ids
token_ids = list(preprocessing_for_bert([X[0]])[0].squeeze().numpy())
print('Original: ', X[0])
print('Token IDs: ', token_ids)

# Run function `preprocessing_for_bert` on the train set and the validation set
print('Tokenizing data...')
train_inputs, train_masks = preprocessing_for_bert(X_train)
val_inputs, val_masks = preprocessing_for_bert(X_val)

Original:  талгар алматинской обл. инагда скорост прападает и зависает
Token IDs:  [101, 15459, 9013, 76799, 47787, 7792, 852, 14181, 132, 2789, 3091, 2235, 21342, 31957, 22861, 851, 17162, 1828, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Convert other data types to torch.Tensor
train_labels = torch.tensor(y_train.to_numpy())
val_labels = torch.tensor(y_val.to_numpy())

# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size = 32

# Create the DataLoader for our training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

def initialize_model(epochs=4):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.
    """
    # Instantiate Bert Classifier
    bert_classifier = BertClassifier(freeze_bert=False)

    # Tell PyTorch to run the model on GPU
    bert_classifier.to(device)

    # Create the optimizer
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=1e-5,    # Default learning rate
                      eps=1e-8    # Default epsilon value
                      )

    # Total number of training steps
    total_steps = len(train_dataloader) * epochs
    warmup_steps = len(train_dataloader) * 2   ### https://www.kaggle.com/snnclsr/learning-rate-schedulers#get_linear_schedule_with_warmup (total_samples // bs) * 20

    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_steps, # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

In [None]:
import random
import time

# Specify loss function
loss_fn = nn.CrossEntropyLoss()

def set_seed(seed_value=42):
    """Set seed for reproducibility.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    best_valid_loss =  0.630309 #float('inf')
    """Train the BertClassifier model.
    """
    # Start training loop
    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'learn rate':^12} | {'Elapsed':^9}")
        print("-"*82)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts +=1
            # Load batch to GPU
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()
            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)

            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {scheduler.get_last_lr()[0]:^2.10f} |  {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*82)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation == True:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)

            if val_loss < best_valid_loss:
              best_valid_loss = val_loss
              torch.save(bert_classifier.state_dict(),'/content/drive/My Drive/tonal_model.pt') 

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch
            
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {'-':^12} | {time_elapsed:^9.2f}")
            print("-"*82)
        print("\n")
    
    print("Training complete!")


def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_loss = []

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        # Compute loss
        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()

        # Calculate the accuracy rate
        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy

In [None]:
#from transformers import AutoModel
#model = AutoModel.from_pretrained(path_model)

In [None]:
set_seed(42)    # Set seed for reproducibility
bert_classifier, optimizer, scheduler = initialize_model(epochs=50)
train(bert_classifier, train_dataloader, val_dataloader, epochs=50, evaluation=True)

In [None]:
# Get all of the model's parameters as a list of tuples.
params = list(bert_classifier.named_parameters())
print('The BERT model has {:} different named parameters.\n'.format(len(params)))
print('==== Embedding Layer ====\n')
for p in params[0:5]:
   print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== First Transformer ====\n')
for p in params[5:21]:
   print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== Output Layer ====\n')
for p in params[-4:]:
   print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

In [None]:
#inputs = tokenizer("Ты мне нравишься. Я тебя люблю", return_tensors="pt")
#outputs = model(**inputs)
#outputs

In [None]:
import torch.nn.functional as F
def bert_predict(model, test_dataloader):
    """Perform a forward pass on the trained BERT model to predict probabilities
    on the test set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    all_logits = []

    # For each batch in our test set...
    for batch in test_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask = tuple(t.to(device) for t in batch)[:2]

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
        all_logits.append(logits)
    
    # Concatenate logits from each batch
    all_logits = torch.cat(all_logits, dim=0)

    # Apply softmax to calculate probabilities
    probs = F.softmax(all_logits, dim=1).cpu().numpy()

    return probs

In [None]:
from sklearn.metrics import accuracy_score, roc_curve, auc
from sklearn import metrics
import matplotlib.pyplot as plt

def evaluate_model(probs, y_true):
    """
    - Print AUC and accuracy on the test set
    - Plot ROC
    @params    probs (np.array): an array of predicted probabilities with shape (len(y_true), 2)
    @params    y_true (np.array): an array of the true values with shape (len(y_true),)
    """
    #preds = probs[:, 1]
    #print(probs)
    y_pred = np.argmax(probs, axis=1)
    #accuracy = accuracy_score(y_true, y_pred)
    #print(f'Accuracy: {accuracy*100:.2f}%')    
    #print(metrics.confusion_matrix(y_pred, y_true))
    print(metrics.classification_report(y_pred, y_true, digits=3)) 
   

In [None]:
bert_classifier.load_state_dict(torch.load('/content/drive/My Drive/tonal_model.pt', map_location=device))
# Compute predicted probabilities on the test set
probs = bert_predict(bert_classifier, val_dataloader)
# Evaluate the Bert classifier
evaluate_model(probs, y_val)

              precision    recall  f1-score   support

           0      0.817     0.660     0.730       514
           1      0.864     0.936     0.899      1192

    accuracy                          0.853      1706
   macro avg      0.841     0.798     0.814      1706
weighted avg      0.850     0.853     0.848      1706

