In [1]:
from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from torch.optim import AdamW
from torch.utils import data
from sklearn import metrics
import torch.nn as nn
import pandas as pd
import numpy as np
import argparse
import logging
import random
import torch
import sys
import os

In [2]:
import sys

sys.path.append('/Users/annapalatkina/Desktop/big_5_PT')

In [3]:
def seed_everything(seed_value=42):
    os.environ["PYTHONHASHSEED"] = str(seed_value)
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    return seed_value

logging.basicConfig(
    format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO
)
logger = logging.getLogger(__name__)

_ = seed_everything(42)
logger.info(f"Training with seed {42}...")
model_name = 'xlm-roberta-base'
logger.info(f"Model name {model_name}...")
tokenizer_name = 'xlm-roberta-base'
logger.info(f"Tokenizer name {tokenizer_name}...")
prediction_column = 'Экстраверсия_5'
logger.info(f"Prediction column {prediction_column}...")
LR = 4e-5
logger.info(f"Learning rate {LR}...")
dropout = 0.3
logger.info(f"Dropout  {dropout}...")
epochs = 1
logger.info(f"Epochs {epochs}...")
maxl = 10
logger.info(f"Max length {maxl}...")
minl = 0
logger.info(f"Min length {minl}...")
bsize = 4
logger.info(f"Batch size {bsize}...")
warmup_steps = 1000
logger.info(f"Warmup steps {warmup_steps}...")
freeze = False
logger.info(f"Freeze {freeze}...")

2023-05-24 22:52:56,131 : INFO : Training with seed 42...
2023-05-24 22:52:56,132 : INFO : Model name xlm-roberta-base...
2023-05-24 22:52:56,132 : INFO : Tokenizer name xlm-roberta-base...
2023-05-24 22:52:56,132 : INFO : Prediction column Экстраверсия_5...
2023-05-24 22:52:56,133 : INFO : Learning rate 4e-05...
2023-05-24 22:52:56,133 : INFO : Dropout  0.3...
2023-05-24 22:52:56,133 : INFO : Epochs 1...
2023-05-24 22:52:56,134 : INFO : Max length 10...
2023-05-24 22:52:56,135 : INFO : Min length 0...
2023-05-24 22:52:56,135 : INFO : Batch size 4...
2023-05-24 22:52:56,136 : INFO : Warmup steps 1000...
2023-05-24 22:52:56,136 : INFO : Freeze False...


In [4]:
def encoder(labels, texts, cur_tokenizer, cur_device):
    labels_tensor = torch.tensor(labels, dtype=torch.long).to(cur_device)
    encoding = cur_tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=maxl,
    ).to(cur_device)
    return labels_tensor, encoding

def create_ids_mask(dataset):
    texts = dataset.post_text.to_list()
    labels = dataset[prediction_column].to_list()
    labels_tensor, encoding = encoder(labels, texts, tokenizer, device)
    input_ids = encoding["input_ids"]
    attention_mask = encoding["attention_mask"]

    return data.TensorDataset(input_ids, attention_mask, labels_tensor) 

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}...")
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

dataset = pd.read_csv('/Users/annapalatkina/Desktop/big_5_PT/data/df_texts_ocean.csv')
dataset['lengths'] = dataset.post_text.apply(lambda x: len(str(x).split()))
df = dataset.query('lengths > @minl')[['post_text', prediction_column]]

train_data, valid_data = train_test_split(df, test_size=0.3, stratify=df[prediction_column], random_state=42)
valid_data, test_data = train_test_split(valid_data, test_size=0.5, stratify=valid_data[prediction_column], random_state=42)

# Oversampling
# !!!!!!!!!! СДЕЛАТЬ ОВЕРСЭМПЛИНГ ДЛЯ КАЖДОЙ ЧЕРТЫ РАЗНЫЙ !!!!!!!!!!
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
train_data = pd.concat([train_data,
                    train_data.query('Экстраверсия_5 == 0'),
                    train_data.query('Экстраверсия_5 == 0'),
                    train_data.query('Экстраверсия_5 == 4'),
                    train_data.query('Экстраверсия_5 == 4'),
                    train_data.query('Экстраверсия_5 == 4'),
                    train_data.query('Экстраверсия_5 == 4')])
    
train_data = shuffle(train_data)
train_data.reset_index(inplace=True, drop=True)
valid_data.reset_index(inplace=True, drop=True)
test_data.reset_index(inplace=True, drop=True)

logger.info(f"Train dataset with oversampling shape: {train_data.shape}...") 
logger.info(f"Valid dataset shape: {valid_data.shape}...")
logger.info(f"Test dataset shape: {test_data.shape}...")

num_classes = train_data[prediction_column].nunique()
logger.info(f"We have {num_classes} classes")

logger.info(f"Tokenizing with max length {maxl}...")
train_dataset = create_ids_mask(train_data)
train_iter = data.DataLoader(train_dataset, batch_size=bsize, shuffle=True) 

dev_dataset = create_ids_mask(valid_data)
dev_iter = data.DataLoader(dev_dataset, batch_size=bsize, shuffle=False)

test_dataset = create_ids_mask(test_data)
test_iter = data.DataLoader(test_dataset, batch_size=bsize, shuffle=False)
logger.info("Tokenizing finished.")

2023-05-24 22:52:56,180 : INFO : Using device: cpu...
2023-05-24 22:53:00,862 : INFO : Train dataset with oversampling shape: (214824, 2)...
2023-05-24 22:53:00,862 : INFO : Valid dataset shape: (37459, 2)...
2023-05-24 22:53:00,863 : INFO : Test dataset shape: (37460, 2)...
2023-05-24 22:53:00,865 : INFO : We have 5 classes
2023-05-24 22:53:00,865 : INFO : Tokenizing with max length 10...
2023-05-24 22:53:24,483 : INFO : Tokenizing finished.


In [6]:
for input_ids, attention_mask, labels_tensor in train_iter:
  break

In [7]:
from models import RNN
model = RNN(model_name=model_name,num_classes=num_classes,freeze_bert=True, attention=True).to(device)
model(input_ids, attention_mask)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor([[ 0.0708, -0.0036, -0.0123, -0.0418,  0.0447],
        [ 0.0810, -0.0207, -0.0213, -0.0324,  0.0488],
        [ 0.0722, -0.0138, -0.0271, -0.0307,  0.0479],
        [ 0.0763, -0.0066, -0.0184, -0.0268,  0.0353]],
       grad_fn=<AddmmBackward0>)

In [8]:
from models import CNN
model = CNN(model_name=model_name,num_classes=num_classes,freeze_bert=True, attention=True).to(device)
model(input_ids, attention_mask)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor([[ 0.4211,  0.3321, -0.2421,  0.0454, -0.0578],
        [ 0.4188,  0.3093, -0.2395,  0.0343, -0.0471],
        [ 0.4185,  0.3121, -0.2403,  0.0360, -0.0484],
        [ 0.4224,  0.3311, -0.2408,  0.0443, -0.0574]],
       grad_fn=<AddmmBackward0>)

In [9]:
from models import CNN_LSTM
model = CNN_LSTM(model_name=model_name,num_classes=num_classes,freeze_bert=True, attention=True).to(device)
model(input_ids, attention_mask)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor([[ 0.1112, -0.0197, -0.2082,  0.0607,  0.1781],
        [ 0.1077, -0.0245, -0.2160,  0.0728,  0.1435],
        [ 0.1138, -0.0374, -0.2074,  0.0559,  0.1829],
        [ 0.0869, -0.0272, -0.2291,  0.0663,  0.1662]],
       grad_fn=<AddmmBackward0>)

In [7]:
from models import Bert_sequence_classifcation
model = Bert_sequence_classifcation(model_name=model_name,num_classes=num_classes).to(device)
model(input_ids, attention_mask)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_p

tensor([[ 0.2331,  0.3372,  0.0433,  0.1241, -0.1168],
        [ 0.2185,  0.3141,  0.0783,  0.1195, -0.1413],
        [ 0.2153,  0.3142,  0.0818,  0.1148, -0.1319],
        [ 0.2131,  0.3121,  0.0721,  0.1236, -0.1338]],
       grad_fn=<AddmmBackward0>)

In [11]:
class BertBilstmClassifier(nn.Module):
    def __init__(self, model_name, num_classes, freeze_bert=True):
        super(BertBilstmClassifier, self).__init__()
        # Specify hidden size of BERT, hidden size of our classifier, and number of labels
        D_in, H, D_out = 768, 128, num_classes
        # Instantiate BERT model
        self.bert = AutoModel.from_pretrained(model_name)
        # Instantiate an one-layer feed-forward classifier
        self.linear_relu = nn.Sequential(
            nn.Linear(2*H, H), # if BiLSTM
            # nn.Linear(H, H), # if LSTM
            nn.ReLU(),
            nn.Dropout(p=0.3),
            nn.Linear(H, H), 
            nn.ReLU(),
            nn.Dropout(p=0.3),
            nn.Linear(H, H)
        )
        # LSTM
        # self.bilstm = nn.LSTM(D_in, H, batch_first = False, bidirectional=False)
        # BiLSTM
        self.bilstm = nn.LSTM(D_in, H, batch_first = False, bidirectional=True, dropout=0.2, num_layers=3)
        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False        
        self.out = nn.Linear(H, D_out)
        
    def forward(self, input_ids, attention_mask):
        # Feed input to BERT
        bert_last_hidden_state = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask).last_hidden_state   # [Batch_size, max_length, 768]
        print(f'Bert shape: {bert_last_hidden_state.shape}')   

        output_lstm, (h_0, h_1) =  self.bilstm(bert_last_hidden_state)
        print(f'Output lstm shape: {output_lstm.shape}')    # [Batch_size, max_length, 2*H]
        x = output_lstm 
        attention_weights = torch.matmul(x, x.transpose(-1, -2))  # Compute attention weights
        attention_weights = torch.softmax(attention_weights, dim=-1)  # Apply softmax to get attention probabilities
        x = torch.matmul(attention_weights, x)  # Apply attention
        x = x[:, 0]  # Use the [CLS] token representation for aggregation  # [Batch_size, 2*H]
   
        print(f'CLS representation shape: {x.shape}')

        # Linear layers + dropout + relu 
        x = self.linear_relu(x)    # [Batch_size, H]
        print(f'Shape after linear + relu: {x.shape}')

        # residual connection
        #x = torch.cat([bigru_cnn.squeeze(dim=2), bilstm_cnn.squeeze(dim=2)], dim=1)

        # Feed x to classifier to compute logits
        logits = self.out(x)

        return logits

In [33]:
import torch.nn.functional as F

class BertBilstmClassifier(nn.Module):
    def __init__(self, model_name, num_classes, freeze_bert=True):
        super(BertBilstmClassifier, self).__init__()

        # Specify hidden size of BERT, hidden size of our classifier, and number of labels
        D_in, H, D_out, H_fc = 768, 128, num_classes, 32

        # Instantiate BERT model
        self.bert = AutoModel.from_pretrained(model_name)
        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False   

        # LSTM
        # self.bilstm = nn.LSTM(D_in, H, batch_first = False, bidirectional=False)
        # BiLSTM
        self.bilstm = nn.LSTM(D_in, H, batch_first = False, bidirectional=True, dropout=0.2, num_layers=3)

        # Conv Network
        self.conv1d_list = nn.ModuleList([
            nn.Conv1d(in_channels=768,
                      out_channels= [2, 2, 2][i],
                      kernel_size=[2, 3, 4][i])
            for i in range(len([2, 3, 4]))
        ])

        # 2 liner layers + activation
        self.linear_relu = nn.Sequential(
            nn.Linear(12, H_fc), 
            nn.ReLU(),
            nn.Dropout(p=0.3),
            nn.Linear(H_fc, H_fc), 
            nn.ReLU(),
            nn.Dropout(p=0.3),
            nn.Linear(H_fc, H_fc)
        )
        self.fc =  nn.Linear(256, 6)
        self.out = nn.Linear(H_fc, D_out)
        
    def forward(self, input_ids, attention_mask):
        # Feed input to BERT
        bert_last_hidden_state = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask).last_hidden_state   # [Batch_size, max_length, 768]

        # CNN
        bert_reshaped = bert_last_hidden_state.permute(0, 2, 1)
        # Apply CNN and ReLU. Output shape: (Batch_size, num_filters[i], L_out)
        x_conv_list = [F.relu(conv1d(bert_reshaped)) for conv1d in self.conv1d_list]
        # Max pooling. Output shape: (Batch_size, num_filters[i], 1)
        x_pool_list = [F.max_pool1d(x_conv, kernel_size=x_conv.shape[2])
            for x_conv in x_conv_list]
        # Concatenate x_pool_list to feed the fully connected layer.
        # Output shape: (Batch_size, sum(num_filters))
        x_cnn = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_pool_list],
                         dim=1)


        output_lstm, (h_0, h_1) =  self.bilstm(bert_last_hidden_state) # [Batch_size, max_length, 2*H]
        x = output_lstm 
        attention_weights = torch.matmul(x, x.transpose(-1, -2))  # Compute attention weights
        attention_weights = torch.softmax(attention_weights, dim=-1)  # Apply softmax to get attention probabilities
        x = torch.matmul(attention_weights, x)  # Apply attention
        x = x[:, 0]  # Use the [CLS] token representation for aggregation  # [Batch_size, 2*H]
        x_lstm = self.fc(x)

        cnn_lstm = torch.cat((x_cnn,x_lstm),dim=1)
        # Linear layers + dropout + relu 
        x = self.linear_relu(cnn_lstm)   # [Batch_size, H]

        # Feed x to classifier to compute logits
        logits = self.out(x)

        return logits

In [34]:
model = BertBilstmClassifier(model_name=model_name,num_classes=num_classes,freeze_bert=True).to(device)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [35]:
model(input_ids, attention_mask)

tensor([[ 0.0183, -0.1331, -0.0812, -0.0150,  0.0642],
        [ 0.0185, -0.1434, -0.0862, -0.0258,  0.0461],
        [ 0.0318, -0.1345, -0.0934, -0.0381,  0.0760],
        [ 0.0076, -0.1390, -0.0819, -0.0365,  0.0887]],
       grad_fn=<AddmmBackward0>)