In [1]:
!nvidia-smi

import os
import warnings
from IPython.display import clear_output

# Sets CUDA_LAUNCH_BLOCKING to "1". Per the CUDA documentation this makes
# kernel launches synchronous, which helps pinpoint failing calls.
# NOTE(review): presumably a debugging leftover; it usually slows training,
# so confirm it is still wanted for full runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# Installs an "ignore" filter for all Python warnings from here on
# (unless a later filter overrides it).
warnings.filterwarnings('ignore')

Mon Oct  2 19:55:06 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.161.03   Driver Version: 470.161.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import pandas as pd
import numpy as np
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score
from transformers import AutoModel, DebertaV2Config
from transformers.models.deberta.modeling_deberta import ContextPooler
# Fix the global random seed through Lightning's helper.
# The literal 56 equals CFG.seed, which is defined in a later cell.
pl.seed_everything(56)

56

In [3]:
class CFG:
    """Static settings shared by the dataset, the data module and the model.

    Attributes are read either directly (``CFG().lr``) or by handing the whole
    object to a component that looks fields up by name.
    """

    # --- general ---
    seed = 56
    wandb = False
    num_workers = 4

    # --- data ---
    train_path = '/kaggle/input/olympii0930/train.csv'
    test_path = '/kaggle/input/olympii0930/test.csv'
    val_split_size = 0.2
    batch_size = 16

    # --- encoder and classification head ---
    model = "ai-forever/ruElectra-medium"
    hidden_size = 576
    num_labels = 2

    # --- pooler (the model passes this whole config object to its pooler) ---
    pooler_hidden_size = 576
    pooler_hidden_act = 'gelu'
    pooler_dropout = 0.1

    # --- optimisation ---
    lr = 2e-5
    min_lr = 5e-6
    eps = 1e-6
    betas = (0.9, 0.999)
    weight_decay = 0.0
    scheduler = 'cosine'
    max_epoches = 15
    gradient_accumulation_steps = 1

In [4]:
class ReceiptsDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding ``(input_ids, attention_mask, label)`` per row.

    Args:
        df: frame with at least the columns ``text`` and ``label``.
    """

    def __init__(self, df):
        super().__init__()
        self.cfg = CFG()
        # Only the two consumed columns are kept, as an array of [text, label] rows.
        self.data = df[['text','label']].values
        self.embedings = []  # not read by any method of this class
        self.tokenizer = AutoTokenizer.from_pretrained(self.cfg.model)

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize row ``index`` on the fly, padded/truncated to 128 tokens."""
        row = self.data[index]
        text, label = row[0], row[1]
        encoded = self.tokenizer.encode_plus(
            text,
            padding='max_length',
            max_length=128,
            truncation=True,
            return_tensors='pt',
        )
        # [0] takes the first entry along the leading axis of each returned
        # tensor (presumably the batch axis added by return_tensors='pt').
        return encoded['input_ids'][0], encoded['attention_mask'][0], label

In [5]:
def make_df(data):
    """Build a ``text``/``label`` frame from an object exposing ``Text`` and ``Score``.

    ``label`` is 1 where ``Score`` equals the string "Positive" and 0 for
    every other value. Rows are paired positionally; the result gets a fresh
    default index.
    """
    records = [
        {'text': text, 'label': 1 if score == "Positive" else 0}
        for score, text in zip(data.Score, data.Text)
    ]
    return pd.DataFrame(records)
def prepare_text(x:str):
    """Normalise a text: trim surrounding whitespace, then lower-case it."""
    trimmed = x.strip()
    return trimmed.lower()

In [6]:
class ReceiptsDataModule(pl.LightningDataModule):
    """Loads the train/test CSVs and serves train, validation and predict loaders.

    Paths, split size, batch size and worker count are taken from ``CFG``.
    """

    def __init__(self,):
        super().__init__()
        self.cfg = CFG()
        self.train_dataset_path = self.cfg.train_path
        self.test_dataset_path = self.cfg.test_path
        self.val_split_size = self.cfg.val_split_size
        self.batch_size = self.cfg.batch_size
        self.num_workers = self.cfg.num_workers
        # Guards setup(): the split overwrites self.train_df, so repeating it
        # would split the already-split training frame again.
        self.is_setup = False

    def _load_frames(self):
        """Read both tab-separated CSVs into ``self.train_df`` / ``self.test_df``."""
        train_raw = pd.read_csv(self.train_dataset_path, delimiter='\t')
        self.train_df = make_df(train_raw)
        self.train_df['text'] = self.train_df['text'].apply(prepare_text)

        test_raw = pd.read_csv(self.test_dataset_path, delimiter='\t')
        # make_df reads a Score column; a constant is filled in so the same
        # helper can be used here (every test label therefore becomes 0).
        test_raw['Score'] = 0
        self.test_df = make_df(test_raw)
        self.test_df['text'] = self.test_df['text'].apply(prepare_text)

    def prepare_data(self):
        """Load and normalise the raw frames."""
        self._load_frames()

    def setup(self, stage: "str | None" = None):
        """Split train/validation and build the three datasets (runs once).

        Args:
            stage: accepted for Lightning compatibility; not used.
        """
        if self.is_setup:
            return None
        # Lightning does not call prepare_data() on every process, so load the
        # frames here when this instance has not seen them yet.
        if not (hasattr(self, 'train_df') and hasattr(self, 'test_df')):
            self._load_frames()
        self.train_df, self.val_df = train_test_split(
            self.train_df,
            test_size=self.val_split_size,
            random_state=self.cfg.seed,
        )
        self.train_dataset = ReceiptsDataset(self.train_df)
        self.val_dataset = ReceiptsDataset(self.val_df)
        self.predict_dataset = ReceiptsDataset(self.test_df)
        self.is_setup = True

    def _make_loader(self, dataset, shuffle=False):
        """Build a DataLoader with the configured batch size and worker count."""
        return torch.utils.data.DataLoader(dataset,
                                           batch_size=self.batch_size,
                                           num_workers=self.num_workers,
                                           shuffle=shuffle)

    def train_dataloader(self):
        """Shuffled loader over the training split."""
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Unshuffled loader over the validation split."""
        return self._make_loader(self.val_dataset)

    def predict_dataloader(self):
        """Unshuffled loader over the test set, so predictions keep file order."""
        return self._make_loader(self.predict_dataset)

In [1]:
class MeanPooling(nn.Module):
    """Mask-weighted mean of hidden states along axis 1."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Average ``last_hidden_state`` over axis 1, weighting by the mask.

        ``attention_mask`` gets a trailing axis and is expanded to the shape of
        ``last_hidden_state``; positions with mask 0 contribute nothing.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = (last_hidden_state * mask).sum(dim=1)
        # Clamp so a row with no unmasked positions divides by 1e-9, not 0.
        weight_total = mask.sum(dim=1).clamp(min=1e-9)
        return masked_total / weight_total
    
class ReceiptsModule(pl.LightningModule):
    """Text classifier: pretrained encoder -> ContextPooler -> linear head.

    Batches are ``(input_ids, attention_mask, label)`` triples, the order in
    which ``ReceiptsDataset.__getitem__`` returns them. Validation predictions
    are accumulated over an epoch and scored with ``2 * accuracy - 1``.

    Attributes:
        get_features: when True, ``predict_step`` returns pooled features
            instead of predicted class ids.
    """

    def __init__(self,):
        super().__init__()
        self.cfg = CFG()
        self.bert_encoder = AutoModel.from_pretrained(self.cfg.model)
        # NOTE(review): cfg.hidden_size presumably has to equal the width of
        # the pooler output; confirm against the chosen encoder.
        self.mlp = nn.Linear(self.cfg.hidden_size,self.cfg.num_labels)
        # The pooler reads its settings from the config object it is given.
        self.pool = ContextPooler(self.cfg)
        self.criterion = nn.CrossEntropyLoss()
        # Per-epoch accumulators, cleared in on_validation_epoch_end.
        self.val_targets = []
        self.val_preds = []
        self.get_features = False

    def get_textual_features(self,input_ids,attention_mask):
        """Return the encoder's ``last_hidden_state`` for the given inputs."""
        return self.bert_encoder(input_ids=input_ids,
                                 attention_mask=attention_mask).last_hidden_state

    def _pooled_features(self, x1, x2):
        """Encode the batch and pool it; the pooler takes hidden states only."""
        return self.pool(self.get_textual_features(x1,x2))

    def forward(self, x1,x2):
        """Return class logits for ``x1`` (input ids) and ``x2`` (attention mask)."""
        return self.mlp(self._pooled_features(x1,x2))

    def training_step(self, batch, _):
        """Cross-entropy loss for one batch; also sent to the logger."""
        x1,x2, targets = batch
        logits = self(x1,x2)
        loss = self.criterion(logits, targets)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, _):
        """Store targets and arg-max predictions for the epoch-level metric."""
        x1,x2, targets = batch
        predicted = self(x1,x2).argmax(dim=-1).cpu().detach().tolist()
        self.val_targets += targets.tolist()
        self.val_preds += predicted

    def predict_step(self, batch, _):
        """Return predicted class ids, or pooled features if ``get_features``."""
        x1,x2, _ = batch
        if not self.get_features:
            return self(x1,x2).argmax(dim=-1).cpu().detach().tolist()
        # Pool exactly as forward() does. The previous call also passed the
        # attention mask, a signature that only the unused MeanPooling has.
        return self._pooled_features(x1,x2).cpu().detach().tolist()

    def calc_metric(self):
        """Return ``2 * accuracy - 1`` over the accumulated validation results."""
        return 2*accuracy_score(self.val_targets,self.val_preds) -1

    def on_validation_epoch_end(self):
        """Print and log the validation metric, then clear the accumulators."""
        metric = self.calc_metric()
        print(metric)
        self.log("val_metric", metric)
        self.val_targets, self.val_preds = [],[]

    def configure_optimizers(self):
        """AdamW over all parameters with lr, weight decay and betas from CFG."""
        return torch.optim.AdamW(self.parameters(),
                                 self.cfg.lr,
                                 weight_decay=self.cfg.weight_decay,
                                 betas = self.cfg.betas
                                )

NameError: name 'nn' is not defined

In [8]:
# Load the CSVs and build the datasets eagerly.
dm = ReceiptsDataModule()
dm.prepare_data()
# setup() is annotated `stage: str`; pass a stage name rather than the int 0.
dm.setup("fit")

Downloading (…)lve/main/config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/909k [00:00<?, ?B/s]

In [10]:
# Instantiate the LightningModule; its constructor loads the pretrained
# encoder named by CFG.model.
model = ReceiptsModule()

Downloading pytorch_model.bin:   0%|          | 0.00/356M [00:00<?, ?B/s]

Some weights of the model checkpoint at ai-forever/ruElectra-medium were not used when initializing ElectraModel: ['generator.encoder.layer.7.attention.self.query.weight', 'generator.encoder.layer.10.output.dense.bias', 'generator.encoder.layer.3.attention.output.dense.bias', 'generator.encoder.layer.2.attention.self.value.weight', 'generator.encoder.layer.9.attention.output.LayerNorm.bias', 'generator.encoder.layer.1.output.LayerNorm.bias', 'generator.encoder.layer.0.attention.self.value.weight', 'generator.encoder.layer.5.attention.output.LayerNorm.bias', 'generator.embeddings_project.weight', 'generator.encoder.layer.8.attention.output.LayerNorm.bias', 'generator.encoder.layer.7.attention.self.query.bias', 'generator.encoder.layer.6.output.dense.bias', 'generator.encoder.layer.8.intermediate.dense.bias', 'generator.encoder.layer.3.intermediate.dense.bias', 'generator.encoder.layer.0.attention.output.LayerNorm.weight', 'generator.encoder.layer.8.output.dense.weight', 'generator.encod

In [11]:
# TensorBoard event files are written under tb_logs/text_cls.
logger = pl.loggers.TensorBoardLogger("tb_logs", name="text_cls")
# Epoch count and gradient accumulation come from CFG so the config cell stays
# the single source of truth (values unchanged: 15 epochs, accumulation of 1).
trainer = pl.Trainer(
    accelerator="gpu",
    logger=logger,
    max_epochs=CFG.max_epoches,
    accumulate_grad_batches=CFG.gradient_accumulation_steps,
    log_every_n_steps=1
)

In [12]:
# Train and validate. The model's on_validation_epoch_end prints the
# validation metric (2 * accuracy - 1) after every validation run.
trainer.fit(model, datamodule=dm)

Sanity Checking: 0it [00:00, ?it/s]

0.125


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

0.7664285714285715


Validation: 0it [00:00, ?it/s]

0.7964285714285715


Validation: 0it [00:00, ?it/s]

0.7885714285714285


Validation: 0it [00:00, ?it/s]

0.7849999999999999


Validation: 0it [00:00, ?it/s]

0.7757142857142858


Validation: 0it [00:00, ?it/s]

0.7614285714285713


Validation: 0it [00:00, ?it/s]

0.792142857142857


Validation: 0it [00:00, ?it/s]

0.7985714285714285


Validation: 0it [00:00, ?it/s]

0.8042857142857143


Validation: 0it [00:00, ?it/s]

0.8085714285714285


Validation: 0it [00:00, ?it/s]

0.8135714285714286


Validation: 0it [00:00, ?it/s]

0.7835714285714286


Validation: 0it [00:00, ?it/s]

0.8042857142857143


Validation: 0it [00:00, ?it/s]

0.7857142857142858


Validation: 0it [00:00, ?it/s]

0.7907142857142857


In [12]:
# Run inference over the test loader. predict_step returns a list of class ids
# per batch (get_features is False), so `preds` is presumably a list of
# per-batch lists.
preds = trainer.predict(model,dm.predict_dataloader())

Predicting: 0it [00:00, ?it/s]

In [16]:
# Join the per-batch predictions into one flat array.
# NOTE(review): this rebinds `preds`, so the cell presumably cannot be re-run
# without first re-running the predict cell. Confirm before reordering cells.
preds = np.concatenate(preds)

In [17]:
# Display the predictions as the cell's output.
preds

0       Positive
1       Negative
2       Negative
3       Negative
4       Negative
          ...   
5995    Positive
5996    Positive
5997    Negative
5998    Positive
5999    Positive
Length: 6000, dtype: object

In [15]:
# Map class ids to the label strings used in the submission (1 -> "Positive",
# anything else -> "Negative"). Values that are already strings pass through
# unchanged: this cell rebinds `preds`, and without the guard a second run
# would turn every prediction into "Negative".
preds = pd.Series(preds).apply(
    lambda x: x if isinstance(x, str) else ("Positive" if x == 1 else "Negative")
)

In [18]:
# The file is tab-separated. The delimiter is written as the '\t' escape
# because a literal TAB inside the quotes is invisible and is easily turned
# into spaces by an editor or formatter.
sample_submit = pd.read_csv('/kaggle/input/olympii0930/sample_submission.csv', delimiter='\t')

In [19]:
# Display the sample submission to check its columns (idx, Score).
sample_submit

Unnamed: 0,idx,Score
0,13999,Positive
1,14000,Positive
2,14001,Positive
3,14002,Positive
4,14003,Positive
...,...,...
5995,19994,Positive
5996,19995,Positive
5997,19996,Positive
5998,19997,Positive


In [20]:
import csv

# zip() stops at the shorter input, so a length mismatch would silently drop
# rows from the submission. Fail loudly instead.
if len(sample_submit) != len(preds):
    raise ValueError(
        f"sample submission has {len(sample_submit)} rows but there are {len(preds)} predictions"
    )

# Tab-separated file with a header row; ids come from the sample submission.
with open('submitt.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter='\t')
    writer.writerow(['idx','Score'])
    writer.writerows(zip(sample_submit.idx, preds))

In [21]:
# Read back the file that was just written, using the same relative path as
# the writer so the check does not depend on the working directory being
# /kaggle/working.
pd.read_csv('submitt.csv', delimiter='\t')

Unnamed: 0,idx,Score
0,13999,Positive
1,14000,Negative
2,14001,Negative
3,14002,Negative
4,14003,Negative
...,...,...
5995,19994,Positive
5996,19995,Positive
5997,19996,Negative
5998,19997,Positive
