In [1]:
from pathlib import Path
import gc
from datetime import datetime

from transformers import AutoTokenizer, AutoModel
import torch
from torch import nn
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
from transformers import PreTrainedModel, AutoConfig
from tqdm.notebook import tqdm

import dataset

# DEVICE = torch.device("cpu")
DEVICE = torch.device("cuda:0")

# pretrained_model_name = "DeepPavlov/distilrubert-tiny-cased-conversational-v1"
# pretrained_model_name = "cointegrated/rubert-tiny"
pretrained_model_name = "cointegrated/rubert-tiny2"
# pretrained_model_name = "DeepPavlov/rubert-base-cased"


In [2]:
allowed_labels = list(set(Path("./allowedLabels.txt").read_text().lower().split("\n")))

labels_encoder = preprocessing.LabelEncoder()
labels_encoder.fit(allowed_labels)

orig_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)

In [4]:
columns = pd.read_csv("../columns.csv", sep="<")["column"].astype(str).to_list()
tokenizer = orig_tokenizer.train_new_from_iterator([columns], 90000)







In [5]:
import importlib
importlib.reload(dataset)

paths = list(Path("../filteredData2").glob("./*/*.csv"))
train_paths, val_paths = train_test_split(paths, test_size=0.20,
                                          random_state=42)

train_dataloader = dataset.Tables(train_paths, tokenizer, labels_encoder, use_rand=True).create_dataloader(
    batch_size=80,
    shuffle=True)
val_dataloader = dataset.Tables(
    val_paths, tokenizer, labels_encoder).create_dataloader(batch_size=100)

columns_dataloader = dataset.Columns("../columns.csv", tokenizer, labels_encoder).create_dataloader(batch_size=50, shuffle=True)


In [6]:
class Model(PreTrainedModel):
    def __init__(self, config, labels_number):
        super().__init__(config)
        self.labels_number = labels_number
        self.bert = AutoModel.from_pretrained(pretrained_model_name)
        self.bert.resize_token_embeddings(tokenizer.vocab_size)
        self.dropout = nn.Dropout(p=0.2)
        self.tanh = nn.Tanh()
        self.linear = nn.Linear(self.bert.config.hidden_size, labels_number)
        self.linear.weight.data.uniform_(0.0, 1.0)

    def forward(self, input_ids):
        output = self.bert(input_ids=input_ids,
                           return_dict=False)[0]
        output = self.dropout(output)
        output = self.tanh(output)
        output = self.linear(output)
        output = output.squeeze(0)

        if len(output.shape) == 2:
            output = output.unsqueeze(0)

        cls_ids = torch.nonzero(input_ids == tokenizer.cls_token_id)
        filtered_logits = torch.zeros(cls_ids.shape[0], output.shape[2])

        for n in range(cls_ids.shape[0]):
            i, j = cls_ids[n]
            filtered_logits[n] = output[i, j, :]

        return filtered_logits


config = AutoConfig.from_pretrained(pretrained_model_name)

config.update({"hidden_dropout_prob": 0.2,
               "layer_norm_eps": 1e-7})

model = Model(config, labels_number=len(allowed_labels)).to(DEVICE)


Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
def loss_fn(logits, targets):
    criterion = nn.CrossEntropyLoss()
    return criterion(logits, targets)


In [8]:
def train_epoch(model, data_loader, loss_eval, optimizer, scheduler):
    model.train()
    losses = []
    for batch, idx in zip(tqdm(data_loader), range(len(data_loader))):
        input_ids = batch["input_ids"].to(DEVICE)
        targets = batch["labels"]
        outputs = model(input_ids=input_ids)
        loss = loss_eval(outputs, targets)
        losses.append(loss.item())
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        if idx % 100 == 0:
            print(np.mean(losses), loss.item())
    scheduler.step()
    return np.mean(losses)


def eval_model(model, data_loader, loss_eval):
    model.eval()

    true_labels = []
    predicted_labels = []

    losses = []
    with torch.no_grad():
        for batch in tqdm(data_loader):
            input_ids = batch["input_ids"].to(DEVICE)
            targets = batch["labels"]
            outputs = model(input_ids=input_ids)
            loss = loss_eval(outputs, targets)
            losses.append(loss.item())

            targets = targets.cpu()
            true_labels += list(targets[targets != -1])
            predicted_labels += nn.Softmax(dim=1)(outputs.cpu()).argmax(
                dim=1).tolist()


    return np.mean(losses), f1_score(true_labels, predicted_labels, average='macro')


In [34]:
val_loss, f1_macro = eval_model(model,
                      val_dataloader,
                      loss_fn)

gc.collect()
torch.cuda.empty_cache()
print(f'Val loss: {round(val_loss, 4)}')
print(f'F1 macro: {round(f1_macro, 4)}\n')


  0%|          | 0/98 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Val loss: 7.6468
F1 macro: 0.0012



In [14]:
# optimizer = AdamW(model.parameters(), lr=1e-6, weight_decay=1e-4, eps=1e-8)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.7)

In [15]:
# from collections import defaultdict

# history = defaultdict(list)

# gc.collect()
# torch.cuda.empty_cache()

# for epoch in range(1):
#     print(f'Epoch: {epoch + 1}')
#     print('-' * 10)
#     # TRAIN
#     train_loss = train_epoch(model,
#                              columns_dataloader,
#                              loss_fn,
#                              optimizer,
#                              scheduler)

#     gc.collect()
#     torch.cuda.empty_cache()
#     print(f'Train loss: {round(train_loss, 4)}\n')
#     torch.save(model.state_dict(), f'./checkpoints/{datetime.now().strftime("%Y-%m-%d %H-%M-%S")}')
#     # -----------------------------------------------------
#     #     EVAL
#     val_loss, f1_macro = eval_model(model,
#                           val_dataloader,
#                           loss_fn)

#     gc.collect()
#     torch.cuda.empty_cache()
#     print(f'Val loss: {round(val_loss, 4)}')
#     print(f'F1 macro: {round(f1_macro, 4)}\n')
#     history['val_loss'].append(val_loss)

#     # ------------------------------------------------------
#     history['train_loss'].append(train_loss)



Epoch: 1
----------


  0%|          | 0/2606 [00:00<?, ?it/s]

6.707839488983154 6.707839488983154
6.8604806720620335 6.489990711212158
6.540790541255059 5.645623207092285
6.280478862432942 5.433619976043701
6.062109779538656 5.1800360679626465
5.879879216234127 3.914008378982544
5.728255384178606 4.6069536209106445
5.587062860861654 4.717736721038818
5.464546987924088 4.528722763061523
5.348521379995822 4.1830854415893555
5.246956463460322 3.6162164211273193
5.145872149870247 4.555241584777832
5.064379196182873 5.033581733703613
4.986481521974428 3.9801628589630127
4.910168980462989 4.26601505279541
4.845529437462224 3.9654064178466797
4.780593445269783 3.710418701171875
4.720501804828364 3.855602502822876
4.663536426161873 3.799208164215088
4.607157821093403 3.187418222427368
4.556068330809571 3.729335069656372
4.505960246258154 3.5307412147521973
4.460263527073355 3.370115041732788
4.413795658320461 3.5727264881134033
4.3706481054592805 3.6931838989257812
4.327967568713634 2.8654513359069824
4.2907138421690405 3.5085361003875732
Train loss: 4.2

  0%|          | 0/40 [00:00<?, ?it/s]

Val loss: 4.1717
F1 macro: 0.0125



In [9]:
# for param in model.bert.parameters():
#     param.requires_grad = True

optimizer = AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4, eps=1e-8)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.7)

In [10]:
from collections import defaultdict

history = defaultdict(list)

gc.collect()
torch.cuda.empty_cache()

for epoch in range(8):
    print(f'Epoch: {epoch + 1}')
    print('-' * 10)
    # TRAIN
    train_loss = train_epoch(model,
                             train_dataloader,
                             loss_fn,
                             optimizer,
                             scheduler)

    gc.collect()
    torch.cuda.empty_cache()
    print(f'Train loss: {round(train_loss, 4)}\n')
    torch.save(model.state_dict(), f'./checkpoints/{datetime.now().strftime("%Y-%m-%d %H-%M-%S")}')
    # -----------------------------------------------------
    #     EVAL
    val_loss, f1_macro = eval_model(model,
                          val_dataloader,
                          loss_fn)

    gc.collect()
    torch.cuda.empty_cache()
    print(f'Val loss: {round(val_loss, 4)}')
    print(f'F1 macro: {round(f1_macro, 4)}\n')
    history['val_loss'].append(val_loss)

    # ------------------------------------------------------
    history['train_loss'].append(train_loss)



Epoch: 1
----------


  0%|          | 0/489 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
8.463425636291504 8.463425636291504
3.1125778644391806 1.8406994342803955
2.302556783702243 1.1832391023635864
1.8777047603629355 0.7698885202407837
1.6142128251883157 0.512525200843811
Train loss: 1.4451



  0%|          | 0/98 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Val loss: 0.5492
F1 macro: 0.4375

Epoch: 2
----------


  0%|          | 0/489 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
0.5614535212516785 0.5614535212516785
0.4608249482837054 0.41856393218040466
0.4215314002772469 0.32090625166893005
0.40052660596727135 0.28796881437301636
0.3791179072083975 0.2633626461029053
Train loss: 0.3681



  0%|          | 0/98 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Val loss: 0.2753
F1 macro: 0.6471

Epoch: 3
----------


  0%|          | 0/489 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
0.22877277433872223 0.22877277433872223
0.19184327490701533 0.16863662004470825
0.18310262896676563 0.16169458627700806
0.18210331865422352 0.21398389339447021
0.1794204004201806 0.18831336498260498
Train loss: 0.1763



  0%|          | 0/98 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Val loss: 0.2185
F1 macro: 0.7634

Epoch: 4
----------


  0%|          | 0/489 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
0.127446249127388 0.127446249127388
0.11119942344946436 0.08212170004844666
0.10664195607217093 0.051933035254478455
0.10376864614156987 0.21428291499614716
0.10230658192215418 0.04335680603981018
Train loss: 0.1028



  0%|          | 0/98 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Val loss: 0.2169
F1 macro: 0.8167

Epoch: 5
----------


  0%|          | 0/489 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
0.08745516836643219 0.08745516836643219
0.07133479608167516 0.1062089055776596
0.07203487473173965 0.0770566463470459
0.06728235252690425 0.037867702543735504
0.06796358617840154 0.041149210184812546
Train loss: 0.0673



  0%|          | 0/98 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Val loss: 0.2015
F1 macro: 0.8594

Epoch: 6
----------


  0%|          | 0/489 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
0.032790251076221466 0.032790251076221466
0.052680370185782414 0.04514368623495102
0.051887175707095556 0.029131295159459114
0.05076744633864463 0.010911066085100174
0.050664014428879536 0.009743510745465755
Train loss: 0.0498



  0%|          | 0/98 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Val loss: 0.2011
F1 macro: 0.8636

Epoch: 7
----------


  0%|          | 0/489 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
0.03381166234612465 0.03381166234612465
0.04435122543761488 0.031048865988850594
0.040258183993111867 0.04529630020260811
0.038529433004490014 0.015459532849490643
0.03643706762998312 0.029008954763412476
Train loss: 0.0368



  0%|          | 0/98 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Val loss: 0.2005
F1 macro: 0.8803

Epoch: 8
----------


  0%|          | 0/489 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
0.012739848345518112 0.012739848345518112
0.03539957270420405 0.014473733492195606
0.03351301033674178 0.026404788717627525
0.031730751495574756 0.0011600067373365164
0.031104278572030283 0.0207356009632349
Train loss: 0.0305



  0%|          | 0/98 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Val loss: 0.2003
F1 macro: 0.8901



In [11]:
from collections import defaultdict

history = defaultdict(list)

gc.collect()
torch.cuda.empty_cache()

for epoch in range(1):
    print(f'Epoch: {epoch + 1}')
    print('-' * 10)
    # TRAIN
    train_loss = train_epoch(model,
                             train_dataloader,
                             loss_fn,
                             optimizer,
                             scheduler)

    gc.collect()
    torch.cuda.empty_cache()
    print(f'Train loss: {round(train_loss, 3)}\n')
    torch.save(model.state_dict(), f'./checkpoints/{datetime.now().strftime("%Y-%m-%d %H-%M-%S")}')
    # -----------------------------------------------------
    #     EVAL
    val_loss, f1_macro = eval_model(model,
                          val_dataloader,
                          loss_fn)

    gc.collect()
    torch.cuda.empty_cache()
    print(f'Val loss: {round(val_loss, 4)}')
    print(f'F1 macro: {round(f1_macro, 4)}\n')
    history['val_loss'].append(val_loss)

    # ------------------------------------------------------
    history['train_loss'].append(train_loss)



Epoch: 1
----------


  0%|          | 0/489 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
0.016507921740412712 0.016507921740412712
0.02641747104543455 0.0489533431828022
0.02447485111871577 0.007935930974781513
0.024369739701095783 0.058784909546375275
0.026427221185170078 0.01152932271361351
Train loss: 0.026



  0%|          | 0/98 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Val loss: 0.2074
F1 macro: 0.8914



***

In [60]:

MAX_TOKENS_PER_COLUMN = 200  # 2 of those for CLS and SEP
MAX_COLUMNS = 6
MAX_TOKENS = 200  # per table

true_labels = []
predicted_labels = []



num = np.random.randint(0, len(val_paths))
print(num)

df = pd.read_csv(val_paths[num], sep="|")
tokens = []
with open(val_paths[num]) as file:
    labels = file.readline().lower().rstrip('\n').split("|")

# print(labels)
assert len(labels) == len(df.columns)

columns = df.columns[:MAX_COLUMNS]
labels = labels[:MAX_COLUMNS]

tokens_per_column = min(
    MAX_TOKENS // len(labels), MAX_TOKENS_PER_COLUMN)

for label, _ in zip(df.columns, range(MAX_COLUMNS)):
    str_repr_of_column = df[label].astype(str).str.cat(sep=" ")
    tokens += tokenizer(str_repr_of_column, truncation=True,
                                max_length=tokens_per_column).input_ids

labels = labels_encoder.transform(labels)[:MAX_COLUMNS]

# for label, _ in zip(df.columns, range(MAX_COLUMNS)):
#     str_repr_of_column = df[label].astype(str).str.cat(sep=" ")[:40]
#     tokens += tokenizer(str_repr_of_column, truncation=True,
#                         padding='max_length', max_length=10).input_ids

# tokens = [np.pad(tokens, (0, MAX_TOKENS), 'constant')[:MAX_TOKENS]]
# labels = labels_encoder.transform(labels)

tokens = torch.tensor([tokens]).to(DEVICE)

# print(tokens)

result = nn.Softmax(dim=1)(model(tokens)).argmax(dim=1)

predicted_labels += result.tolist()
true_labels += list(labels)
print("pred:", labels_encoder.inverse_transform(result))
print("true:", labels_encoder.inverse_transform(labels))
# print(model(tokens).shape)
#


# print(tokens)

# print(model(tokens))


7099


RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasLtMatmul( ltHandle, computeDesc.descriptor(), &alpha_val, mat1_ptr, Adesc.descriptor(), mat2_ptr, Bdesc.descriptor(), &beta_val, result_ptr, Cdesc.descriptor(), result_ptr, Cdesc.descriptor(), &heuristicResult.algo, workspace.data_ptr(), workspaceSize, at::cuda::getCurrentCUDAStream())`

In [25]:
from collections import Counter

def find_problematic_labels(model, data_loader):
    model.eval()

    true_labels = []
    predicted_labels = []

    with torch.no_grad():
        for batch in tqdm(data_loader):
            input_ids = batch["input_ids"].to(DEVICE)
            targets = batch["labels"]
            outputs = model(input_ids=input_ids)

            targets = targets.cpu()
            true_labels += list(targets[targets != -1])
            predicted_labels += nn.Softmax(dim=1)(outputs.cpu()).argmax(
                dim=1).tolist()


    c = Counter()
    for x, y in zip(list(true_labels), predicted_labels):
        if x.item() != y:
            c[( labels_encoder.inverse_transform([x.item()])[0],  labels_encoder.inverse_transform([y])[0])] += 1

    return c

counter = find_problematic_labels(model, val_dataloader)


  0%|          | 0/66 [00:00<?, ?it/s]

In [26]:
counter.most_common(30)


[(('актёр', 'роль'), 48),
 (('название', 'песня'), 27),
 (('название', 'альбом'), 26),
 (('продюсер', 'режиссёр'), 25),
 (('роль', 'актёр'), 25),
 (('продюсер', 'сценарист'), 23),
 (('название', 'фильм'), 20),
 (('персонаж', 'роль'), 19),
 (('песня', 'название'), 19),
 (('название', 'русское название'), 17),
 (('режиссёр', 'сценарист'), 15),
 (('номер', 'место'), 15),
 (('команда', 'страна'), 14),
 (('альбом', 'название'), 13),
 (('сценарист', 'режиссёр'), 12),
 (('место', 'номер'), 12),
 (('ссылка', 'источник'), 11),
 (('годы', 'год'), 11),
 (('фильм', 'название'), 11),
 (('режиссёр', 'роль'), 11),
 (('спортсмен', 'имя'), 11),
 (('позиция', 'место'), 9),
 (('актёр', 'сценарист'), 9),
 (('население', 'население, человек'), 8),
 (('дата', 'год'), 8),
 (('клуб', 'команда'), 8),
 (('место', 'позиция'), 8),
 (('название', 'описание'), 8),
 (('гражданство', 'страна'), 7),
 (('сезон', 'год'), 7)]

In [27]:
train_counter = find_problematic_labels(model, train_dataloader)


  0%|          | 0/435 [00:00<?, ?it/s]

In [28]:
train_counter.most_common(30)


[(('продюсер', 'сценарист'), 84),
 (('продюсер', 'режиссёр'), 56),
 (('режиссёр', 'сценарист'), 51),
 (('персонаж', 'роль'), 48),
 (('сценарист', 'режиссёр'), 45),
 (('место', 'номер'), 40),
 (('ссылка', 'источник'), 27),
 (('название', 'фильм'), 25),
 (('актёр', 'сценарист'), 24),
 (('место', 'позиция'), 24),
 (('актёр', 'роль'), 23),
 (('имя', 'игрок'), 18),
 (('название', 'песня'), 16),
 (('название', 'альбом'), 16),
 (('клуб', 'команда'), 14),
 (('автор', 'режиссёр'), 14),
 (('роль', 'актёр'), 14),
 (('дата', 'ссылка'), 13),
 (('команда', 'город'), 12),
 (('песня', 'название'), 12),
 (('позиция', 'место'), 11),
 (('команда', 'страна'), 11),
 (('спортсмен', 'имя'), 11),
 (('дата', 'год'), 11),
 (('страна', 'источник'), 10),
 (('источник', 'ссылка'), 10),
 (('режиссёр', 'роль'), 10),
 (('время', 'место'), 10),
 (('актёр', 'режиссёр'), 10),
 (('гражданство', 'дата рождения'), 10)]

In [218]:
#  print(val_paths[9616])

../filteredData2/4312554/table_12.csv


In [244]:
for num in range(0, len(val_paths)):

    df = pd.read_csv(val_paths[num], sep="|")

    tokens = []
    with open(val_paths[num]) as file:
        labels = file.readline().lower().rstrip('\n').split("|")

    if 'возраст' in labels:
        print(num)
        print(labels)
        # print(df.head())

        str_repr_of_column = df['Возраст'].astype(str).str.cat(sep=" ")[:200]
        print(str_repr_of_column)
        print(tokenizer.tokenize(str_repr_of_column, truncation=True, max_length=28))


# print(val_paths[20])
# df = pd.read_csv(val_paths[20], sep="|")
#
# print(df.head())




# print(tokenizer.tokenize("9.4 1.1 1.0 1:1", truncation=True, max_length=28))

# assert len(labels) == len(df.columns)
#
# columns = df.columns[:MAX_COLUMNS]
# labels = labels[:MAX_COLUMNS]
#
#
#
# TOKENS_PER_COLUMN = MAX_TOKENS // len(labels)
#
# for label, _ in zip(df.columns, range(MAX_COLUMNS)):
#     str_repr_of_column = df[label].astype(str).str.cat(sep=" ")[:200]
#     tokens += tokenizer(str_repr_of_column, truncation=True,
#                         max_length=TOKENS_PER_COLUMN).input_ids
#
# labels = labels_encoder.transform(labels)[:MAX_COLUMNS]


5388
['позиция', 'игрок', 'команда', 'возраст', 'рост']
26 24 25 26 24 23 21 19 25 27 20 24 22 30
['26', '24', '25', '26', '24', '23', '21', '19', '25', '27', '20', '24', '22', '30']
5395
['игрок', 'возраст']
16 17 17 18 18 18 18 19 19 19 19 19 19 19 19 19 19 29 20 20
['16', '17', '17', '18', '18', '18', '18', '19', '19', '19', '19', '19', '19', '19', '19', '19', '19', '29', '20', '20']
8936
['имя', 'возраст', 'гражданство']
8 лет 23 года 29 лет
['8', 'лет', '23', 'года', '29', 'лет']
9132
['имя', 'возраст']
29 50 35 21 11 44 24 41 17 38 51 45 35 24 20 21 50 31 58 22 28 20 38 38 31 50 38 49 ? 43 28 49 47 55 41 23 32 28
['29', '50', '35', '21', '11', '44', '24', '41', '17', '38', '51', '45', '35', '24', '20', '21', '50', '31', '58', '22', '28', '20', '38', '38', '31', '50', '38', '49']
9142
['год', 'возраст', 'место проведения', 'место']
22 26
['22', '26']
9448
['номер', 'город', 'имя', 'возраст']
20 19 22 22 19 22 19 19 22 22 18 21 20 20 21 23 22 18 19 21 21 23 20 22 18 21 23 23 19 20 