In [1]:
!pip install evaluate --quiet

In [2]:
import datasets
import numpy as np
from transformers import BertTokenizerFast
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from transformers import AutoTokenizer
import pandas as pd
from sklearn.model_selection import train_test_split
import ast
import evaluate

2024-06-16 13:20:55.039995: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-16 13:20:55.040151: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-16 13:20:55.165585: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Load the training data; df_nofull keeps only the rows whose
# target_labels_positions is not the literal string '{}'.
df = pd.read_csv('/kaggle/input/hack-samolet/train_data.csv')
df_nofull = df.loc[df['target_labels_positions'] != '{}']

In [4]:
# Hold out 10% of the labeled rows as the validation set (fixed seed).
train_nofull, val = train_test_split(df_nofull, test_size=0.1, random_state=42)

In [5]:
# Full training set: every row of df except the rows held out for validation.
train_full = df.drop(val.index)

In [6]:
# Split each text on single spaces and discard the original row index so every
# split is addressed positionally from 0.
tokens_nofull = train_nofull['processed_text'].str.split(' ').reset_index(drop=True)
tokens_val = val['processed_text'].str.split(' ').reset_index(drop=True)
tokens_full = train_full['processed_text'].str.split(' ').reset_index(drop=True)
tokens_full

0       [аа, союзная, тридцать, пять, дробь, один, лар...
1       [аа, приложение, мне, показывает, к, оплате, у...
2       [а, что, добрый, день, NAME, у, меня, пришел, ...
3       [у, меня, западный, с, утра, да, да, еще, да, ...
4       [NAME, ну, а, по, поводу, ипотеки, по, моему, ...
                              ...                        
3343    [а, доброе, утро, меня, заинтересовала, ваш, п...
3344    [здравствуйте, меня, зовут, иван, я, бы, хотел...
3345    [целенаправлен, на, голосовой, почтовый, ящик,...
3346    [NAME, зовут, NAME, я, хотела, уточнить, ээ, с...
3347    [далее, здравствуйте, NAME, зовут, так, хотел,...
Name: processed_text, Length: 3348, dtype: object

In [7]:
# Parse the stringified {label: [positions]} dicts and re-index each split from 0.
dicts_full = train_full['target_labels_positions'].apply(ast.literal_eval).reset_index(drop=True)
dicts_val = val['target_labels_positions'].apply(ast.literal_eval).reset_index(drop=True)
dicts_nofull = train_nofull['target_labels_positions'].apply(ast.literal_eval).reset_index(drop=True)

In [8]:
def create_tags(tokens, dicts):
    """Build one tag list per text from {label: [positions]} dicts.

    Args:
        tokens: pandas Series whose elements are token lists.
        dicts: iterable of dicts mapping a label to the token positions
            carrying that label, aligned with ``tokens``.

    Returns:
        A list of lists of labels. Each inner list has one entry per token,
        defaulting to 'O' for positions not mentioned in the dict.
    """
    lengths = tokens.apply(len)
    all_tags = []

    for n_tokens, label_positions in zip(lengths, dicts):
        row = ['O'] * n_tokens
        for tag, positions in label_positions.items():
            for position in positions:
                row[position] = tag
        all_tags.append(row)

    return all_tags

In [9]:
# Pair every token list with its per-token tag list, one frame per split.
final_set_full = pd.DataFrame({'tokens': tokens_full.values, 'tags': create_tags(tokens_full, dicts_full)})
final_set_nofull = pd.DataFrame({'tokens': tokens_nofull.values, 'tags': create_tags(tokens_nofull, dicts_nofull)})
final_set_val = pd.DataFrame({'tokens': tokens_val.values, 'tags': create_tags(tokens_val, dicts_val)})

In [10]:
# Preview the first rows of the full training frame.
final_set_full.head()

Unnamed: 0,tokens,tags
0,"[аа, союзная, тридцать, пять, дробь, один, лар...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"[аа, приложение, мне, показывает, к, оплате, у...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"[а, что, добрый, день, NAME, у, меня, пришел, ...","[O, O, O, O, O, O, O, O, O, O, O, O, B-discoun..."
3,"[у, меня, западный, с, утра, да, да, еще, да, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[NAME, ну, а, по, поводу, ипотеки, по, моему, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [11]:
# Label vocabulary. A label's position in this list is the integer class id
# used when tags are converted to ids and when predictions are decoded.
label_list = ['O', 'I-value', 'B-value', 'B-discount']
label_list

['O', 'I-value', 'B-value', 'B-discount']

In [12]:
# Dataset built from the full training frame plus the shared validation frame.
dataset_hf_full = datasets.DatasetDict(
    {'train': datasets.Dataset.from_pandas(final_set_full),
     'val': datasets.Dataset.from_pandas(final_set_val)}
)
dataset_hf_full

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 3348
    })
    val: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 51
    })
})

In [13]:
# Dataset built from the labeled-only training frame plus the same validation frame.
dataset_hf_nofull = datasets.DatasetDict(
    {'train': datasets.Dataset.from_pandas(final_set_nofull),
     'val': datasets.Dataset.from_pandas(final_set_val)}
)
dataset_hf_nofull

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 452
    })
    val: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 51
    })
})

In [14]:
# Tokenizer of the same checkpoint that the token-classification models are loaded from.
tokenizer = AutoTokenizer.from_pretrained('sergeyzh/rubert-mini-sts')

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.41M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

In [15]:
def tokenize_and_align_labels(example, label_all_tokens = True):
    """Tokenize a batch of pre-split texts and align word tags to sub-tokens.

    Uses the module-level ``tokenizer`` and ``label_list``.

    Args:
        example: batch with 'tokens' (list of word lists) and 'tags'
            (list of per-word label lists).
        label_all_tokens: when True, every sub-token of a word receives the
            word's label; when False, only the first sub-token does and the
            rest are set to -100.

    Returns:
        The tokenizer output with an added 'labels' entry holding one list of
        integer ids per text. Positions with no source word are set to -100.
    """
    encoded = tokenizer(example['tokens'], truncation=True, is_split_into_words=True)
    aligned = []

    for batch_idx, word_tags in enumerate(example['tags']):
        # word_ids maps each sub-token to the index of the word it came from.
        prev_word = None
        row = []
        for word_idx in encoded.word_ids(batch_index=batch_idx):
            if word_idx is None:
                tag = -100
            elif word_idx != prev_word or label_all_tokens:
                tag = word_tags[word_idx]
            else:
                tag = -100
            prev_word = word_idx
            row.append(tag)
        # String labels become their index in label_list; -100 passes through.
        aligned.append([label_list.index(tag) if isinstance(tag, str) else tag for tag in row])

    encoded['labels'] = aligned
    return encoded

In [16]:
# Tokenize both dataset variants in batches and attach the aligned label ids.
tokenized_dataset_full = dataset_hf_full.map(tokenize_and_align_labels, batched=True)
tokenized_dataset_nofull = dataset_hf_nofull.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/3348 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/452 [00:00<?, ? examples/s]

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

In [51]:
# Fresh token-classification model (one output per entry of label_list) to be
# trained on the labeled-only split.
model_nofull = AutoModelForTokenClassification.from_pretrained('sergeyzh/rubert-mini-sts', num_labels=len(label_list))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at sergeyzh/rubert-mini-sts and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Second, independent copy of the same checkpoint to be trained on the full split.
model_full = AutoModelForTokenClassification.from_pretrained('sergeyzh/rubert-mini-sts', num_labels=len(label_list))

config.json:   0%|          | 0.00/692 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/130M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at sergeyzh/rubert-mini-sts and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at sergeyzh/rubert-mini-sts and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [52]:
from transformers import TrainingArguments, Trainer

# Training configuration shared by both trainers; evaluation runs after every epoch.
args = TrainingArguments(
    output_dir='test-ner',
    num_train_epochs=20,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy='epoch',
    logging_steps=15,
    report_to='none',
)

In [53]:
# Collator that assembles individual tokenized examples into a batch.
data_collator = DataCollatorForTokenClassification(tokenizer) # forms a batch

In [54]:
from sklearn.metrics import f1_score

def compute_metrics(eval_preds):
    """Compute a class-weighted F1 over all labeled token positions.

    Args:
        eval_preds: pair ``(logits, labels)``. The class dimension of the
            logits is axis 2; label positions equal to -100 are ignored.

    Returns:
        ``{'f1_weighted': score}`` where the score is sklearn's weighted-average
        F1 with every token weighted by the class weight of its true label.
    """
    pred_logits, labels = eval_preds
    pred_ids = np.argmax(pred_logits, axis=2)

    # Walk predictions and references together, keeping only labeled positions.
    predictions_flat = []
    true_labels_flat = []
    for pred_row, label_row in zip(pred_ids, labels):
        for pred_id, label_id in zip(pred_row, label_row):
            if label_id != -100:
                predictions_flat.append(label_list[pred_id])
                true_labels_flat.append(label_list[label_id])

    # Per-token weights are looked up from the true label of each token.
    class_weights = {'O': 0.003, 'B-discount': 1, 'B-value': 2, 'I-value': 2}
    sample_weight = [class_weights[name] for name in true_labels_flat]

    score = f1_score(true_labels_flat, predictions_flat, average='weighted', sample_weight=sample_weight)
    return {'f1_weighted': score}

In [55]:
# Two trainers that differ only in the model and the dataset variant they use;
# both share the same arguments, collator, tokenizer and metric function.
trainer_nofull = Trainer(
    model=model_nofull,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset_nofull['train'],
    eval_dataset=tokenized_dataset_nofull['val'],
)

trainer_full = Trainer(
    model=model_full,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset_full['train'],
    eval_dataset=tokenized_dataset_full['val'],
)

In [56]:
# import os
# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [58]:
# Fine-tune the model on the labeled-only split.
trainer_nofull.train()



Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.0463,0.031181,0.458103
2,0.0425,0.027774,0.395638
3,0.0433,0.028659,0.668197
4,0.0415,0.025402,0.511229
5,0.0366,0.026752,0.693501
6,0.0343,0.026515,0.696851
7,0.033,0.025827,0.70718
8,0.032,0.024822,0.721267
9,0.0315,0.02509,0.758632
10,0.0368,0.02433,0.740473




TrainOutput(global_step=300, training_loss=0.03275359143813451, metrics={'train_runtime': 116.5211, 'train_samples_per_second': 77.582, 'train_steps_per_second': 2.575, 'total_flos': 158908353256128.0, 'train_loss': 0.03275359143813451, 'epoch': 20.0})

In [74]:
# Fine-tune the model on the full split.
trainer_full.train()



Epoch,Training Loss,Validation Loss,F1 Weighted
1,0.0046,0.024347,0.675254
2,0.0041,0.025317,0.672603
3,0.0037,0.027417,0.661704
4,0.0043,0.027187,0.697817
5,0.0049,0.027783,0.661646
6,0.0027,0.030229,0.675579
7,0.0023,0.028849,0.626709
8,0.0033,0.027549,0.689155
9,0.0019,0.029863,0.667666
10,0.0028,0.030356,0.642907




TrainOutput(global_step=2100, training_loss=0.0034187019678453604, metrics={'train_runtime': 783.4483, 'train_samples_per_second': 85.468, 'train_steps_per_second': 2.68, 'total_flos': 1186367982695616.0, 'train_loss': 0.0034187019678453604, 'epoch': 20.0})

In [75]:
# Persist both fine-tuned models and the tokenizer. The following cells reload
# all three from these directories, so none of the saves may be skipped or the
# notebook fails on a fresh Restart & Run All.
model_nofull.save_pretrained('ner_model_nofull')
model_full.save_pretrained('ner_model_full')
tokenizer.save_pretrained('tokenizer')

In [76]:
# Label maps written into the saved model configs.
# id2label keys are strings because JSON object keys are always strings.
id2label = {
    str(i): label for i, label in enumerate(label_list)
}

# label2id values must be integer class ids, not their string form.
label2id = {
    label: i for i, label in enumerate(label_list)
}

In [77]:
import json
# Read the saved configs; context managers close the files deterministically.
with open('/kaggle/working/ner_model_full/config.json') as config_file:
    config_full = json.load(config_file)
with open('/kaggle/working/ner_model_nofull/config.json') as config_file:
    config_nofull = json.load(config_file)

In [78]:
# Attach the human-readable label maps to both configs.
for config in (config_full, config_nofull):
    config['id2label'] = id2label
    config['label2id'] = label2id

# Write the configs back. The `with` blocks guarantee the data is flushed and
# the files are closed before the models are reloaded from disk.
with open('/kaggle/working/ner_model_full/config.json', 'w') as config_file:
    json.dump(config_full, config_file)
with open('/kaggle/working/ner_model_nofull/config.json', 'w') as config_file:
    json.dump(config_nofull, config_file)

In [79]:
# Reload both fine-tuned models from disk so they pick up the edited configs.
model_finetuned_full = AutoModelForTokenClassification.from_pretrained('/kaggle/working/ner_model_full')
model_finetuned_nofull = AutoModelForTokenClassification.from_pretrained('/kaggle/working/ner_model_nofull')

In [80]:
# Reload the tokenizer from disk; it must have been saved to this directory beforehand.
tokenizer = AutoTokenizer.from_pretrained('/kaggle/working/tokenizer')

In [81]:
from transformers import pipeline
# One NER inference pipeline per fine-tuned model, sharing the same tokenizer.
nlp_full = pipeline('ner', model=model_finetuned_full, tokenizer=tokenizer)
nlp_nofull = pipeline('ner', model=model_finetuned_nofull, tokenizer=tokenizer)

In [82]:
# Function to reconstruct words from subwords
def reconstruct_words(ner_results, tokenizer):
    """Merge WordPiece sub-tokens ('##...') of NER output back into whole words.

    Args:
        ner_results: list of token dicts with the keys 'word', 'entity',
            'score', 'start' and 'end'.
        tokenizer: unused; kept so existing call sites keep working.

    Returns:
        A list of dicts with keys 'entity', 'score', 'word', 'start', 'end'.
        'entity', 'score' and 'start' come from the first piece of a word and
        'end' from its last piece. Previously 'score' was taken from the token
        *following* the word, so it did not match the reported entity.
    """
    reconstructed_results = []
    current = None  # word being assembled, or None before the first token

    def flush():
        # Words whose text is empty are dropped.
        if current is not None and current['word']:
            reconstructed_results.append(current)

    for token in ner_results:
        piece = token['word']
        if piece.startswith("##"):
            if current is None:
                # Continuation piece with no preceding word start: keep its
                # text, but there is no entity or start offset to report.
                current = {'entity': None, 'score': token['score'], 'word': "", 'start': None, 'end': None}
            current['word'] += piece[2:]
            current['end'] = token['end']
        else:
            flush()
            current = {
                'entity': token['entity'],
                'score': token['score'],
                'word': piece,
                'start': token['start'],
                'end': token['end'],
            }

    # Append the last word.
    flush()

    return reconstructed_results


In [83]:
# Run both pipelines on the validation texts, re-joined with single spaces.
pred2_full = nlp_full([' '.join(tokens) for tokens in dataset_hf_full['val']['tokens']]) # inefficient since we don't need the exact probabilities now
pred2_nofull = nlp_nofull([' '.join(tokens) for tokens in dataset_hf_nofull['val']['tokens']])

In [84]:
def find_word_index_by_char_range(s, start_idx, end_idx):
    """Return indices of the whitespace-separated words overlapping a char range.

    Args:
        s: the string to inspect.
        start_idx: first character offset of the range (inclusive).
        end_idx: last character offset of the range (inclusive).

    Returns:
        Indices into ``s.split()`` of every word that shares at least one
        character position with ``[start_idx, end_idx]``.

    Word offsets are located in ``s`` itself instead of assuming exactly one
    space between words, so leading or repeated whitespace no longer shifts
    the computed positions.
    """
    word_indices = []
    search_from = 0

    for i, word in enumerate(s.split()):
        # Only whitespace lies between search_from and the next word, so the
        # first match at or after search_from is that word's true position.
        word_start_pos = s.index(word, search_from)
        word_end_pos = word_start_pos + len(word) - 1

        if word_start_pos <= end_idx and word_end_pos >= start_idx:
            word_indices.append(i)

        search_from = word_end_pos + 1

    return word_indices

# Quick sanity check: characters 5..22 overlap the words at indices 1 through 4.
initial_string = "Find the word index by the character range in the initial string"
start_index = 5
end_index = 22

print(find_word_index_by_char_range(initial_string, start_index, end_index))

[1, 2, 3, 4]


In [85]:
def get_initial_index(prediction, initial_string):
    """Map predicted character spans back to word indices of the original text.

    Args:
        prediction: iterable of dicts with 'start', 'end' and 'entity' keys.
        initial_string: list of words; joined with single spaces to rebuild
            the sentence that the character offsets are matched against.

    Returns:
        Dict mapping word index -> entity label. If several predictions touch
        the same word, the label of the last one wins.
    """
    sentence = ' '.join(initial_string)
    word_labels = {}
    for token_pred in prediction:
        touched_words = find_word_index_by_char_range(
            s=sentence,
            start_idx=token_pred['start'],
            end_idx=token_pred['end'],
        )
        for word_idx in touched_words:
            word_labels[word_idx] = token_pred['entity']
    return word_labels

In [86]:
# Convert character-level predictions into {word index: label} dicts, one per
# validation text. list.append returns None, so the previous assignment of its
# result to an otherwise unused variable was dropped.
preds_final_full = [
    get_initial_index(prediction, initial_string)
    for prediction, initial_string in zip(pred2_full, dataset_hf_full['val']['tokens'])
]
preds_final_nofull = [
    get_initial_index(prediction, initial_string)
    for prediction, initial_string in zip(pred2_nofull, dataset_hf_nofull['val']['tokens'])
]

preds_final_full[:10]

[{},
 {116: 'B-discount',
  442: 'B-discount',
  443: 'B-value',
  444: 'I-value',
  450: 'B-discount'},
 {236: 'B-discount'},
 {306: 'B-discount', 309: 'B-value', 310: 'I-value'},
 {},
 {91: 'B-discount', 92: 'I-value', 93: 'B-value', 94: 'I-value'},
 {12: 'B-discount',
  17: 'B-value',
  18: 'I-value',
  43: 'B-value',
  56: 'B-value',
  57: 'I-value',
  72: 'B-discount',
  93: 'B-value'},
 {14: 'B-discount', 18: 'B-value', 19: 'I-value'},
 {},
 {19: 'I-value',
  20: 'B-discount',
  21: 'B-value',
  22: 'I-value',
  47: 'B-discount',
  48: 'B-value',
  49: 'I-value',
  61: 'I-value',
  83: 'I-value',
  84: 'B-discount',
  235: 'B-discount',
  236: 'B-value',
  237: 'I-value',
  257: 'I-value',
  283: 'I-value'}]

In [87]:
def _index_map_to_tags(index_to_label, n_tokens):
    """Expand a {word index: label} dict into a tag list of length n_tokens ('O' by default)."""
    tags = ['O'] * n_tokens
    for word_idx, label in index_to_label.items():
        tags[word_idx] = label
    return tags


# One full-length tag sequence per validation text, for each model.
preds_val_full = [
    _index_map_to_tags(prediction, len(tokens))
    for prediction, tokens in zip(preds_final_full, dataset_hf_full['val']['tokens'])
]
preds_val_nofull = [
    _index_map_to_tags(prediction, len(tokens))
    for prediction, tokens in zip(preds_final_nofull, dataset_hf_nofull['val']['tokens'])
]

In [88]:
class_weights = {'O': 0.003, 'B-discount': 1, 'B-value': 2, 'I-value': 2}


def _weighted_f1(true_tags, pred_tags):
    """Class-weighted F1 of one text; each token is weighted by its true label's class weight."""
    sample_weight = [class_weights[label] for label in true_tags]
    return f1_score(true_tags, pred_tags[:len(true_tags)], average='weighted', sample_weight=sample_weight)


# Read each 'tags' column once instead of re-reading it several times per row.
val_tags_full = dataset_hf_full['val']['tags']
val_tags_nofull = dataset_hf_nofull['val']['tags']

score_full = [_weighted_f1(true_tags, pred_tags) for true_tags, pred_tags in zip(val_tags_full, preds_val_full)]
score_nofull = [_weighted_f1(true_tags, pred_tags) for true_tags, pred_tags in zip(val_tags_nofull, preds_val_nofull)]

In [89]:
# Mean per-text weighted F1: full-data model first, labeled-only model second.
print(np.mean(score_full))
print(np.mean(score_nofull))

0.5790931112371377
0.7770718375791306


In [None]:
# Load the test set to run inference on.
test = pd.read_csv('/kaggle/input/samolet/gt_test.csv')

In [None]:
# `nlp` and `dataset_hf` are never defined in this notebook (NameError on a
# fresh kernel). Use the labeled-only pipeline and dataset, which scored higher
# on validation; swap in nlp_full / dataset_hf_full to inspect the other model.
pred2 = nlp_nofull([' '.join(tokens) for tokens in dataset_hf_nofull['val']['tokens']]) # inefficient since we don't need the exact probabilities now

In [None]:
# `nlp` is never defined in this notebook; use the labeled-only pipeline, which
# scored higher on validation (swap in nlp_full to compare).
# Whitespace is collapsed to single spaces so the character offsets returned by
# the pipeline refer to the same string that get_initial_index rebuilds with
# ' '.join(tokens) from the whitespace-split text.
pred_f = nlp_nofull([' '.join(text.split()) for text in test['processed_text']])

In [None]:
# Inspect a single raw test text (positional index 14).
list(test['processed_text'].values)[14]

In [None]:
# Convert test-set predictions into {word index: label} dicts, one per text.
# list.append returns None, so the previous assignment of its result to an
# otherwise unused variable was dropped.
preds_final2 = [
    get_initial_index(prediction, initial_string)
    for prediction, initial_string in zip(pred_f, test['processed_text'].str.split())
]

preds_final2[:10]

In [None]:
# Expand every {word index: label} dict into a tag sequence as long as the
# whitespace-split test text, defaulting to 'O'.
preds_test = []
for index_to_label, words in zip(preds_final2, test['processed_text'].str.split()):
    tags = ['O'] * len(words)
    for word_idx, label in index_to_label.items():
        tags[word_idx] = label
    preds_test.append(tags)

In [None]:
# Number of predicted tag sequences.
len(preds_test)

In [None]:
# Store the predicted tag sequences in a new 'label' column of the test frame.
test['label'] = preds_test

In [None]:
# Write the test frame with predictions to CSV, without the index column.
test.to_csv('test_preds.csv', index = False)