In [1]:
import os
import re
import time
import math
import fasttext
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import Counter

from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils import clip_grad_value_
from torch.nn.utils.rnn import pad_sequence
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD
from torch.optim.lr_scheduler import CosineAnnealingLR

from tqdm.notebook import tqdm
import warnings 
warnings.filterwarnings('ignore')

import transformers
from transformers import (
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    get_cosine_schedule_with_warmup,
    get_linear_schedule_with_warmup,
    logging,
    MODEL_FOR_QUESTION_ANSWERING_MAPPING,
)
logging.set_verbosity_warning()
logging.set_verbosity_error()

In [2]:
!nvidia-smi

Mon Sep 19 12:35:59 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    25W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [3]:
# ====================================================
# Data Loading
# ====================================================
df_issues_train = pd.read_csv('../input/agu-comp/train_issues.csv')
df_comment_train = pd.read_csv("../input/agu-comp/train_comments.csv")

df_issues_test = pd.read_csv('../input/agu-comp/test_issues.csv')
df_comment_test = pd.read_csv("../input/agu-comp/test_comments.csv")

df_employees = pd.read_csv("../input/agu-comp/employees.csv")
print(f"train.shape: {df_issues_train.shape}")
display(df_issues_train.head())
display(df_comment_train.head())
display(df_employees.head())

train.shape: (9589, 8)


Unnamed: 0,id,created,key,summary,project_id,assignee_id,creator_id,overall_worklogs
0,819952,2019-10-01 05:57:18.000,SM-10678,"UI тесты по заказу ""Добро КейДжи""",5,93,93,1800
1,819949,2019-10-01 05:59:45.000,SM-10679,"UI тесты раздела ""Профиль""",5,93,93,7200
2,819947,2019-10-01 06:00:38.000,SM-10680,"UI тесты раздела ""Личный счет""",5,93,93,14400
3,819943,2019-10-01 06:02:49.000,SM-10682,"UI тесты раздела ""Новости""",5,93,93,900
4,819941,2019-10-01 06:03:26.000,SM-10683,"UI тесты раздела ""Зоны скидок и доплат""",5,93,93,900


Unnamed: 0,comment_id,text,issue_id,author_id
0,11779,[https://www.youtube.com/watch?v=tuhOdtsvoNY|h...,669666,1
1,10601,OK [~accountid:557058:3f7ab89a-8969-4547-90df-...,669670,1
2,76101,I encountered a problem with access to `/users...,670930,2
3,76102,I have learned that `users/:id/emails` endpoin...,670930,2
4,76213,We have decided with Andrew to set member's em...,670930,2


Unnamed: 0,id,active,full_name,position,hiring_type,payment_type,salary_calculation_type,english_level,passport,is_nda_signed,is_labor_contract_signed,is_added_to_internal_chats,is_added_one_to_one
0,1,1,David Courtney,,,,,,0,0,0,0,0
1,2,0,Dan Guerra,Web-разработчик,staff,fixed,,,0,0,0,0,0
2,4,0,Grady Smith,Web-разработчик,staff,fixed,,,0,0,0,0,0
3,6,0,James Powell,Разработчик мобильных приложений,staff,fixed,,,0,0,0,0,0
4,8,1,John Brown,Разработчик мобильных приложений,staff,fixed,,,1,1,1,1,1


In [4]:
import re

def clean_and_join(txt):
    txt = " SEP ".join(txt)
    txt = re.sub('\xa0', ' ', txt)
    txt = re.sub('\n', ' ', txt)
    txt = re.sub('\t', ' ', txt)
    txt = re.sub('\r', ' ', txt)
    txt = re.sub(r'\[.+?\]', ' ', txt)
    txt = re.sub(r'!.+?!', ' ', txt)
    txt = re.sub('http\S+', 'URL', txt)
    txt = re.sub(' +', ' ', txt)
    txt = re.sub('SEP', '<s>', txt)
    
    return txt.strip()

df_comment_train_group = df_comment_train.groupby(by='issue_id')['text'].agg(list).to_frame().reset_index()
df_comment_train_group['comm_authors'] = df_comment_train.groupby(by='issue_id')['author_id'].agg('nunique').to_frame().reset_index()['author_id']
df_comment_train_group.text = df_comment_train_group.text.apply(clean_and_join)

df_comment_test_group = df_comment_test.groupby(by='issue_id')['text'].agg(list).to_frame().reset_index()
df_comment_test_group['comm_authors'] = df_comment_test.groupby(by='issue_id')['author_id'].agg('nunique').to_frame().reset_index()['author_id']
df_comment_test_group['text'] = df_comment_test_group['text'].apply(clean_and_join)

In [5]:
dftr = pd.merge(df_issues_train, df_comment_train_group, left_on="id", right_on="issue_id", how='left').reset_index(drop=True)
dftr['comm_authors'] = dftr['comm_authors'].fillna(-1)
dftr['text'] = dftr['text'].fillna('')
dftr['full_text'] = '[TITLE]' + ' ' + dftr['summary'] + ' ' + '[COMMENT]' + ' ' + dftr['text']

In [6]:
dfte = pd.merge(df_issues_test, df_comment_test_group, left_on="id", right_on="issue_id", how='left').reset_index(drop=True)
dfte['comm_authors'] = dfte['comm_authors'].fillna(-1)
dfte['text'] = dfte['text'].fillna('')
dfte['full_text'] = '[TITLE]' + ' ' + dfte['summary'] + ' ' + '[COMMENT]' + ' ' + dfte['text']

In [7]:
dftr = dftr.drop_duplicates(subset='full_text').reset_index(drop=True)

In [8]:
val = dftr.sample(960, random_state=42).reset_index(drop=True)
train = dftr[~dftr['id'].isin(val['id'])].reset_index(drop=True)
train = pd.concat([train, dfte]).reset_index(drop=True)

In [9]:
train = train.drop_duplicates(subset='full_text').reset_index(drop=True)

In [10]:
with open('train_texts_all.txt', 'w') as f:
    for item in train.full_text.values:
        f.write("%s\n" % item)
        
with open('dev_texts_all.txt', 'w') as f:
    for item in val.full_text.values:
        f.write("%s\n" % item)

In [11]:
"""
This file runs Masked Language Model. You provide a training file. Each line is interpreted as a sentence / paragraph.
Optionally, you can also provide a dev file.
The fine-tuned model is stored in the output/model_name folder.
Usage:
python train_mlm.py model_name data/train_sentences.txt [data/dev_sentences.txt]
"""

from transformers import AutoModelForMaskedLM, AutoTokenizer
from transformers import DataCollatorForLanguageModeling, DataCollatorForWholeWordMask
from transformers import Trainer, TrainingArguments
import sys
import gzip
from datetime import datetime

os.environ["WANDB_DISABLED"] = "true"

model_name = 'xlm-roberta-base'
per_device_train_batch_size = 4
#device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

save_steps = 1000               #Save model every n k steps
num_train_epochs = 2      #Number of epochs
use_fp16 = True               #Set to True, if your GPU supports FP16 operations
max_length = 256               #Max length for a text input
do_whole_word_mask = False       #If set to true, whole words are masked
mlm_prob = 0.15                 #Probability that a word is replaced by a [MASK] token

# Load the model
model = AutoModelForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

output_dir = "./{}".format(model_name.replace("/", "_"))
print("Save checkpoints to:", output_dir)

##### Load our training datasets

train_sentences = []
train_path = './train_texts_all.txt'
with gzip.open(train_path, 'rt', encoding='utf8') if train_path.endswith('.gz') else  open(train_path, 'r', encoding='utf8') as fIn:
    for line in fIn:
        line = line.strip()
        if len(line) >= 10:
            train_sentences.append(line)
            
dev_sentences = []
train_path = './dev_texts_all.txt'
with gzip.open(train_path, 'rt', encoding='utf8') if train_path.endswith('.gz') else  open(train_path, 'r', encoding='utf8') as fIn:
    for line in fIn:
        line = line.strip()
        if len(line) >= 10:
            dev_sentences.append(line)

print("Train sentences:", len(train_sentences))
print("Dev sentences:", len(dev_sentences))

#A dataset wrapper, that tokenizes our data on-the-fly
class TokenizedSentencesDataset:
    def __init__(self, sentences, tokenizer, max_length, cache_tokenization=False):
        self.tokenizer = tokenizer
        self.sentences = sentences
        self.max_length = max_length
        self.cache_tokenization = cache_tokenization

    def __getitem__(self, item):
        if not self.cache_tokenization:
            return self.tokenizer(self.sentences[item], padding=True, add_special_tokens=True, truncation=True, 
                                  max_length=self.max_length, return_special_tokens_mask=True)

        if isinstance(self.sentences[item], str):
            self.sentences[item] = self.tokenizer(self.sentences[item], padding=True, add_special_tokens=True, truncation=True, 
                                                  max_length=self.max_length, return_special_tokens_mask=True)
        return self.sentences[item]

    def __len__(self):
        return len(self.sentences)

train_dataset = TokenizedSentencesDataset(train_sentences, tokenizer, max_length)
#dev_sentences = []
dev_dataset = TokenizedSentencesDataset(dev_sentences, tokenizer, max_length, cache_tokenization=True) if len(dev_sentences) > 0 else None


##### Training arguments

if do_whole_word_mask:
    data_collator = DataCollatorForWholeWordMask(tokenizer=tokenizer, mlm=True, mlm_probability=mlm_prob)
else:
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=mlm_prob)

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=num_train_epochs,
    evaluation_strategy="steps" if dev_dataset is not None else "no",
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_train_batch_size,
    group_by_length=True,
    learning_rate=2e-5,
    weight_decay=1e-2,
    warmup_ratio=0.05,
    eval_steps=save_steps,
    save_steps=save_steps,
    logging_steps=save_steps,
    save_total_limit=1,
    prediction_loss_only=True,
    gradient_accumulation_steps=1,
    fp16=use_fp16,
    seed=42,
    disable_tqdm=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset
)

print("Save tokenizer to:", output_dir)
tokenizer.save_pretrained(output_dir)

trainer.train()

print("Save model to:", output_dir)
model.save_pretrained(output_dir)

print("Training done")

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

Save checkpoints to: ./xlm-roberta-base
Train sentences: 9457
Dev sentences: 960


Using cuda_amp half precision backend
tokenizer config file saved in ./xlm-roberta-base/tokenizer_config.json
Special tokens file saved in ./xlm-roberta-base/special_tokens_map.json


Save tokenizer to: ./xlm-roberta-base


***** Running training *****
  Num examples = 9457
  Num Epochs = 2
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 4730


Step,Training Loss,Validation Loss
1000,2.3098,2.010332
2000,1.838,1.77122
3000,1.7199,1.778475
4000,1.6725,1.681837


***** Running Evaluation *****
  Num examples = 960
  Batch size = 4
Saving model checkpoint to ./xlm-roberta-base/checkpoint-1000
Configuration saved in ./xlm-roberta-base/checkpoint-1000/config.json
Model weights saved in ./xlm-roberta-base/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 960
  Batch size = 4
Saving model checkpoint to ./xlm-roberta-base/checkpoint-2000
Configuration saved in ./xlm-roberta-base/checkpoint-2000/config.json
Model weights saved in ./xlm-roberta-base/checkpoint-2000/pytorch_model.bin
Deleting older checkpoint [xlm-roberta-base/checkpoint-1000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 960
  Batch size = 4
Saving model checkpoint to ./xlm-roberta-base/checkpoint-3000
Configuration saved in ./xlm-roberta-base/checkpoint-3000/config.json
Model weights saved in ./xlm-roberta-base/checkpoint-3000/pytorch_model.bin
Deleting older checkpoint [xlm-roberta-base/checkpoint-2000] due to args.save_t

Save model to: ./xlm-roberta-base


Model weights saved in ./xlm-roberta-base/pytorch_model.bin


Training done
