In [1]:
#!g1.1
import pandas as pd
import numpy as np
from sklearn import preprocessing
import regex as re

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
import torch

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
#!g1.1
df = pd.read_csv('/home/jupyter/mnt/s3/generalvibe/data/clean_train.csv')

# le = preprocessing.LabelEncoder()
# le.fit(df.sentiment)
# df['sentiment'] = le.transform(df.sentiment)
df

Unnamed: 0,sentence,1category,sentiment,length
0,При этом всегда получал качественные услуги.,Communication,+,44
1,Отвратительное отношение к клиентам.,Communication,−,36
2,"Всегда в любое время дня и ночи помогут, ответ...",Communication,+,56
3,"Все время согласовывалось, всё делалось быстро.",Communication,+,47
4,Абсолютное бездействие и нежелание банка работ...,Quality,−,62
...,...,...,...,...
6955,"Ни заявления, ни документа, подтверждающего оп...",Quality,−,71
6956,И в последний момент (так как карта до августа...,Communication,−,169
6957,"Хотелось бы так же прояснить, что до сложившей...",Quality,+,89
6958,Никогда и ни в коем случае не открывайте счет ...,Communication,−,109


In [30]:
#!g1.1
def label_encoding(label):
    enc_label = None
    if label == '+':
        enc_label = 0
    if label == '?':
        enc_label = 1
    if label == '−':
        enc_label = 2
    return enc_label

df.sentiment = df.sentiment.apply(label_encoding)
df

Unnamed: 0,sentence,1category,sentiment,length
0,При этом всегда получал качественные услуги.,Communication,0,44
1,Отвратительное отношение к клиентам.,Communication,2,36
2,"Всегда в любое время дня и ночи помогут, ответ...",Communication,0,56
3,"Все время согласовывалось, всё делалось быстро.",Communication,0,47
4,Абсолютное бездействие и нежелание банка работ...,Quality,2,62
...,...,...,...,...
6955,"Ни заявления, ни документа, подтверждающего оп...",Quality,2,71
6956,И в последний момент (так как карта до августа...,Communication,2,169
6957,"Хотелось бы так же прояснить, что до сложившей...",Quality,0,89
6958,Никогда и ни в коем случае не открывайте счет ...,Communication,2,109


In [None]:
#!g1.1


In [31]:
#!g1.1
df.sentiment.value_counts()

2    4026
0    2551
1     383
Name: sentiment, dtype: int64

In [32]:
#!g1.1
df['1category'].value_counts()

Communication    3615
Quality          3015
Price             193
Safety            137
Name: 1category, dtype: int64

In [33]:
#!g1.1
stop_words = set(stopwords.words('russian'))

In [39]:
#!g1.1
df['sentence_processed'] = df['sentence'].apply(lambda text: ' '.join([word.lower() for word in text.split() if word.lower() not in stop_words]))
# df['sentence_processed'] = df['sentence_processed'].apply(lambda text: re.sub(" \d+", " ", text))

In [40]:
#!g1.1
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
model = model.to('cuda')

loading configuration file https://huggingface.co/ProsusAI/finbert/resolve/main/config.json from cache at /tmp/xdg_cache/huggingface/transformers/2120f4f96b5830e5a91fe94d242471b0133b0976c8d6e081594ab837ac5f17bc.ef97278c578016c8bb785f15296476b12eae86423097fed78719d1c8197a3430
Model config BertConfig {
  "_name_or_path": "ProsusAI/finbert",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "positive",
    "1": "negative",
    "2": "neutral"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 1,
    "neutral": 2,
    "positive": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute"

In [41]:
#!g1.1
X = list(df["sentence_processed"])
y = list(df["sentiment"])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2,stratify=y)
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

In [99]:
#!g1.1
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

    
def compute_metrics(p):
    print(type(p))
    proba, labels = p
    pred = np.argmax(proba, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average='macro')
    precision = precision_score(y_true=labels, y_pred=pred, average='macro')
    f1 = f1_score(y_true=labels, y_pred=pred, average='macro')
#     roc_auc = roc_auc_score(y_true=labels, y_score=pred, average='macro')
#     roc_auc = roc_auc_score(labels, proba, multi_class="ovr", average="weighted")
    
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}
#     return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1, "roc_auc": roc_auc}
#     return {"accuracy": accuracy, "f1": f1}


In [100]:
#!g1.1
train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)

In [101]:
#!g1.1
STEPS = 1000

training_args = TrainingArguments(
    output_dir="output",
    per_device_train_batch_size=32,
    max_steps=STEPS,
    save_steps=200,
    learning_rate=3e-5,
    weight_decay=0.1,
    warmup_steps=STEPS/10,
    eval_steps=100,
    logging_steps=100,
    seed=42,
    evaluation_strategy='steps',
    logging_strategy='steps',
#     save_strategy='no',
    load_best_model_at_end=True,
    logging_first_step=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# trainer.train()

max_steps is given, it will override any value given in num_train_epochs


In [46]:
#!g1.1
trainer.train()

***** Running training *****
  Num examples = 5568
  Num Epochs = 6
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1000


Step,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 1392
  Batch size = 8


<class 'transformers.trainer_utils.EvalPrediction'>


  _warn_prf(average, modifier, msg_start, len(result))


ValueError: Target scores need to be probabilities for multiclass roc_auc, i.e. they should sum up to 1.0 over classes

In [None]:
#!g1.1
trainer.eval()

In [None]:
#!g1.1
trainer.save_model(f'tuned_model{i}')

In [None]:
#!g1.1
tuned_model = BertForSequenceClassification.from_pretrained("CustomModel")
tuned_model.to('cuda')

In [27]:
#!g1.1
text = "Банк смог сделать все отлично"
inputs = tokenizer(text,padding = True, truncation = True, return_tensors='pt').to('cuda')
outputs = model(**inputs)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
predictions = predictions.cpu().detach().numpy()
predictions

array([[0.9354689 , 0.0618398 , 0.00269118]], dtype=float32)

In [47]:
#!g1.1
!ssh-keygen -f ~/.ssh/id_rsa -q -N ""

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [48]:
#!g1.1
!cat ~/.ssh/id_rsa.pub

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDbs+JsDWwgHzJAR7joHGBM946SHiqh0ujmRin8ArRau8kPm8keGfhF7kfa4hfl4BIgeIlbAXzKGDPRMeEVMy0hIxzAnGi3WuqEnOyc8MJQXJtqYJqT1b7ScmMwoXkble0bi3C8gmAeujc1zwFSthmr0z+qLcJK/cVtsRNAo6/EU18oMun80eVRZ//+De/wNNylwAAS1NrC9rER6hMTBNgaLmJyMCljFPXiv0/rHJfPnMXV0DW4o8q33iYMMlMJPTXbjg5RSdu0pLBF+MC+dlpi7tECWHLm73cdpHcP49yc9a/v3qSVNSojQwUu7dE9DKsVp+QZaG6cuHwhAeUoVuRN jupyter@l-32e44f28-5ec6-48c1-81ea-1386f82f21f2


In [117]:
#!g1.1
# !pwd
!git remote add origin git@github.com:Chrome1278/HSE_DSH_ThunderForward.git
# !git remote -v
# !ssh-keyscan -t rsa github.com >> ~/.ssh/known_hosts
# !git config --global user.email "sacha.leovin@yandex.ru"
# !git config --global user.name "leovin"
# !git add -A
# !git reset

In [125]:
#!g1.1
!git 

* main


In [63]:
#!g1.1
!ls -la

total 49452
drwxr-xr-x 1 jupyter jupyter      374 Apr  9 11:12 .
drwxrwxr-x 1 jupyter jupyter       44 Apr  8 09:33 ..
drwxr-xr-x 1 jupyter jupyter      108 Apr  9 11:13 .git
drwxr-xr-x 1 jupyter jupyter      438 Apr  9 09:23 .ipynb_checkpoints
-rw-r--r-- 1 jupyter root       28583 Apr  9 08:23 Untitled.ipynb
-rw-r--r-- 1 jupyter root    50458694 Apr  9 09:13 Untitled1-Copy2.ipynb
-rw-r--r-- 1 jupyter root       10263 Apr  8 10:08 dataflow_en.ipynb
-rw-r--r-- 1 jupyter root       15229 Apr  9 05:05 dataflow_ru.ipynb
-rw-r--r-- 1 jupyter root       25129 Apr  8 09:50 environment_ru.ipynb
-rw-r--r-- 1 jupyter root       18763 Apr  9 07:03 modeltraining_ru.ipynb
drwxr-xr-x 1 jupyter jupyter       44 Apr  9 11:01 output
-rw-r--r-- 1 jupyter root        7414 Apr  9 08:50 prepare_dataset.ipynb
-rw-r--r-- 1 jupyter root       57547 Apr  9 11:12 training_transformers.ipynb


In [66]:
#!g1.1
%%file .gitignore
.ipynb_checkpoints
output/

Overwriting .gitignore


In [116]:
#!g1.1
# !git status
# !git commit -a -m "some fix"
!git pull origin main
# --set-upstream origin main
#!git init

fatal: 'origin' does not appear to be a git repository
fatal: Could not read from remote repository.

Please make sure you have the correct access rights
and the repository exists.


Exception: Process exited with code 1

In [108]:
#!g1.1
# !git add -A
# !git status
# !git commit -a -m 'first commit'
!git push --set-upstream origin main 

To github.com:Chrome1278/HSE_DSH_ThunderForward.git
 ! [rejected]        main -> main (non-fast-forward)
error: failed to push some refs to 'git@github.com:Chrome1278/HSE_DSH_ThunderForward.git'
hint: Updates were rejected because the tip of your current branch is behind
hint: its remote counterpart. Integrate the remote changes (e.g.
hint: 'git pull ...') before pushing again.
hint: See the 'Note about fast-forwards' in 'git push --help' for details.


Exception: Process exited with code 1

In [111]:
#!g1.1
!git init