# NLP test task for BST

## Data loading

In [1]:
import pandas as pd

In [8]:
# Both CSV files are decoded as cp1251; read them with one shared call.
df_train, df_test = (
    pd.read_csv(csv_name, encoding='cp1251')
    for csv_name in ('train_tin.csv', 'test_tin.csv')
)

In [14]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,text,isPositive
0,"27.09.19 через сайт в разделе ""Рефинансировани...",0
1,Добрый день! Неоднократно поступают звонки по ...,0
2,В первый раз решила воспользоваться кредитной ...,0
3,На самом дело уже накипело из-за участившегося...,0
4,Если вы дорожите своими нервами - ни при каких...,0


## Yay, transformers

In [16]:
from transformers import AutoModelForSequenceClassification
from transformers import BertTokenizerFast
import torch

# Load the pretrained tokenizer and the sequence-classification model from the
# same 'blanchefort/rubert-base-cased-sentiment' checkpoint.
tokenizer = BertTokenizerFast.from_pretrained(
    'blanchefort/rubert-base-cased-sentiment')
# return_dict=True: predict() below reads the result through `outputs.logits`.
model = AutoModelForSequenceClassification.from_pretrained(
    'blanchefort/rubert-base-cased-sentiment', return_dict=True)


def predict(text):
    """Return the predicted class index for each input text.

    Args:
        text: Input passed straight to the module-level ``tokenizer``
            (a single string is what the notebook uses).

    Returns:
        A NumPy array with one class index per input row, taken as the
        argmax of the model logits along dimension 1.
    """
    # Inputs are truncated/padded to at most 512 tokens and returned as
    # PyTorch tensors.
    inputs = tokenizer(text, max_length=512, padding=True,
                       truncation=True, return_tensors='pt')
    # Inference only: disable autograd so no computation graph is built and
    # kept alive for every call.
    with torch.no_grad():
        outputs = model(**inputs)
    # softmax is monotonic, so the argmax of the raw logits selects the same
    # class as the argmax of the probabilities.
    return torch.argmax(outputs.logits, dim=1).numpy()


Downloading:   0%|          | 0.00/499 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/943 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/679M [00:00<?, ?B/s]

In [33]:
# Converting from nums to something nice:
#   `pred_labels` decodes the class index returned by the model,
#   `labels` decodes the dataset's `isPositive` flag.
pred_labels = dict(enumerate(('neutral', 'positive', 'negative')))
labels = dict(enumerate(('negative', 'positive')))

In [None]:
# checking accuracy on 'train' subset
# Collect one record per row and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration.
train_records = []
for text, is_positive in zip(df_train['text'], df_train['isPositive']):
    pred = predict(text)

    pred_label = pred_labels[pred[0]]
    actual_label = labels[is_positive]

    train_records.append({'label': actual_label, 'prediction': pred_label})

# Explicit columns keep the frame well-formed even if df_train is empty.
preds = pd.DataFrame(train_records, columns=['label', 'prediction'])

In [59]:
# Fold the model's extra 'neutral' class into 'negative' so the predictions
# use the same two classes as the labels.
preds = preds.replace(to_replace='neutral', value='negative')

In [64]:
# Accuracy in percent: share of rows whose prediction equals the label.
sum(preds['label'].eq(preds['prediction'])) / len(preds) * 100

98.2981220657277

Alright, 98% accuracy, good enough I guess?

In [None]:
# making predictions
# No accuracy is tallied here: this loop reads no label for the test rows
# (the only `actual_label` in the notebook is a leftover from the training
# loop, so comparing against it would be meaningless).
# Records are collected in a list and the frame is built once, because
# DataFrame.append was removed in pandas 2.0.
test_records = []
for text in df_test['text']:
    pred = predict(text)

    pred_label = pred_labels[pred[0]]

    test_records.append({'prediction': pred_label})

preds_test = pd.DataFrame(test_records, columns=['prediction'])

In [87]:
# Inspect the predicted labels for the test set.
preds_test['prediction']

0      negative
1      positive
2      positive
3      negative
4      positive
         ...   
995    negative
996    positive
997    negative
998     neutral
999    negative
Name: prediction, Length: 1000, dtype: object

In [91]:
# I do not agree with the idea of not using 'neutral', but what can I say...
#   changing the human-readable labels into numbers
#   and committing to the idea "if it's not positive, it's negative"
preds_test = preds_test.replace({'negative': 0, 'neutral': 0, 'positive': 1})

In [92]:
# Attach the numeric predictions to the test frame as its `isPositive` column
# (the assignment aligns on the row index), then display the result.
df_test['isPositive'] = preds_test['prediction']
df_test

Unnamed: 0,text,isPositive
0,Добрый день! Я являюсь клиентом Тинькофф банк ...,0
1,Хочу выразить огромную благодарность банку Тин...,1
2,Выражаю благодарность К-ву Александру В. за ст...,1
3,В январе 2019 года оформила потребительский кр...,0
4,Добрый день. Хочу поблагодарить банк Тинькофф ...,1
...,...,...
995,Приветствую! 18.02 хотел совершить несколько п...,0
996,Сотрудник Шахноза (6904552) очень грамотно и б...,1
997,"Добрый день!Я являюсь клиентом банка, теперь у...",0
998,Столкнулись с проблемой: нужно было провести п...,0


In [94]:
# Save the labelled test set; index=False leaves the row index out of the file.
df_test.to_csv('predictions_nlp.csv', index = False)

## What else could have been used?

Well  
Transformers have been showing SOTA results in NLP for some time *(and in CV, for that matter...)*; pretrained models are widely available and are crazily easy to use. Why would anyone use anything other than transformers for this task?  

Yeah, I need to show my expertise in this field, and that's the whole idea, I guess...  
Let me just say what could've been done to solve this problem (if today was like 2017):  

- `Rule based` approach, such as `Bag of words`: assigning a weight to each word and calculating the result *(kindergarten way)*
- `Basic machine learning models`, such as `Naive Bayes`: we can remove stop words, *vectorize* the text, construct a `DTM`, use some sort of `dimensionality reduction` algorithm and train the model. *(high school way)*
- `Deep learning`, such as using an `LSTM` - turn the text into word embeddings and train the model *(pre-2017 'good' way, I guess)*
- `Pre-trained transformers`, like the basic Transformer, BERT (or its variations), ERNIE, etc. It would be a bad idea to train a transformer-like model from scratch yourself, so we might want to fine-tune an already existing one. But do we really need to, if we already have models with 98% accuracy? And even if we want to - we'd have to use already existing things (tokenizers). So what's the purpose?

## Anyway, fine-tuning transformers, an example

My Colab is in full use for 95% of the time Google allocates to me, so I can't afford such a journey.

In [None]:
from datasets import load_dataset
# Load the "imdb" dataset through the `datasets` library.
imdb = load_dataset("imdb")

In [None]:
# Shuffle each split with a fixed seed, then keep the first 3000 training
# examples and the first 300 test examples.
small_train_dataset = imdb["train"].shuffle(seed=42).select(range(3000))
small_test_dataset = imdb["test"].shuffle(seed=42).select(range(300))

In [None]:
from transformers import AutoTokenizer
# NOTE(review): this rebinds the global `tokenizer` that predict() reads, so
# calling predict() after this cell would tokenize with DistilBERT instead of
# the ruBERT tokenizer loaded earlier.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
def preprocess_function(examples):
    """Tokenize the "text" field of `examples` with truncation enabled.

    Uses the module-level `tokenizer` and returns whatever it produces.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)


# Tokenize both subsets in batched mode with the same preprocessing function.
tokenized_train, tokenized_test = (
    subset.map(preprocess_function, batched=True)
    for subset in (small_train_dataset, small_test_dataset)
)

In [None]:
from transformers import DataCollatorWithPadding
# Padding collator built from the tokenizer; handed to the Trainer below as
# its `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from transformers import AutoModelForSequenceClassification
# Classification model with two output labels, to be fine-tuned below.
# NOTE(review): this rebinds the global `model` that predict() reads, so
# predict() no longer uses the ruBERT sentiment model after this cell.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

In [None]:
import numpy as np
from datasets import load_metric


def compute_metrics(eval_pred):
    """Compute accuracy and binary F1 for a set of evaluation predictions.

    The metrics are computed directly with NumPy, so no metric script is
    loaded on every evaluation call.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is array-like with
            the class scores on the last axis; ``labels`` holds the reference
            class ids.

    Returns:
        A dict with float entries ``"accuracy"`` and ``"f1"``. F1 treats
        class ``1`` as the positive class and is ``0.0`` when there are no
        true positives, false positives or false negatives.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    references = np.asarray(labels)

    matches = predictions == references
    # Guard against an empty evaluation set (mean of nothing would be NaN).
    accuracy = float(matches.mean()) if matches.size else 0.0

    predicted_pos = predictions == 1
    actual_pos = references == 1
    true_pos = int(np.sum(predicted_pos & actual_pos))
    false_pos = int(np.sum(predicted_pos & ~actual_pos))
    false_neg = int(np.sum(~predicted_pos & actual_pos))

    # F1 = 2*TP / (2*TP + FP + FN); defined as 0.0 when the denominator is 0.
    denominator = 2 * true_pos + false_pos + false_neg
    f1 = 2 * true_pos / denominator if denominator else 0.0
    return {"accuracy": accuracy, "f1": f1}

In [None]:
from huggingface_hub import notebook_login
# Log in to the Hugging Face Hub from the notebook; the training
# configuration below sets push_to_hub=True.
notebook_login()

In [None]:
from transformers import TrainingArguments, Trainer

repo_name = "very-good-fine-tuned-model"

# Training configuration; `repo_name` is used as the output directory.
training_args = TrainingArguments(
    output_dir=repo_name,
    # optimisation
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    # batching
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # checkpointing / publishing
    save_strategy="epoch",
    push_to_hub=True,
)

# Wire the model, the tokenized subsets and the helpers defined above together.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

In [None]:
# Run the fine-tuning with the configuration defined above.
trainer.train()

In [None]:
# Evaluate the trained model; the Trainer was given `tokenized_test` as its
# eval_dataset and `compute_metrics` for scoring.
trainer.evaluate()

In [None]:
# Upload the result to the Hugging Face Hub.
trainer.push_to_hub()

In [None]:
from transformers import pipeline
 
# Build a pipeline from the "empyempt/very-good-fine-tuned-model" checkpoint
# and run it on two sample sentences.
sentiment_model = pipeline(model="empyempt/very-good-fine-tuned-model")
sentiment_model(["I love this move", "This movie sucks!"])

Ye, that's roughly it