# Predictions with vectors given by PolBert Transformer model from HuggingFace

In [None]:
import pandas as pd

In [None]:
from transformers import AutoModel, AutoTokenizer
import torch

# https://huggingface.co/dkleczek/bert-base-polish-cased-v1

# Checkpoint identifier passed to both loaders below (note: the *cased* variant).
model_name = "dkleczek/bert-base-polish-cased-v1"  # polbert
# Model and tokenizer are loaded from the same checkpoint name so they match.
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [None]:
# Sanity check: embed two example sentences and look at the resulting vectors.
sentences = [
    "To jest przykładowe zdanie.",
    "Za siedmioma górami, za siedmioma lasami mieszkał Tomisław Apoloniusz Curuś Bachleda Farell, jak ten piecyk z dmuchawą",
]

# Pad to the longest sentence in the batch and cut anything beyond 256 tokens.
tokenized_input = tokenizer(
    sentences,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=256,
)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    outputs = model(**tokenized_input)

# One vector per sentence: average the last hidden layer along dim 1.
# NOTE(review): the mean runs over every position, padded ones included
# (padding=True above, no attention-mask weighting) — confirm this is intended.
sentence_embeddings = outputs.last_hidden_state.mean(dim=1).numpy()

for i, sentence_embedding in enumerate(sentence_embeddings):
    print(f"Sentence {i + 1} embedding: {sentence_embedding}")

# Length of a single embedding vector (shown as the cell output).
len(sentence_embeddings[0])

In [None]:
# Load the training data from CSV and preview the first rows.
# NOTE(review): the file name suggests the data was oversampled and stemmed upstream — confirm.
df = pd.read_csv('oversample_stemmed_train_df.csv')
df.head(3)

In [None]:
# Set the raw and the preprocessed text columns aside; everything left in `df`
# is later binarised into the training targets.
texts, prep_texts = df['text'], df['prep_text']
df = df.drop(columns=['text', 'prep_text'])
df

In [None]:
def convert_to_binary(df, cols):
    """Binarise the selected columns against each row's own mean.

    For every row, the mean of the values in ``cols`` is computed; a cell
    becomes 1 when its value is greater than or equal to that row mean and
    0 otherwise.

    Args:
        df: DataFrame holding the numeric columns to binarise.
        cols: Column labels to include (also defines the output column order).

    Returns:
        DataFrame of integers (0/1) with the same index as ``df`` and one
        column per entry in ``cols``.
    """
    selected = df[cols]
    row_means = selected.mean(axis=1)
    # axis=0 aligns the per-row means with the frame's index, so each row is
    # compared with its own mean.
    return selected.ge(row_means, axis=0).astype(int)

In [None]:
# Training targets: every remaining column of `df`, binarised against its row mean.
y_train = convert_to_binary(df, df.columns)

In [None]:
list(texts)[:3]

In [None]:
# Run the encoder on GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Number of texts pushed through the encoder per forward pass. The chunk
# boundaries are derived from len(texts), so every row is embedded regardless
# of how many rows the training set has.
BATCH_SIZE = 460

# Per-chunk embedding frames, keyed 'part_0', 'part_1', ... in row order.
X_parts = {}

for i, start in enumerate(range(0, len(texts), BATCH_SIZE)):
    # .iloc makes the positional slicing explicit.
    batch_texts = list(texts.iloc[start:start + BATCH_SIZE])
    tokenized_input = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)

    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        outputs = model(**tokenized_input)

    # One vector per text: average the last hidden layer along dim 1.
    # NOTE(review): the mean includes padded positions, so a text's embedding
    # depends on the longest text in its chunk — consider attention-mask
    # weighting (and apply it to the test embeddings too).
    sentence_embeddings = outputs.last_hidden_state.mean(dim=1)
    sentence_embeddings = sentence_embeddings.cpu().numpy()

    X_parts['part_' + str(i)] = pd.DataFrame(sentence_embeddings)



In [None]:
# Stack the per-chunk embedding frames into a single frame with a fresh 0..n-1 index.
X_base = pd.concat(list(X_parts.values()), ignore_index=True)

In [None]:
X_base.head(3)

In [None]:
X_base.shape

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Fix the TensorFlow seed before the model is built.
tf.random.set_seed(2023)


# Number of input features = number of columns in the embedding frame.
INPUT_SHAPE = X_base.shape[1]


# Fully connected classifier on top of the sentence embeddings: five ReLU
# layers narrowing from 512 to 32 units, with dropout after the 1st, 3rd and
# 5th, and a sigmoid output layer.
# NOTE(review): the output width of 53 is hard-coded — presumably the number
# of label columns in y_train; verify.
model_base = Sequential([
    Dense(512, activation='relu', input_shape=(INPUT_SHAPE,)),
    Dropout(0.3),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(53, activation='sigmoid')
])


# Binary cross-entropy over the sigmoid outputs, optimised with Adam.
model_base.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

model_base.summary()

In [None]:
# Train on the raw-text embeddings, with validation_split=0.15 reserving part of the rows for validation.
# NOTE(review): the training file name suggests oversampled data; if rows are duplicated,
# the validation part may overlap with the training part — confirm.
result = model_base.fit(X_base, y_train, epochs=50, batch_size=32, validation_split=0.15)

In [None]:
test_df = pd.read_csv('test_df.csv')

# Keep the raw text separately; after dropping 'text' and 'date' the remaining
# columns are the ones binarised into the test targets.
test_texts = test_df['text']
test_df = test_df.drop(columns=['text', 'date'])
test_df.head(3)

In [None]:
# Run the encoder on GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# All test texts are encoded in a single batch.
tokenized_input = tokenizer(
    list(test_texts),
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=256,
).to(device)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    outputs = model(**tokenized_input)

# One vector per text: average the last hidden layer along dim 1, then move
# the result to host memory as a NumPy array.
# NOTE(review): the mean includes padded positions — confirm this is intended.
sentence_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()

X_test = pd.DataFrame(sentence_embeddings)

In [None]:
X_test.head(3)

In [None]:
# Column labels of the test frame; reused to label the prediction frames and to iterate per column.
cols = test_df.columns

In [None]:
# Test targets, binarised the same way as the training targets (row-mean threshold).
y_test = convert_to_binary(test_df, test_df.columns)

In [None]:
def get_attr_from_vector(vector, threshold=0.5):
    """Turn a vector of scores into a list of 0/1 flags.

    Args:
        vector: Iterable of numeric scores.
        threshold: Cut-off; a score greater than or equal to it maps to 1.

    Returns:
        List of ints (0 or 1), one per element of ``vector``.
    """
    return [int(elem >= threshold) for elem in vector]

In [None]:
# Predicted scores from the raw-text model, one row per test text.
y_pred = model_base.predict(X_test)

# Threshold each score row into 0/1 flags and label the columns like the test targets.
y_pred_cat = pd.DataFrame(
    [get_attr_from_vector(vector) for vector in y_pred],
    columns=cols,
)
y_pred_cat.head(3)

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score

# Per-sample evaluation of the raw-text model: each test row is scored as a
# binary vector over the label columns, then the scores are averaged.
total_ac = 0
total_rec = 0
total_prec = 0

# Score every test row instead of relying on a hard-coded row count.
size = len(y_test)

for i in range(size):
    tmp_pred = y_pred_cat.iloc[i]
    tmp_true = y_test.iloc[i]

    # scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, but
    # with the arguments the other way round recall and precision trade places.
    ac_score = accuracy_score(tmp_true, tmp_pred)
    rec_score = recall_score(tmp_true, tmp_pred)
    prec_score = precision_score(tmp_true, tmp_pred)

    total_ac += ac_score
    total_rec += rec_score
    total_prec += prec_score

    print(f"{i} - Accuracy: {ac_score} | Recall: {rec_score} | Precision: {prec_score}")

# Means are computed once, after every row has been accumulated.
bert_bigger_nn_base_accuracy = total_ac/size
bert_bigger_nn_base_recall = total_rec/size
bert_bigger_nn_base_precision = total_prec/size

# NOTE(review): the hard-coded baseline recall/precision figures compared
# against these values further down come from another notebook — confirm they
# were computed with the same (y_true, y_pred) argument order.
print(f"\nMean - Accuracy: {total_ac/size } | Recall: {total_rec/size} | Precision: {total_prec/size}")

In [None]:
# Per-label evaluation: accuracy of each label column over all test rows.
total_ac = 0

for col in cols:
  ac_score = accuracy_score(y_pred_cat[col], y_test[col])
  total_ac += ac_score
  print(f"Accuracy of predicting {col}: {ac_score}")

# Mean of the per-column accuracies.
# NOTE(review): this name is not referenced again in the visible part of the
# notebook; the comparison chart uses `bert_bigger_nn_base_accuracy` instead.
bert_bigger_nn_accuracy_base = total_ac/len(cols)

print(f"\nMean accuracy in test dataset: {total_ac/len(cols)}")

# Same with preprocessed texts

In [None]:
# Run the encoder on GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Number of texts pushed through the encoder per forward pass. The chunk
# boundaries are derived from len(prep_texts), so every row is embedded
# regardless of how many rows the training set has.
BATCH_SIZE = 460

# Per-chunk embedding frames, keyed 'part_0', 'part_1', ... in row order.
X_parts = {}

for i, start in enumerate(range(0, len(prep_texts), BATCH_SIZE)):
    # .iloc makes the positional slicing explicit.
    batch_texts = list(prep_texts.iloc[start:start + BATCH_SIZE])
    tokenized_input = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)

    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        outputs = model(**tokenized_input)

    # One vector per text: average the last hidden layer along dim 1.
    # NOTE(review): the mean includes padded positions, so a text's embedding
    # depends on the longest text in its chunk — consider attention-mask
    # weighting (and apply it to the test embeddings too).
    sentence_embeddings = outputs.last_hidden_state.mean(dim=1)
    sentence_embeddings = sentence_embeddings.cpu().numpy()

    X_parts['part_' + str(i)] = pd.DataFrame(sentence_embeddings)

# Stack the chunks into one frame with a fresh 0..n-1 index.
X_prep = pd.concat(list(X_parts.values()), ignore_index=True)
X_prep.head(5)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Same seed as for the raw-text model, set again before this model is built.
tf.random.set_seed(2023)

# Number of input features = number of columns in the preprocessed-text embedding frame.
INPUT_SHAPE = X_prep.shape[1]

# Same layer stack as `model_base`: five ReLU layers narrowing from 512 to 32
# units with dropout in between, and a sigmoid output layer.
# NOTE(review): the output width of 53 is hard-coded — presumably the number
# of label columns in y_train; verify.
model_prep = Sequential([
    Dense(512, activation='relu', input_shape=(INPUT_SHAPE,)),
    Dropout(0.3),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(53, activation='sigmoid')
])


# Binary cross-entropy over the sigmoid outputs, optimised with Adam.
model_prep.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

model_prep.summary()

In [None]:
! pip install stop_words
! pip install pyMorfologik

In [None]:
import re
from string import punctuation
from stop_words import get_stop_words
from pyMorfologik import Morfologik
from pyMorfologik.parsing import ListParser
import string


# Parser and stemmer objects passed to `stemmer.stem(...)` inside preprocess_text.
parser = ListParser()
stemmer = Morfologik()

# Polish stop-word list used by preprocess_text to filter tokens.
stopwords_pl = get_stop_words("pl")


def preprocess_text(text):
    """Clean, stem and stop-word-filter a single text.

    Steps, in order: replace ASCII punctuation with spaces, remove digit
    runs, collapse whitespace, strip and lowercase; stem the result with the
    module-level ``stemmer``/``parser``; drop tokens found in
    ``stopwords_pl`` and empty tokens.

    Args:
        text: Raw input string.

    Returns:
        The remaining tokens joined with single spaces.
    """
    punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    cleaned = text.translate(punct_to_space)
    cleaned = re.sub(r'\d+', '', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip().lower()

    stems = stemmer.stem([cleaned], parser)

    tokens = []
    for entry in stems:
        # entry[0] is the original word, entry[1] a mapping whose first key is
        # taken as the stem; fall back to the word when the mapping is empty.
        candidates = list(entry[1].keys())
        tokens.append(candidates[0] if candidates else entry[0])

    kept = [token for token in tokens if token not in stopwords_pl and token != '']
    return " ".join(kept)

In [None]:
# Apply the same cleaning/stemming pipeline to every test text.
test_texts_prep = list(map(preprocess_text, test_texts))

In [None]:
test_texts_prep[:3]

In [None]:
# Encode all preprocessed test texts in a single batch (`device` comes from an earlier cell).
tokenized_input = tokenizer(
    test_texts_prep,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=256,
).to(device)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    outputs = model(**tokenized_input)

# One vector per text: average the last hidden layer along dim 1, then move
# the result to host memory as a NumPy array.
# NOTE(review): the mean includes padded positions — confirm this is intended.
sentence_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()

X_test_prep = pd.DataFrame(sentence_embeddings)

In [None]:
X_test_prep.head(3)

In [None]:
# Train the second model on the preprocessed-text embeddings with the same settings
# as the raw-text run; this overwrites the earlier `result`.
result = model_prep.fit(X_prep, y_train, epochs=50, batch_size=32, validation_split=0.15)

In [None]:
# Predicted scores from the preprocessed-text model, one row per test text.
y_pred = model_prep.predict(X_test_prep)

# Threshold each score row into 0/1 flags and label the columns like the test targets.
y_pred_cat = pd.DataFrame(
    [get_attr_from_vector(vector) for vector in y_pred],
    columns=cols,
)
y_pred_cat.head(3)

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score

# Per-sample evaluation of the preprocessed-text model: each test row is
# scored as a binary vector over the label columns, then the scores are averaged.
total_ac = 0
total_rec = 0
total_prec = 0

# Score every test row instead of relying on a hard-coded row count.
size = len(y_test)

for i in range(size):
    tmp_pred = y_pred_cat.iloc[i]
    tmp_true = y_test.iloc[i]

    # scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, but
    # with the arguments the other way round recall and precision trade places.
    ac_score = accuracy_score(tmp_true, tmp_pred)
    rec_score = recall_score(tmp_true, tmp_pred)
    prec_score = precision_score(tmp_true, tmp_pred)

    total_ac += ac_score
    total_rec += rec_score
    total_prec += prec_score

    print(f"{i} - Accuracy: {ac_score} | Recall: {rec_score} | Precision: {prec_score}")

# Means are computed once, after every row has been accumulated.
bert_bigger_nn_prep_accuracy = total_ac/size
bert_bigger_nn_prep_recall = total_rec/size
bert_bigger_nn_prep_precision = total_prec/size

# NOTE(review): the hard-coded baseline recall/precision figures compared
# against these values further down come from another notebook — confirm they
# were computed with the same (y_true, y_pred) argument order.
print(f"\nMean - Accuracy: {total_ac/size } | Recall: {total_rec/size} | Precision: {total_prec/size}")

In [None]:
# Per-label evaluation of the preprocessed-text model: accuracy of each label column over all test rows.
total_ac = 0

for col in cols:
  ac_score = accuracy_score(y_pred_cat[col], y_test[col])
  total_ac += ac_score
  print(f"Accuracy of predicting {col}: {ac_score}")

# Mean of the per-column accuracies.
# NOTE(review): this name is not referenced again in the visible part of the
# notebook; the comparison chart uses `bert_bigger_nn_prep_accuracy` instead.
bert_bigger_nn_accuracy_prep = total_ac/len(cols)

print(f"\nMean accuracy in test dataset: {total_ac/len(cols)}")

In [None]:
# Accuracy per tested approach. The first four values are copied from the
# first_models notebook; the last two are the Polbert results computed above.
# NOTE(review): the two tfidf_*_nn accuracies are identical to the last digit —
# confirm this is not a copy-paste slip.
tested_options_acc = {
    'doc2vec_simple_nn_accuracy': 0.6690856313497823,
    'tfidf_simple_nn_accuracy': 0.7097242380261248,
    'tfidf_bigger_nn_accuracy': 0.7097242380261248,
    'tfidf_gb_accuracy': 0.6850507982583457,
    'bert_bigger_nn_base_accuracy': bert_bigger_nn_base_accuracy,
    'bert_bigger_nn_prep_accuracy': bert_bigger_nn_prep_accuracy,
}

In [None]:
tested_options_acc

In [None]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# Bar labels, listed in the same order as the entries of the metric dictionaries.
keys = [
    'doc2vec,\nmała sieć\nneuronowa',
    'TF-IDF,\nmała sieć\nneuronowa',
    'TF-IDF,\nwiększa sieć\nneuronowa',
    'TF-IDF,\nGradientBoosting\nwiele klasyfikatorów\n',
    'Polbert,\nbez wstępnego\nprzetworzenia,\nwiększa sieć\nneuronowa',
    'Polbert,\nze wstępnym\nprzetworzeniem,\nwiększa sieć\nneuronowa',
]

# Fractions -> percentages.
values = [val*100 for val in tested_options_acc.values()]

plt.figure(figsize=(14, 6))
# First four bars (earlier approaches) in gray, the two Polbert bars in teal.
bars = plt.bar(keys, values, color=['gray'] * 4 + ['teal'] * 2)

# Print each bar's rounded value just above its top edge.
for bar in bars:
    yval = bar.get_height()
    plt.text(
        bar.get_x() + bar.get_width()/2,
        yval + 0.5,
        round(yval, 2),
        ha='center',
        va='bottom',
    )

plt.xlabel('Model')
plt.ylabel('Dokładność (Accuracy) na zbiorze testowym [%]')
plt.title('Dokładność (Accuracy) predykcji dokonanych za pomocą różnych modeli,\nwraz z modelem Polbert wykorzystującym architekturę transformer')

plt.tight_layout(pad=1)
# Save before show() so the written file contains the rendered figure.
plt.savefig('ml_classifier_accuracy_polbert.png')
plt.show()

In [None]:
# Recall per tested approach: four hard-coded baseline values followed by the
# two Polbert results computed above.
tested_options_rec = {
    'doc2vec_simple_nn_recall': 0.7346125849668288,
    'tfidf_simple_nn_recall': 0.7766475638498401,
    'tfidf_bigger_nn_recall': 0.7787785121376258,
    'tfidf_gb_recall': 0.7372439629118616,
    'bert_bigger_nn_base_recall': bert_bigger_nn_base_recall,
    'bert_bigger_nn_prep_recall': bert_bigger_nn_prep_recall,
}

tested_options_rec

In [None]:
# Fractions -> percentages; `keys` and `plt` come from the accuracy-chart cell.
values = [val*100 for val in tested_options_rec.values()]

plt.figure(figsize=(14, 6))
# First four bars (earlier approaches) in gray, the two Polbert bars in teal.
bars = plt.bar(keys, values, color=['gray'] * 4 + ['teal'] * 2)

# Print each bar's rounded value just above its top edge.
for bar in bars:
    yval = bar.get_height()
    plt.text(
        bar.get_x() + bar.get_width()/2,
        yval + 0.5,
        round(yval, 2),
        ha='center',
        va='bottom',
    )

plt.xlabel('Model')
plt.ylabel('Czułość (Recall) na zbiorze testowym [%]')
plt.title('Czułość (Recall) predykcji dokonanych za pomocą różnych modeli')

plt.tight_layout(pad=1)
plt.savefig('ml_classifier_recall_polbert.png')
plt.show()

In [None]:
# Precision per tested approach: four hard-coded baseline values followed by
# the two Polbert results computed above.
tested_options_prec = {
    'doc2vec_simple_nn_precision': 0.5523979500769713,
    'tfidf_simple_nn_precision': 0.6287660676456441,
    'tfidf_bigger_nn_precision': 0.6074554151256809,
    'tfidf_gb_precision': 0.5966024220044214,
    'bert_bigger_nn_base_precision': bert_bigger_nn_base_precision,
    'bert_bigger_nn_prep_precision': bert_bigger_nn_prep_precision,
}

tested_options_prec

In [None]:
# Fractions -> percentages; `keys` and `plt` come from the accuracy-chart cell.
values = [val*100 for val in tested_options_prec.values()]

plt.figure(figsize=(14, 6))
# First four bars (earlier approaches) in gray, the two Polbert bars in teal.
bars = plt.bar(keys, values, color=['gray'] * 4 + ['teal'] * 2)

# Print each bar's rounded value just above its top edge.
for bar in bars:
    yval = bar.get_height()
    plt.text(
        bar.get_x() + bar.get_width()/2,
        yval + 0.5,
        round(yval, 2),
        ha='center',
        va='bottom',
    )

plt.xlabel('Model')
plt.ylabel('Precyzja (Precision) na zbiorze testowym [%]')
plt.title('Precyzja (Precision) predykcji dokonanych za pomocą różnych modeli')

plt.tight_layout(pad=1)
plt.savefig('ml_classifier_precision_polbert.png')
plt.show()