In [None]:
!pip install transformers[sentencepiece]
!pip install tensorflow_addons

In [None]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from collections import Counter
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
from tensorflow.keras.losses import BinaryCrossentropy
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.optimizers import Adam
from keras.metrics import binary_accuracy
import tensorflow as tf
import re
import tensorflow_addons as tfa
import numpy as np

In [3]:
# Set TensorFlow's global random seed to a fixed value.
tf.random.set_seed(1234)

In [4]:
# Colab-only: mount Google Drive at /content/drive. Later cells read the label
# table and the model weights from /content/drive/MyDrive/LTP_data.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
def extract_labels(df_labels):
    """Turn a one-hot label table into per-row lists of active label names.

    For every row of ``df_labels`` the names of the columns whose value
    equals 1 are collected, in column order. The resulting list of lists is
    what the multi-label encoder is fitted on, so that numerical predictions
    can be converted back to their original label values.
    """
    return [
        [column for column, flag in record.items() if flag == 1]
        for _, record in df_labels.iterrows()
    ]

In [5]:
def predict(lm, model, text, labels, threshold=0.5):
    """Classify ``text`` with ``model`` and decode the predictions to label names.

    Args:
        lm: Name or path of the pretrained language model; used to load the
            matching tokenizer via ``AutoTokenizer.from_pretrained``.
        model: Model whose ``predict`` output exposes a ``"logits"`` entry.
        text: List of input strings to classify.
        labels: Collection of label lists (e.g. the output of
            ``extract_labels``) on which the ``MultiLabelBinarizer`` is fitted,
            so that indicator rows can be mapped back to label names.
        threshold: Sigmoid probability at or above which a label is included in
            the multi-label prediction. Defaults to 0.5, the cut-off that was
            previously hard-coded.

    Returns:
        A pair ``(multi_predictions, single_predictions)``, each the result of
        ``MultiLabelBinarizer.inverse_transform``: one entry per input text.
        ``multi_predictions`` holds every label whose probability reaches
        ``threshold`` (possibly none); ``single_predictions`` holds exactly the
        label with the highest probability.
    """
    print('Predicting on {} sentences:'.format(len(text)))
    # Selecting the correct tokenizer for the model.
    tokenizer = AutoTokenizer.from_pretrained(lm, use_fast=False)
    tok_text = tokenizer(text, padding=True, truncation=True, return_tensors="np").data

    # Fit the encoder on the known labels so its classes can be used for decoding.
    encoder = MultiLabelBinarizer()
    encoder.fit(labels)

    # Logits -> independent per-label probabilities. Convert to a NumPy array
    # once so the indexing below does not rely on implicit tensor conversion.
    pred = model.predict(tok_text)["logits"]
    prob = np.asarray(tf.nn.sigmoid(pred))

    # Multi-label prediction: every label at or above the threshold.
    multi_pred = np.zeros(prob.shape)
    multi_pred[prob >= threshold] = 1

    # Single-label prediction: only the highest-probability label per row.
    single_pred = np.zeros(prob.shape)
    single_pred[np.arange(prob.shape[0]), np.argmax(prob, axis=1)] = 1

    # Convert indicator rows back to the original label values.
    single_predictions = encoder.inverse_transform(single_pred)
    multi_predictions = encoder.inverse_transform(multi_pred)
    return multi_predictions, single_predictions

## Splitting comments into paragraphs (alternative to the unsplit variant below — run only one of the two)

In [35]:
df_task2 = pd.read_csv('top100threads.csv')

# Per comment: trim surrounding whitespace, delete links, then cut the text into
# paragraphs at blank lines (\n\n). Each cell now holds a list of paragraphs.
df_task2['comment_text'] = df_task2['comment_text'].map(
    lambda raw: re.sub(r'http\S+', '', str(raw).strip()).split('\n\n')
)
# One row per paragraph; every row keeps the comment_id of its parent comment.
df_task2 = df_task2.explode('comment_text')

# sub_comment_id is "<comment_id>_<n>", with n counting the paragraphs of a comment from 0.
paragraph_number = df_task2.groupby('comment_id').cumcount().astype(str)
df_task2['sub_comment_id'] = df_task2['comment_id'].astype(str) + '_' + paragraph_number

# Paragraph length measured in whitespace-separated words.
df_task2['comment_length'] = df_task2['comment_text'].map(lambda paragraph: len(paragraph.split()))

# Restrict and reorder to this column layout.
col_pos = ["thread_id","thread_title", "comment_id", "sub_comment_id","comment_text", "comment_length", "total_upvotes", "total_downvotes", "total_score", "author"]
df_task2 = df_task2.reindex(columns=col_pos)

## Using whole comments without splitting (original; this overwrites `df_task2` from the paragraph-splitting cell above)

In [16]:
df_task2 = pd.read_csv('top100threads.csv')

# Per comment: trim surrounding whitespace, delete links, and flatten every
# newline to a single space. Comments are kept whole here (no paragraph split).
df_task2['comment_text'] = df_task2['comment_text'].map(
    lambda raw: re.sub(r'http\S+', '', str(raw).strip()).replace('\n', ' ')
)

# Comment length measured in whitespace-separated words.
df_task2['comment_length'] = df_task2['comment_text'].map(lambda comment: len(comment.split()))

# Restrict and reorder to this column layout.
col_pos = ["thread_id","thread_title", "comment_id","comment_text", "comment_length", "total_upvotes", "total_downvotes", "total_score", "author"]
df_task2 = df_task2.reindex(columns=col_pos)

## Predicting with DeBERTa-large on the task 2 comments

In [36]:
# Plain Python list of the comment texts, in row order, as input for the tokenizer.
task2_comments = df_task2['comment_text'].tolist()

In [37]:
# Read the training label table from Drive and reduce it to one list of
# active label names per row.
labels = extract_labels(
    pd.read_table('/content/drive/MyDrive/LTP_data/labels-training.tsv')
)

In [38]:
# Instantiate DeBERTa-large as a multi-label sequence classifier with 20 outputs,
# then load weights from the .h5 file on Drive into it.
# NOTE(review): presumably the .h5 file holds the fine-tuned task weights and
# num_labels=20 equals the number of label classes in labels-training.tsv — confirm.
DEBERTA = TFAutoModelForSequenceClassification.from_pretrained('microsoft/deberta-large',
                                                                num_labels=20,
                                                                problem_type="multi_label_classification")
DEBERTA.load_weights('/content/drive/MyDrive/LTP_data/LTP_DEBERTA_weights.h5')

All model checkpoint layers were used when initializing TFDebertaForSequenceClassification.

Some layers of TFDebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-large and are newly initialized: ['classifier', 'pooler', 'cls_dropout']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Run inference on all task 2 comments. predict() returns the thresholded
# multi-label predictions and the single highest-probability label per comment.
multi_pred, top_prob_pred = predict('microsoft/deberta-large', DEBERTA, task2_comments, labels)

In [43]:
# Attach both prediction lists as new columns of df_task2.
# NOTE(review): assumes df_task2 still has the same rows, in the same order, as
# when task2_comments was built — re-run that cell if df_task2 was reloaded.
df_task2['label_prediction'] = multi_pred
df_task2['highest_prob_label'] = top_prob_pred

In [44]:
# Write the annotated comments to a CSV file in the current working directory.
df_task2.to_csv('DEBERTA_results.csv')

In [None]:
# Display the resulting DataFrame, including the two prediction columns.
df_task2