# Dependencies

In [4]:
from transformers import RobertaModel, RobertaTokenizer, RobertaConfig
import json
import torch

In [5]:
# Read the problem document and split it into paragraphs, one per line.
# The encoding is pinned so the decoded text does not depend on the platform's
# default locale encoding (assumes the dataset files are UTF-8 - TODO confirm).
with open("./pan24-multi-author-analysis/medium/train/problem-21.txt", encoding="utf-8") as f:
    text = f.read()
# Strip surrounding whitespace first so a trailing newline does not yield an
# empty final paragraph.
paragraphs = text.strip().split("\n")
print("PARAGRAPHS")
print(f"Length: {len(paragraphs)}")
print(paragraphs)

PARAGRAPHS
Length: 8
["In general, be courteous to others. Debate/discuss/argue the merits of ideas, don't attack people. Personal insults, shill or troll accusations, hate speech, any suggestion or support of harm, violence, or death, and other rule violations can result in a permanent ban.", 'For those who have questions regarding any media outlets being posted on this subreddit, please click to review our details as to our approved domains list and outlet criteria.', 'I see no reason to hide behind euphemism: if you are LGBT+, conservatives want you dead, and laws like these are their way to push the boundaries to a point where they can do that legally and openly. They are a direct threat to your safety and well-being.', 'I am a bot, and this action was performed automatically. Please if you have any questions or concerns.', "This is what they've been brewing for a while. Notice how so many rank and file conservatives are all about stopping child grooming now and calling people pedo

In [6]:
# Load the ground-truth file that accompanies the problem document read above.
with open("./pan24-multi-author-analysis/medium/train/truth-problem-21.json", encoding="utf-8") as f:
    # Parse straight from the file handle instead of reading into a temporary string.
    paragraph_labels = json.load(f)
# Index the key directly: a truth file without "changes" then fails right here
# with a KeyError rather than later as a TypeError on len(None).
style_changes = paragraph_labels["changes"]
print("STYLE CHANGES")
print(f"Length: {len(style_changes)}")
print(style_changes)

STYLE CHANGES
Length: 7
[0, 1, 1, 1, 1, 1, 1]


# Load model

In [7]:
# Start from a default RoBERTa configuration object.
roberta_configuration = RobertaConfig()
# Build the model from that configuration. No checkpoint is referenced here,
# so no pretrained weights are loaded by this cell.
roberta_model = RobertaModel(roberta_configuration)
# Show the configuration attached to the model as the cell output.
roberta_model.config

RobertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.39.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 50265
}

In [8]:
# Load the tokenizer for the "FacebookAI/roberta-base" checkpoint.
# RobertaTokenizer is already imported in the dependencies cell at the top of
# the notebook, so it is not re-imported here.
tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base")
# Sanity check: display the token ids produced for a short string.
tokenizer("Hello world")["input_ids"]

[0, 31414, 232, 2]

In [9]:
from transformers import AutoTokenizer, RobertaForSequenceClassification

# Multi-label classification demo using the
# "cardiffnlp/twitter-roberta-base-emotion" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-emotion")
model = RobertaForSequenceClassification.from_pretrained(
    "cardiffnlp/twitter-roberta-base-emotion",
    problem_type="multi_label_classification",
)

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")

# Forward pass without gradient tracking; only the logits are needed here.
with torch.no_grad():
    logits = model(**inputs).logits

# Indices of every class whose sigmoid score is above 0.5.
predicted_class_ids = torch.arange(0, logits.shape[-1])[
    logits.sigmoid().squeeze(dim=0) > 0.5
]

# Reload the checkpoint, this time passing the number of labels explicitly
# (taken from the first model's id2label mapping).
num_labels = len(model.config.id2label)
model = RobertaForSequenceClassification.from_pretrained(
    "cardiffnlp/twitter-roberta-base-emotion",
    num_labels=num_labels,
    problem_type="multi_label_classification",
)

# One-hot encode each predicted class id and sum over them, giving one float
# multi-hot target row; feed it back as labels to obtain a loss value.
labels = (
    torch.nn.functional.one_hot(predicted_class_ids.unsqueeze(0).clone(), num_classes=num_labels)
    .sum(dim=1)
    .float()
)
loss = model(**inputs, labels=labels).loss

In [14]:
# BertTokenizer / BertModel are not imported by any earlier cell, so import
# them here; otherwise this cell raises NameError on a fresh kernel.
from transformers import BertModel, BertTokenizer

# Load tokenizer and model.
# NOTE: this rebinds `tokenizer` and `model`, which earlier cells bound to
# RoBERTa objects; the cells below use the BERT versions.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

In [15]:
def get_paragraph_embedding(paragraph):
    """Embed a paragraph with the module-level ``tokenizer`` and ``model``.

    Args:
        paragraph: Text to embed. A single string is the usage shown in this
            notebook. A list of strings can also be passed; it is padded to a
            common length so it can be encoded as one batch.

    Returns:
        torch.Tensor: The model's ``pooler_output`` averaged over dim 0 (the
        batch dimension). With a single string the batch presumably has one
        row, in which case the mean is just that row.
    """
    # truncation=True clips the input to the tokenizer's maximum length. Without
    # it, a paragraph longer than the model's position limit makes the forward
    # pass fail. Text beyond that limit is ignored by the embedding.
    # padding=True is a no-op for a single string and lets a list be batched.
    encoded_input = tokenizer(
        paragraph, return_tensors="pt", padding=True, truncation=True
    )

    # Inference only: no gradients are needed for the embedding.
    with torch.no_grad():
        pooled = model(**encoded_input).pooler_output

    return torch.mean(pooled, dim=0)

In [20]:
# Print each paragraph's embedding next to the style-change label for the
# boundary that precedes it.
# There is one label fewer than there are paragraphs (7 vs 8 for this problem),
# so label k presumably describes the boundary between paragraphs k and k+1.
for i, paragraph in enumerate(paragraphs):
    paragraph_embedding = get_paragraph_embedding(paragraph)
    print(paragraph_embedding)
    # The first paragraph has no preceding boundary. Indexing with i-1 there
    # would read style_changes[-1] and silently report the last label instead.
    style_change = style_changes[i - 1] if i > 0 else "N/A"
    print(f"Paragraph {i+1}, Style Change: {style_change}")

tensor([-0.8705, -0.4509, -0.9878,  0.8404,  0.9281, -0.2862,  0.6087,  0.2783,
        -0.9457, -1.0000, -0.7742,  0.9709,  0.9538,  0.7916,  0.6851, -0.7491,
        -0.5111, -0.6101,  0.3821,  0.3788,  0.4332,  1.0000, -0.5621,  0.4207,
         0.5837,  0.9932, -0.7330,  0.8175,  0.8789,  0.6845, -0.5731,  0.4676,
        -0.9828, -0.3146, -0.9877, -0.9848,  0.4979, -0.4029,  0.0503,  0.0470,
        -0.8252,  0.5351,  1.0000, -0.2317,  0.6106, -0.4730, -1.0000,  0.4301,
        -0.7413,  0.9808,  0.9570,  0.9815,  0.2474,  0.4899,  0.6483, -0.6484,
         0.1313,  0.2814, -0.3084, -0.6855, -0.6864,  0.4652, -0.9503, -0.8413,
         0.9842,  0.9516, -0.5024, -0.4644, -0.2749,  0.1477,  0.8046,  0.2639,
        -0.4755, -0.7244,  0.9141,  0.3947, -0.7196,  1.0000, -0.5519, -0.9345,
         0.9737,  0.9426,  0.6431, -0.7785,  0.8037, -1.0000,  0.5536, -0.3034,
        -0.9719,  0.4043,  0.6408, -0.3447,  0.9019,  0.7275, -0.7329, -0.6168,
        -0.3791, -0.9283, -0.4828, -0.49