# Setup
---

In [1]:
import numpy as np
import sqlite3
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from stop_words import get_stop_words
import shap
import warnings

warnings.filterwarnings("ignore")

In [2]:
romanian_texts = {}
moldavian_texts = {}

conn = sqlite3.connect('news.db')
c = conn.cursor()

c.execute('SELECT * FROM romania')
rows = c.fetchall()
for row in rows:
    if row[4] not in romanian_texts:
        romanian_texts[row[4]] = []
    romanian_texts[row[4]].append(row[5])
    
c.execute('SELECT * FROM moldova WHERE newspaper != "zugo"')
rows = c.fetchall()
for row in rows:
    text = ''
    if len(row[5]) > 10000:
        text = row[5][:10000]
    else:
        text = row[5]
    if row[4] not in moldavian_texts:
        moldavian_texts[row[4]] = []
        
    moldavian_texts[row[4]].append(text)

conn.close()

In [3]:
all_texts = {"romana": [], "moldova": []}

for key in romanian_texts:
    all_texts["romana"].extend(romanian_texts[key])

    
for key in moldavian_texts:
    all_texts["moldova"].extend(moldavian_texts[key])
    

In [4]:
# De aici: https://en.wiktionary.org/wiki/Category:Romanian_prefixes
romanian_prefixes = [
    # A
    "agro", "alt", "ante", "anti", "aorto", "arhi", "astro",

    # B
    "balano",

    # C
    "cardio", "carpo", "cosmo",

    # D
    "demono", "des", "dez",

    # F
    "franco",

    # G
    "gastro", "germano", "greco",

    # H
    "hecto", "hiper",

    # I
    "în",

    # K
    "kilo",

    # L
    "lexico",

    # M
    "mili", "muzico",

    # N
    "nano", "ne",

    # O
    "ori", "ornito",

    # P
    "pneumo", "pre", "prea", "proto", "pseudo", "psiho",

    # R
    "răs", "re", "rino", "ruso",

    # S
    "stră", "sub",

    # T
    "tehno", "teo", "termo",

    # V
    "vice"
]


def replace_i_prefix(word, prefixes):
  for prefix in prefixes:
    try:
      if word.lower().startswith(prefix) and len(word) > len(prefix) and word[len(prefix):][0] in ["î", "Î"]:
        first_letter = word[len(prefix):][0]
        first_letter = "i" if first_letter == "î" else ("I" if first_letter == "Î" else first_letter)
        word = prefix + first_letter + word[len(prefix) + 1:]

    except:
      print(word)
    
  word = word.replace("î", "a").replace("Î", "A")

  return word

def no_diacritics(text, prefixes):

  text = replace_i_prefix(text, prefixes)


  text = text.replace("â", "i")
  text = text.replace("Â", "I")
  text = text.replace("ș", "s")
  text = text.replace("ş", "s")
  text = text.replace("Ș", "S")
  text = text.replace("Ş", "S")
  text = text.replace("ț", "t")
  text = text.replace("ţ", "t")
  text = text.replace("Ț", "T")
  text = text.replace("Ţ", "T")

  # If î is the first letter of the word, replace it with i
  if text.startswith("î"):
    text = text.replace("î", "i")
  if text.startswith("Î"):
    text = text.replace("Î", "I")
  # If the last letter of the word is î, replace it with i
  if text.endswith("î"):
    text = text.replace("î", "i")
  if text.endswith("Î"):
    text = text.replace("Î", "I")
  # Else replace î with a
  if "î" in text:
    text = text.replace("î", "a")     
  # text = text.replace("î", "i")
  # text = text.replace("Î", "I")
  text = text.replace("ă", "a")
  text = text.replace("Ă", "A")

  return text


for key in moldavian_texts:
    for i in range(len(moldavian_texts[key])):
        moldavian_texts[key][i] = no_diacritics(moldavian_texts[key][i], romanian_prefixes)

for key in romanian_texts:
    for i in range(len(romanian_texts[key])):
        romanian_texts[key][i] = no_diacritics(romanian_texts[key][i], romanian_prefixes)

print(moldavian_texts["Sport"][0])
print(romanian_texts['Stiri'][12])

# print(no_diacritics("cîțiva", romanian_prefixes))

 Real Madrid a castigat Supercupa Spaniei, dupa ce in finala a invins-o pe Barcelona, scor 4-1.Madrilenii au avut un start de meci excelent, cu doua goluri marcate in doar trei minute de Vinicius. Brazilianul a reusit hat-trick-ul in minutul 39, dupa ce a transformat un penalty. Partida din Supercupa Spaniei este cel de-al 15-lea "El Clasico" pentru Vinicius. Brazilianul a reusit o performanta importanta pentru cariera sa. Starul lui Real Madrid este cel de-al 16-lea jucator din istorie care a marcat de trei ori intr-un "El Clasico". Pe aceasta lista se mai afla: Jaime Lazcano, Joan Ramon i Pera, Ventora, Jesus Narro, Cesar, Evaristo de Macedo, Amancio, Ferenc Puskas, Ivan Zamorano, Fernando Sanudo, Gary Lineker, Romario, Luis Suarez, Karim Benzema si Lionel Messi. Este important de precizat ca Messi este singurul fotbalist dintre cei enumerati care a reusit aceasta performanta de doua ori in cariera, potrivit Fanatik. Partida din Supercupa Spaniei este cel de-al 15-lea "El Clasico" pe

In [1]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

True
NVIDIA GeForce RTX 3060


---

# Model building
---

According to the study, we have a model with a RoBERT base generating representations for an array of tokens some of them being masked. This first step is followed by a geolocation prediction that feeds the representations into a classification head that predicts the location of the text. 

In [6]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, BertTokenizer, BertModel

# https://github.com/valentinhofmann/geoadaptation/blob/main/src/model_geoadaptation.py
class Classifier(nn.Module):
    def __init__(self, output_dim, config):
        super(Classifier, self).__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.out_proj = nn.Linear(config.hidden_size, output_dim)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.gelu = nn.GELU()

    def forward(self, x):
        x = x[:, 0, :]  # Use [CLS] token's representation
        x = self.dropout(x)
        x = self.dense(x)
        x = self.gelu(x)
        x = self.dropout(x)
        x = self.out_proj(x)  # Output logits for each class (Romania, Moldova)
        return x

        

class GeoadaptedRobert(nn.Module):
    def __init__(self, head, model_name="readerbench/RoBERT-large"):
        super(GeoadaptedRobert, self).__init__()
        self.robert = AutoModel.from_pretrained(model_name)
        
        # Geolocation Head: 2 values, one for each class (Romania, Moldova)
        self.geo_head = Classifier(output_dim=2, config=self.robert.config)

        # MLM head (assuming you already have a classifier head for MLM)
        self.cls = nn.Linear(self.robert.config.hidden_size, self.robert.config.vocab_size)

        self.head = head

    def forward(self, input_ids, head, attention_mask=None, token_type_ids=None, labels=None, points=None, val=None):
        # Forward pass through the RoBERT model
        outputs = self.robert(input_ids=input_ids, 
                              attention_mask=attention_mask,
                              token_type_ids=token_type_ids)
        
        # Get the hidden states from the last layer
        sequence_output = outputs.last_hidden_state

        # MLM Loss
        scores = self.cls(sequence_output)
        self.loss_fct = nn.CrossEntropyLoss()
        mlm_loss = self.loss_fct(scores.view(-1, scores.size(-1)), labels.view(-1))

        # Geolocation Classification Loss
        geo_logits = self.geo_head(sequence_output)
        if head == 'masked':
            geo_logits = geo_logits[labels != -100]
        self.geo_loss_fct = nn.CrossEntropyLoss()
        geo_loss = self.geo_loss_fct(geo_logits, points)  # 'points' should be class labels (0 for Moldova, 1 for Romania)
        preds = torch.argmax(geo_logits, dim=-1)

        return mlm_loss, geo_loss, preds

        

---

# Training

---

In [7]:
# Collator

# https://github.com/valentinhofmann/geoadaptation/blob/main/src/helpers_geoadaptation.py#L25
class MLMCollator:
    def __init__(self, tokenizer, head, probab = 0.15):
        self.tok = tokenizer
        self.mlm_probability = probab
        self.head = head

    def __call__(self, batch):
        texts = [self.tok.encode(text, padding = True, truncation = True) for text, _ in batch]
        points = torch.tensor([point for _, point in batch], dtype=torch.float32)
        batch_size = len(texts)
        max_len = max(len(text) for text in texts)
        input_ids = torch.zeros(batch_size, max_len).long()
        attention_mask = torch.zeros(batch_size, max_len).long()
        token_type_ids = torch.zeros(batch_size, max_len).long()
        for i, text in enumerate(texts):
            input_ids[i, :len(text)] = torch.tensor(text)
            attention_mask[i, :len(text)] = 1
        batch_tensors = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids
        }

        # https://github.com/huggingface/transformers/blob/master/src/transformers/data/data_collator.py
        mlm_labels = batch_tensors['input_ids'].clone()
        probability_matrix = torch.full(mlm_labels.shape, self.mlm_probability)
        special_tokens_mask = [
            self.tok.get_special_tokens_mask(val, already_has_special_tokens=True) for val in mlm_labels.tolist()
        ]
        special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
        probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
        masked_indices = torch.bernoulli(probability_matrix).bool()
        mlm_labels[~masked_indices] = -100
        indices_replaced = torch.bernoulli(torch.full(mlm_labels.shape, 0.8)).bool() & masked_indices
        batch_tensors['input_ids'][indices_replaced] = self.tok.convert_tokens_to_ids(self.tok.mask_token)
        indices_random = torch.bernoulli(torch.full(mlm_labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(self.tok), mlm_labels.shape, dtype=torch.long)
        batch_tensors['input_ids'][indices_random] = random_words[indices_random]

        # Repeat points for masked tokens
        if self.head == 'masked':
            n_masks = masked_indices.sum(axis=-1)
            points = torch.repeat_interleave(points, n_masks, dim=0)
        # print(type(batch_tensors), type(mlm_labels), type(points))
        return batch_tensors, mlm_labels, points
        

In [8]:
from sklearn.preprocessing import StandardScaler


class data:
    def __init__(self, texts, points):
        self.texts = texts
        self.points = points

moldavian_texts = [text for text in all_texts["moldova"]]
moldavian_points = np.zeros((len(moldavian_texts), 2))
moldavian_points[:, 0] = 1

romanian_texts = [text for text in all_texts["romana"]]
romanian_points = np.zeros((len(romanian_texts), 2))
romanian_points[:, 1] = 1

class DatasetForMaskedLM:
    def __init__(self, data, scaler = None):
        self.texts = data.texts
        if scaler is None:
            self.scaler = StandardScaler()
            self.points = self.scaler.fit_transform(data.points)
        else:
            self.scaler = scaler
            self.points = self.scaler.transform(data.points)

    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        return self.texts[idx], self.points[idx]

# Slect random texts for training and testing
moldavian_texts_train, moldavian_texts_test, moldavian_points_train, moldavian_points_test = train_test_split(moldavian_texts, moldavian_points, test_size=0.2, random_state=42)
romanian_texts_train, romanian_texts_test, romanian_points_train, romanian_points_test = train_test_split(romanian_texts, romanian_points, test_size=0.2, random_state=42)

moldavian_texts_val = moldavian_texts_train[:len(moldavian_texts_train) // 10]
moldavian_points_val = moldavian_points_train[:len(moldavian_texts_train) // 10]

romanian_texts_val = romanian_texts_train[:len(romanian_texts_train) // 10]
romanian_points_val = romanian_points_train[:len(romanian_texts_train) // 10]

# Select a random text and its corresponding label
moldavian_text, moldavian_point = moldavian_texts_train[np.random.randint(len(moldavian_texts_train))], moldavian_points_train[np.random.randint(len(moldavian_points_train))]
romanian_text, romanian_point = romanian_texts_train[np.random.randint(len(romanian_texts_train))], romanian_points_train[np.random.randint(len(romanian_points_train))]
print(moldavian_text[:10], moldavian_point)
print(romanian_text[:10], romanian_point)

# Create the data objects
romanian_data_train = data(romanian_texts_train, romanian_points_train)
moldavian_data_train = data(moldavian_texts_train, moldavian_points_train)

romanian_data_val = data(romanian_texts_val, romanian_points_val)
moldavian_data_val = data(moldavian_texts_val, moldavian_points_val)

romanian_data_test = data(romanian_texts_test, romanian_points_test)
moldavian_data_test = data(moldavian_texts_test, moldavian_points_test)

# Create the datasets
romanian_dataset_train = DatasetForMaskedLM(romanian_data_train)
moldavian_dataset_train = DatasetForMaskedLM(moldavian_data_train)

romanian_dataset_val = DatasetForMaskedLM(romanian_data_val, romanian_dataset_train.scaler)
moldavian_dataset_val = DatasetForMaskedLM(moldavian_data_val, romanian_dataset_train.scaler)

romanian_dataset_test = DatasetForMaskedLM(romanian_data_test, romanian_dataset_train.scaler)
moldavian_dataset_test = DatasetForMaskedLM(moldavian_data_test, romanian_dataset_train.scaler)

print(len(romanian_texts_train), len(romanian_texts_val), len(romanian_texts_test))
print(len(romanian_points_train), len(romanian_points_val), len(romanian_points_test))

print(len(moldavian_texts_train), len(moldavian_texts_val), len(moldavian_texts_test))  
print(len(moldavian_points_train), len(moldavian_points_val), len(moldavian_points_test))



 Procuratu [1. 0.]
 Meciul de [0. 1.]
814 81 204
814 81 204
2520 252 631
2520 252 631


In [9]:
from torch.utils.data import DataLoader
# Import DatasetForMaskedLM



tokenizer = AutoTokenizer.from_pretrained("readerbench/RoBERT-large")

collator = MLMCollator(tokenizer, head='masked', probab=0.15)

train_loader = DataLoader(romanian_dataset_train, batch_size=32, shuffle=True, collate_fn=collator)
val_loader = DataLoader(romanian_dataset_val, batch_size=32, shuffle=False, collate_fn=collator)
test_loader = DataLoader(romanian_dataset_test, batch_size=32, shuffle=False, collate_fn=collator)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = GeoadaptedRobert(head='masked', model_name="readerbench/RoBERT-large").to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
warmup_steps = 3 * len(train_loader)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: min(step / warmup_steps, 1.0))

In [10]:
# Get the first batch of the train loader
batch = next(iter(train_loader))
input_ids, mlm_labels, points = batch


# Print the shape of the input_ids tensor
print(input_ids['input_ids'].shape)
print(mlm_labels.shape)
print(points.shape)

torch.Size([32, 512])
torch.Size([32, 512])
torch.Size([2275, 2])


In [11]:
import torch
torch.cuda.empty_cache()


In [16]:
import random

mtl = False

random.seed(123)
np.random.seed(123)
torch.manual_seed(123)
for epoch in range(1, 10 + 1):
    print('Train model...')
    model.train()
    for i, (batch_tensors, mlm_labels, points) in enumerate(train_loader):
        input_ids = batch_tensors['input_ids'].to(device)
        attention_mask = batch_tensors['attention_mask'].to(device)
        token_type_ids = batch_tensors['token_type_ids'].to(device)
        mlm_labels = mlm_labels.to(device)
        # print(input_ids.shape)
        # print(attention_mask.shape)
        # print(token_type_ids.shape)
        # print(mlm_labels.shape)
        vocab_size = tokenizer.vocab_size
        if torch.max(input_ids) >= vocab_size:
            print("Error: input_ids contains out-of-bounds token index.")
        if torch.max(mlm_labels) >= vocab_size:
            print("Error: mlm_labels contains out-of-bounds token index.")
        print("Input IDs shape:", input_ids.shape)
        print("Attention mask shape:", attention_mask.shape)
        print("Token type IDs shape:", token_type_ids.shape)
        print("MLM labels shape:", mlm_labels.shape)


        break

        if i == 0:
            print(input_ids[0, :])
        points = points.to(device)
        optimizer.zero_grad()
        mlm_loss, geo_loss, preds = model(
            input_ids,
            attention_mask,
            token_type_ids,
            mlm_labels,
            points,
            False
        )
        if mtl:
            loss = mlm_loss + geo_loss
            loss.backward()
        else:
            mlm_loss.backward()
        optimizer.step()
        scheduler.step()


Train model...
Input IDs shape: torch.Size([32, 512])
Attention mask shape: torch.Size([32, 512])
Token type IDs shape: torch.Size([32, 512])
MLM labels shape: torch.Size([32, 512])
Train model...
Input IDs shape: torch.Size([32, 512])
Attention mask shape: torch.Size([32, 512])
Token type IDs shape: torch.Size([32, 512])
MLM labels shape: torch.Size([32, 512])
Train model...
Input IDs shape: torch.Size([32, 512])
Attention mask shape: torch.Size([32, 512])
Token type IDs shape: torch.Size([32, 512])
MLM labels shape: torch.Size([32, 512])
Train model...
Input IDs shape: torch.Size([32, 512])
Attention mask shape: torch.Size([32, 512])
Token type IDs shape: torch.Size([32, 512])
MLM labels shape: torch.Size([32, 512])
Train model...
Input IDs shape: torch.Size([32, 512])
Attention mask shape: torch.Size([32, 512])
Token type IDs shape: torch.Size([32, 512])
MLM labels shape: torch.Size([32, 512])
Train model...
Input IDs shape: torch.Size([32, 512])
Attention mask shape: torch.Size([32