# Setup
---

In [1]:
import numpy as np
import sqlite3
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from stop_words import get_stop_words
import shap
import warnings

# Silence every warning for the rest of the kernel session: no category or module
# filter is given, so this applies to all warnings raised after this cell runs.
warnings.filterwarnings("ignore")

In [2]:
# Article texts grouped by the value in column index 4 of each row
# (presumably the newspaper name -- confirm against the table schema).
romanian_texts = {}
moldavian_texts = {}

# Moldavian texts are cut to this many characters; Romanian texts are stored whole.
MAX_MOLDAVIAN_CHARS = 10000

conn = sqlite3.connect('news.db')
try:
    c = conn.cursor()

    c.execute('SELECT * FROM romania')
    rows = c.fetchall()
    for row in rows:
        # row[4] is the grouping key, row[5] the article text.
        romanian_texts.setdefault(row[4], []).append(row[5])

    # The excluded newspaper is bound as a parameter: in SQL, double quotes denote
    # identifiers, and SQLite only treats "zugo" as a string through a legacy
    # fallback that can be disabled at build time.
    c.execute('SELECT * FROM moldova WHERE newspaper != ?', ('zugo',))
    rows = c.fetchall()
    for row in rows:
        # Slicing leaves texts shorter than the limit untouched.
        moldavian_texts.setdefault(row[4], []).append(row[5][:MAX_MOLDAVIAN_CHARS])
finally:
    # Release the database handle even when a query or row access fails.
    conn.close()

In [3]:
# Flatten each country's grouped texts into a single list, preserving the
# iteration order of the groups and the order of texts within each group.
all_texts = {
    "romana": [text for texts in romanian_texts.values() for text in texts],
    "moldova": [text for texts in moldavian_texts.values() for text in texts],
}
    

In [7]:
import torch

# Report whether a CUDA device is visible and, if so, the name of device 0.
cuda_available = torch.cuda.is_available()
print(cuda_available)
# get_device_name(0) raises when no CUDA device is present, which would stop a
# "Run All" on a CPU-only machine, so it is only queried when CUDA is available.
print(torch.cuda.get_device_name(0) if cuda_available else "no CUDA device")

True
NVIDIA GeForce RTX 3060


---

# Model building
---

Following the geoadaptation study (see the reference implementation linked in the code below), the model uses RoBERT as its base encoder to produce contextual representations for a sequence of tokens, some of which are masked. These representations are then fed into a classification head that predicts the geographic origin of the text.

In [9]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, BertTokenizer, BertModel

# https://github.com/valentinhofmann/geoadaptation/blob/main/src/model_geoadaptation.py
class Classifier(nn.Module):
    """Feed-forward head: dropout -> dense -> GELU -> dropout -> output projection.

    Args:
        head: Accepted as the first positional argument; not used by this class.
        output_dim: Number of features produced by the final projection.
        config: Object exposing ``hidden_size`` and ``hidden_dropout_prob``.
    """

    def __init__(self, head, output_dim, config):
        super().__init__()
        hidden = config.hidden_size
        # ``dense`` is created before ``out_proj``; under a fixed seed this order
        # decides which random draws initialise each layer.
        self.dense = nn.Linear(hidden, hidden)
        self.out_proj = nn.Linear(hidden, output_dim)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.gelu = nn.GELU()

    def forward(self, x):
        """Map ``x`` (last dimension ``hidden_size``) to ``output_dim`` features.

        No pooling is performed here: the layers act on the last dimension of
        whatever tensor is passed in, so any [CLS] selection is up to the caller.
        """
        hidden = self.dense(self.dropout(x))
        return self.out_proj(self.dropout(self.gelu(hidden)))

        

class GeoadaptedRobert(nn.Module):
    """RoBERT with a masked-language-modelling objective and a geolocation head.

    The wrapped model is loaded with its masked-LM head so that token scores are
    available; the geolocation head is a ``Classifier`` with a single output that
    is applied to the first ([CLS]) position of the last hidden layer.

    Args:
        model_name: Hugging Face model identifier or path to load.
    """

    def __init__(self, model_name="readerbench/RoBERT-large"):
        super().__init__()
        # Imported here so this class does not depend on edits to the import cell.
        from transformers import AutoModelForMaskedLM

        # The masked-LM variant supplies the token-prediction head that the MLM
        # loss needs; the bare encoder returned by AutoModel has none.
        self.robert = AutoModelForMaskedLM.from_pretrained(model_name)

        # Geolocation head: one raw score per text, regressed towards the label
        # (Moldova = 0, Romania = 1). Classifier takes (head, output_dim, config).
        self.geo_head = Classifier("geo", 1, self.robert.config)

        # Loss modules are built once here instead of on every forward pass.
        self.loss_fct = nn.CrossEntropyLoss()
        # NOTE(review): L1 on an unbounded score is kept from the original code;
        # the output is not a probability unless a sigmoid is applied downstream.
        self.geo_loss_fct = nn.L1Loss()

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None, geo_labels=None):
        """Run the model and compute whichever losses have targets.

        Args:
            input_ids: Token ids, indexed as (batch, sequence).
            attention_mask: Optional attention mask forwarded to the model.
            token_type_ids: Optional segment ids forwarded to the model.
            labels: Optional masked-LM targets with the same shape as
                ``input_ids``; positions set to -100 are ignored by the loss.
            geo_labels: Optional geolocation targets, one value per text.

        Returns:
            Tuple ``(mlm_loss, geo_loss, preds)``. Each loss is a scalar tensor,
            or ``None`` when its targets were not given. ``preds`` is a Python
            list holding one single-element list (the raw geo score) per text.
        """
        # Forward pass through RoBERT, keeping the per-layer hidden states
        outputs = self.robert(input_ids=input_ids,
                              attention_mask=attention_mask,
                              token_type_ids=token_type_ids,
                              output_hidden_states=True)

        # Hidden states of the last layer, i.e. the input to the masked-LM head
        sequence_output = outputs.hidden_states[-1]

        # MLM loss over the vocabulary scores of every token position
        mlm_loss = None
        if labels is not None:
            scores = outputs.logits
            mlm_loss = self.loss_fct(scores.view(-1, scores.size(-1)), labels.view(-1))

        # Geolocation: pool to the [CLS] position so there is one score per text
        geo_logits = self.geo_head(sequence_output[:, 0, :])
        geo_loss = None
        if geo_labels is not None:
            # Cast targets to the logits' dtype so integer labels are accepted.
            geo_targets = geo_labels.view(-1).to(geo_logits.dtype)
            geo_loss = self.geo_loss_fct(geo_logits.view(-1), geo_targets)
        preds = geo_logits.detach().cpu().tolist()

        return mlm_loss, geo_loss, preds
        

---

# Training

---

In [None]:
tokenizer = AutoTokenizer.from_pretrained("readerbench/RoBERT-large")

colator 