# Imports

In [None]:
import numpy as np
from datasets import load_dataset
from huggingface_hub import PyTorchModelHubMixin
from sentence_transformers import SentenceTransformer

import json
import matplotlib.pyplot as plt


import torch
import sklearn.metrics as metrics
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForPreTraining, BertModel, AutoTokenizer, AutoModel, ModernBertForSequenceClassification, BertForSequenceClassification, RobertaForSequenceClassification
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.linear_model import SGDClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis



from torch.optim import AdamW
import copy


from tqdm import tqdm, trange

# Run on a CUDA GPU when PyTorch reports one as available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Parameters

In [None]:
# AVAILABLE MODELS
# sk-rf: Sbert (sentence-t5-large embeddings) + sklearn SVC
# mlp: Sbert (sentence-t5-large embeddings) + MLP
# ct: Covid-twitter-BERT
# modern-base: Modern-BERT-base
# modern-large: Modern-BERT-large
# gte-base: gte-base-en-v1.5
# gte-large: gte-large-en-v1.5

MODEL = "modern-base"
FINAL_SUBMISSION = False # Train on the train+test splits if True, on the train split only if False

# Data

In [None]:
# Load the challenge dataset by its Hugging Face Hub identifier.
dataset = load_dataset("quotaclimat/frugalaichallenge-text-train")

In [None]:
# Human-readable names of the 8 target classes; the list index is the class id
# (the leading digit of each name).
LABELS = [
    '0_not_relevant',
    '1_not_happening',  # the comma matters: without it Python fuses this entry with the next one
    '2_not_human',
    '3_not_bad',
    '4_solutions_harmful_unnecessary',
    '5_science_unreliable',
    '6_proponents_biased',
    '7_fossil_fuels_needed',
]

In [None]:
# Pull the two splits out of the loaded dataset.
data_train, data_test = dataset['train'], dataset['test']

In [None]:
# Raw quotes of each split.
train_texts = [row['quote'] for row in data_train]
test_texts = [row['quote'] for row in data_test]

# The class id is the first character of the label string.
labels_train = [int(row['label'][0]) for row in data_train]
labels_test = [int(row['label'][0]) for row in data_test]

# For the final submission the test split is folded into the training data.
if FINAL_SUBMISSION:
    train_texts = [*train_texts, *test_texts]
    labels_train = [*labels_train, *labels_test]

# Number of training examples per class id 0..7.
weights_tmp = [labels_train.count(class_id) for class_id in range(8)]

# Inverse-frequency class weights; the +1 avoids a division by zero for a class
# that has no training example.
weights = torch.FloatTensor(
    [len(labels_train) / (count + 1) for count in weights_tmp]
).to(device)
weights

In [None]:
# Build `train_dataloader` / `test_dataloader` for the selected MODEL.
#
# Every dataset stores the model inputs first and the labels last:
#   sk-rf, mlp              -> (embeddings, labels)
#   modern-base/large       -> (input_ids, attention_mask, labels)
#   ct, gte-base, gte-large -> (input_ids, attention_mask, token_type_ids, labels)
# The training loop relies on this order when it unpacks a batch.

# Hugging Face checkpoint that provides the tokenizer of each transformer model.
# NOTE(review): 'ct' tokenizes with covid-twitter-bert while the CTBERT model
# loads covid-twitter-bert-v2 -- confirm both checkpoints share one vocabulary.
TOKENIZER_CHECKPOINTS = {
    'ct': 'digitalepidemiologylab/covid-twitter-bert',
    'gte-base': 'Alibaba-NLP/gte-base-en-v1.5',
    'gte-large': 'Alibaba-NLP/gte-large-en-v1.5',
    'modern-base': 'answerdotai/ModernBERT-base',
    'modern-large': 'answerdotai/ModernBERT-large',
}
# Models whose forward pass also receives token_type_ids.
MODELS_WITH_TOKEN_TYPES = ('ct', 'gte-base', 'gte-large')

train_labels = torch.tensor(labels_train)
test_labels = torch.tensor(labels_test)

if MODEL in ('sk-rf', 'mlp'):
    # Both models consume fixed sentence embeddings; only the batch size differs.
    emb_model = SentenceTransformer("sentence-transformers/sentence-t5-large")
    batch_size = 512 if MODEL == 'sk-rf' else 2

    # The raw texts are replaced by their embeddings, so this cell cannot be
    # re-run without re-running the cell that builds `train_texts`/`test_texts`.
    train_texts = torch.Tensor(emb_model.encode(train_texts))
    test_texts = torch.Tensor(emb_model.encode(test_texts))

    train_tensors = (train_texts,)
    test_tensors = (test_texts,)

elif MODEL in TOKENIZER_CHECKPOINTS:
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINTS[MODEL])
    MAX_LEN = 256
    batch_size = 16

    # Every quote is padded/truncated to exactly MAX_LEN tokens.
    tokenized_input = tokenizer(train_texts, max_length=MAX_LEN, padding='max_length', truncation=True)
    tokenized_test = tokenizer(test_texts, max_length=MAX_LEN, padding='max_length', truncation=True)

    # Convert to torch tensor
    train_input_ids = torch.tensor(tokenized_input['input_ids'])
    train_attention_mask = torch.tensor(tokenized_input['attention_mask'])
    test_input_ids = torch.tensor(tokenized_test['input_ids'])
    test_attention_mask = torch.tensor(tokenized_test['attention_mask'])

    train_tensors = (train_input_ids, train_attention_mask)
    test_tensors = (test_input_ids, test_attention_mask)

    if MODEL in MODELS_WITH_TOKEN_TYPES:
        train_token_type_ids = torch.tensor(tokenized_input['token_type_ids'])
        test_token_type_ids = torch.tensor(tokenized_test['token_type_ids'])
        train_tensors += (train_token_type_ids,)
        test_tensors += (test_token_type_ids,)

else:
    # Fail here rather than with a NameError in a later cell.
    raise ValueError(f"Unknown MODEL: {MODEL!r}")

train_data = TensorDataset(*train_tensors, train_labels)
test_data = TensorDataset(*test_tensors, test_labels)

# Training batches are shuffled; evaluation keeps the dataset order.
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)


# Models

In [None]:
class CTBERT(
    nn.Module,
    PyTorchModelHubMixin,
    # optionally, you can add metadata which gets pushed to the model card
):
    """Covid-Twitter-BERT v2 classifier.

    Loads the pre-training checkpoint and replaces its `seq_relationship`
    head with a fresh linear layer that outputs `num_classes` logits.
    """

    def __init__(self, num_classes):
        """Load the pretrained weights and install the classification head.

        Args:
            num_classes: number of output classes.
        """
        super().__init__()
        pretrained = BertForPreTraining.from_pretrained('digitalepidemiologylab/covid-twitter-bert-v2')
        # 1024 is hard-coded as the input width of the replacement head.
        pretrained.cls.seq_relationship = nn.Linear(1024, num_classes)
        self.bert = pretrained

    def forward(self, input_ids, input_mask, token_type_ids):
        """Return the classification logits for a batch.

        Args:
            input_ids: token ids.
            input_mask: attention mask, forwarded as `attention_mask`.
            token_type_ids: segment ids.
        """
        model_output = self.bert(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=input_mask,
        )
        # Second output element; presumably the output of the replaced
        # `seq_relationship` head when no labels are passed -- TODO confirm.
        return model_output[1]

In [None]:
class conspiracyModelLarge(
    nn.Module,
    PyTorchModelHubMixin,
    # optionally, you can add metadata which gets pushed to the model card
):
    """ModernBERT-large sequence classifier with `num_classes` output labels."""

    def __init__(self, num_classes):
        """Load 'answerdotai/ModernBERT-large' with a `num_classes`-way head.

        Args:
            num_classes: number of output classes.
        """
        super().__init__()
        self.n_classes = num_classes
        self.bert = ModernBertForSequenceClassification.from_pretrained(
            'answerdotai/ModernBERT-large',
            num_labels=num_classes,
        )

    def forward(self, input_ids, input_mask):
        """Return the logits for a batch.

        Args:
            input_ids: token ids.
            input_mask: attention mask, forwarded as `attention_mask`.
        """
        return self.bert(input_ids=input_ids, attention_mask=input_mask).logits

class conspiracyModelBase(
    nn.Module,
    PyTorchModelHubMixin,
    # optionally, you can add metadata which gets pushed to the model card
):
    """ModernBERT-base sequence classifier with `num_classes` output labels."""

    def __init__(self, num_classes):
        """Load 'answerdotai/ModernBERT-base' with a `num_classes`-way head.

        Args:
            num_classes: number of output classes.
        """
        super().__init__()
        self.n_classes = num_classes
        self.bert = ModernBertForSequenceClassification.from_pretrained(
            'answerdotai/ModernBERT-base',
            num_labels=num_classes,
        )

    def forward(self, input_ids, input_mask):
        """Return the logits for a batch.

        Args:
            input_ids: token ids.
            input_mask: attention mask, forwarded as `attention_mask`.
        """
        return self.bert(input_ids=input_ids, attention_mask=input_mask).logits

In [None]:
class gteModelLarge(
    nn.Module,
    PyTorchModelHubMixin,
    # optionally, you can add metadata which gets pushed to the model card
):
    """gte-large-en-v1.5 encoder followed by a linear classification layer."""

    def __init__(self, num_classes):
        """Load the encoder and create the 1024 -> `num_classes` linear head.

        Args:
            num_classes: number of output classes.
        """
        super().__init__()
        self.n_classes = num_classes
        # The checkpoint is loaded with trust_remote_code=True.
        self.gte = AutoModel.from_pretrained(
            'Alibaba-NLP/gte-large-en-v1.5',
            trust_remote_code=True,
        )
        self.cls = nn.Linear(1024, num_classes)

    def forward(self, input_ids, input_mask, input_type_ids):
        """Return the logits computed from the first token's hidden state.

        Args:
            input_ids: token ids.
            input_mask: attention mask, forwarded as `attention_mask`.
            input_type_ids: segment ids, forwarded as `token_type_ids`.
        """
        encoded = self.gte(
            input_ids=input_ids,
            attention_mask=input_mask,
            token_type_ids=input_type_ids,
        )
        # Position 0 of the last hidden state is used as the sentence representation.
        first_token_state = encoded.last_hidden_state[:, 0]
        return self.cls(first_token_state)

class gteModelBase(
    nn.Module,
    PyTorchModelHubMixin,
    # optionally, you can add metadata which gets pushed to the model card
):
    """gte-base-en-v1.5 encoder followed by a linear classification layer."""

    def __init__(self, num_classes):
        """Load the encoder and create the 768 -> `num_classes` linear head.

        Args:
            num_classes: number of output classes.
        """
        super().__init__()
        self.n_classes = num_classes
        # The checkpoint is loaded with trust_remote_code=True.
        self.gte = AutoModel.from_pretrained(
            'Alibaba-NLP/gte-base-en-v1.5',
            trust_remote_code=True,
        )
        self.cls = nn.Linear(768, num_classes)

    def forward(self, input_ids, input_mask, input_type_ids):
        """Return the logits computed from the first token's hidden state.

        Args:
            input_ids: token ids.
            input_mask: attention mask, forwarded as `attention_mask`.
            input_type_ids: segment ids, forwarded as `token_type_ids`.
        """
        encoded = self.gte(
            input_ids=input_ids,
            attention_mask=input_mask,
            token_type_ids=input_type_ids,
        )
        # Position 0 of the last hidden state is used as the sentence representation.
        first_token_state = encoded.last_hidden_state[:, 0]
        return self.cls(first_token_state)

In [None]:
class ConspiracyClassification(
    nn.Module,
    PyTorchModelHubMixin,
    # optionally, you can add metadata which gets pushed to the model card
):
    """Feed-forward classifier over 768-dimensional input vectors.

    Layer widths: 768 -> 100 -> 100 -> 100 -> 50 -> num_classes, with ReLU and
    dropout (p=0.1) after each of the four hidden layers.
    """

    def __init__(self, num_classes):
        """Create the five linear layers, the dropout and the activation.

        Args:
            num_classes: number of output classes.
        """
        super().__init__()
        self.h1 = nn.Linear(768, 100)
        self.h2 = nn.Linear(100, 100)
        self.h3 = nn.Linear(100, 100)
        self.h4 = nn.Linear(100, 50)
        self.h5 = nn.Linear(50, num_classes)
        self.dropout = nn.Dropout(0.1)
        self.activation = nn.ReLU()

    def forward(self, input_texts):
        """Return the logits for a batch of input vectors.

        Args:
            input_texts: tensor whose last dimension is 768.
        """
        hidden = input_texts
        # Hidden layers: linear -> ReLU -> dropout.
        for layer in (self.h1, self.h2, self.h3, self.h4):
            hidden = self.dropout(self.activation(layer(hidden)))
        # The output layer produces raw logits (no activation, no dropout).
        return self.h5(hidden)

In [None]:
# Instantiate the model selected by MODEL.
config = {"num_classes": 8}

# Constructor of every torch model, keyed by its MODEL name.
TORCH_MODELS = {
    "ct": CTBERT,
    "gte-base": gteModelBase,
    "gte-large": gteModelLarge,
    "modern-base": conspiracyModelBase,
    "modern-large": conspiracyModelLarge,
    "mlp": ConspiracyClassification,
}

if MODEL in ['sk-rf']:
    # scikit-learn estimator: it has no device and is never moved to one.
    model = SVC()
elif MODEL in TORCH_MODELS:
    model = TORCH_MODELS[MODEL](**config)
    model.to(device)
else:
    # Fail with a clear message instead of a NameError on `model`.
    raise ValueError(f"Unknown MODEL: {MODEL!r}")

In [None]:
# Optimizer and LR scheduler exist only for the torch models.
if MODEL not in ['sk-rf']:
    # The MLP uses a larger learning rate than the transformer models.
    lr = 5e-4 if MODEL == "mlp" else 2e-5
    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=0.01)

    # Multiply the learning rate by 0.3 once the monitored value has stopped
    # improving for more than 4 scheduler steps.
    scheduler = ReduceLROnPlateau(optimizer, factor=0.3, patience=4)

In [None]:
# Cross-entropy loss with per-class weights; `weights` is the tensor of
# inverse-frequency class weights built from the training label counts.
criterion = nn.CrossEntropyLoss(weight = weights)

# Train

In [None]:
# Train for `epochs` epochs, evaluate on `test_dataloader` after each one and
# keep the weights of the epoch with the highest evaluation accuracy.
#
# NOTE(review): the checkpoint is selected on the same split that is reported
# as evaluation, and with FINAL_SUBMISSION=True that split is also part of the
# training data, so the printed metrics are optimistic.
epochs = 15

# Metrics of the best epoch so far.
best_MCCA = 0
best_F1 = 0
best_loss = 999
best_ACC = 0
results = []  # one [loss, accuracy, macro-F1, MCC] row per epoch

is_sklearn = MODEL in ['sk-rf']

if not is_sklearn:
    # Deep copy: state_dict() returns references to the live parameters, which
    # would otherwise silently follow every optimizer step.
    best_state_dict = copy.deepcopy(model.state_dict())
else:
    best_state_dict = None
best_epoch = 0


def _forward_batch(batch):
    """Move `batch` to `device` and run the torch model on it.

    Every dataset stores the model inputs first and the labels last, in the
    positional order expected by the model's forward(), so the inputs are
    passed through as-is.

    Returns:
        (logits, labels), both on `device`.
    """
    *inputs, b_labels = (t.to(device) for t in batch)
    return model(*inputs), b_labels


def _collect_features(dataloader):
    """Concatenate all (features, labels) batches of `dataloader` into NumPy arrays."""
    feature_batches, label_batches = [], []
    for x, y in dataloader:
        feature_batches.append(x)
        label_batches.append(y)
    return torch.cat(feature_batches).numpy(), torch.cat(label_batches).numpy()


for e in trange(0, epochs, position=0, leave=True):

    print('Starting epoch ', e)

    # ---- training pass ----
    if is_sklearn:
        # NOTE(review): the estimator is refit from scratch on the same data in
        # every epoch; only the sample order changes.
        x_features, y_true = _collect_features(train_dataloader)
        model.fit(x_features, y_true)
    else:
        model.train()
        tr_loss = 0
        nb_tr_steps = 0
        for batch in train_dataloader:
            optimizer.zero_grad()
            logits, b_labels = _forward_batch(batch)
            loss = criterion(logits, b_labels.long())
            loss.backward()
            optimizer.step()

            tr_loss += loss.item()
            nb_tr_steps += 1
        # max(..., 1) guards against an empty dataloader.
        print("Train loss: {}".format(tr_loss / max(nb_tr_steps, 1)))

    # ---- evaluation pass ----
    if is_sklearn:
        x_features, y_true = _collect_features(test_dataloader)
        LOSS = 999  # no loss is computed for the sklearn model
        predictions_sep = model.predict(x_features).tolist()
        labels_sep = y_true.tolist()
    else:
        model.eval()
        predictions_sep = []
        labels_sep = []
        eval_loss = 0
        steps = 0
        with torch.no_grad():
            for batch in test_dataloader:
                logits, b_labels = _forward_batch(batch)
                eval_loss += criterion(logits, b_labels.long()).item()
                steps += 1
                predictions_sep.extend(logits.argmax(dim=1).cpu().tolist())
                labels_sep.extend(b_labels.cpu().tolist())

        LOSS = eval_loss / max(steps, 1)
        # The scheduler monitors the mean evaluation loss.
        scheduler.step(LOSS)

    ACC = metrics.accuracy_score(labels_sep, predictions_sep)
    F1 = metrics.f1_score(labels_sep, predictions_sep, average='macro')
    MCCA = metrics.matthews_corrcoef(labels_sep, predictions_sep)

    # Model selection is driven by accuracy only.
    if ACC > best_ACC:
        best_MCCA = MCCA
        best_ACC = ACC
        best_F1 = F1
        best_loss = LOSS
        if not is_sklearn:
            best_state_dict = copy.deepcopy(model.state_dict())
        else:
            best_state_dict = None
        best_epoch = e
    results.append([LOSS, ACC, F1, MCCA])
    print("\t Eval loss: {}".format(LOSS))
    print("\t Eval ACC: {}".format(ACC))
    print("\t Eval F1: {}".format(F1))
    print("\t Eval MCCA: {}".format(MCCA))
    print("---"*25)
    print("\n")

In [None]:
# Restore the weights of the best epoch; the sklearn model keeps no state dict.
if not MODEL == "sk-rf":
    model.load_state_dict(best_state_dict)

In [None]:
import os

from huggingface_hub import login

# Authenticate against the Hugging Face Hub.
# The token is read from the HF_TOKEN environment variable so that no real
# credential has to be written into the notebook; the placeholder is only the
# fallback when the variable is not set.
HF_token = os.environ.get("HF_TOKEN", "<YOUR_TOKEN>")
login(HF_token)

In [None]:
# Persist the trained model under ./trained_model.
if MODEL not in ['sk-rf']:
    model.save_pretrained("./trained_model")
else:
    # The sklearn estimator has no save_pretrained(); pickle it instead.
    import os
    import pickle

    os.makedirs("./trained_model", exist_ok=True)
    with open(os.path.join("./trained_model", "model.pkl"), "wb") as model_file:
        pickle.dump(model, model_file)