In [37]:
import pandas as pd
df = pd.read_csv('data.csv')
df

Unnamed: 0,source,domain,field,trusted
0,Nexus News,nexusnews.org,General News,True
1,Global View,globalview.com,General News,True
2,Unity Herald,unityherald.net,General News,True
3,World Scope,worldscope.info,General News,True
4,Clear Dispatch,clear-dispatch.com,General News,True
...,...,...,...,...
986,Health Scope,healthscope.com,Health and Medicine,True
987,Care Dispatch,caredispatch.org,Health and Medicine,True
988,Policy Scope,policyscope.com,Politics and International Affairs,True
989,Eco Dispatch,ecodispatch.org,Environment and Climate,True


In [None]:
df_clean = df[['domain', 'trusted']].dropna()
df_clean['label'] = df_clean['trusted'].map({True: 1, False: 0})
df_clean = df_clean[['domain', 'label']]


In [3]:
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_clean['domain'].tolist(),
    df_clean['label'].tolist(),
    test_size=0.2,
    random_state=42
)


In [4]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)


In [None]:
import torch

class DomainDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} | {'labels': torch.tensor(self.labels[idx])}
    
    def __len__(self):
        return len(self.labels)

train_dataset = DomainDataset(train_encodings, train_labels)
val_dataset = DomainDataset(val_encodings, val_labels)

In [None]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

training_args = TrainingArguments(
    output_dir='results',
    evaluation_strategy='epoch',
    logging_dir='logs',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,

)

trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.256845
2,No log,0.218964
3,No log,0.351613
4,No log,0.347394
5,No log,0.343692
6,No log,0.349359
7,No log,0.354658
8,No log,0.358057
9,No log,0.360193
10,0.046800,0.360989


TrainOutput(global_step=500, training_loss=0.046785717010498044, metrics={'train_runtime': 42.6484, 'train_samples_per_second': 185.704, 'train_steps_per_second': 11.724, 'total_flos': 48839989651200.0, 'train_loss': 0.046785717010498044, 'epoch': 10.0})

In [None]:

# Put model on GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

inputs = tokenizer("britannica.com", return_tensors="pt")
inputs = {key: val.to(device) for key, val in inputs.items()}

# Run inference
with torch.no_grad():
    outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=1).item()
    is_trusted = bool(prediction)

print("Trusted domain?" , is_trusted)


Trusted domain? True
