In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the directory passed to os.walk below (currently /kaggle/working, the output directory, rather than ../input/)

import os
# Lazily build the full path of every file found beneath /kaggle/working,
# then print the paths one per line.
working_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/working')
    for filename in filenames
)
for path in working_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import torch
import json
import os
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

In [None]:
import torch
from torch.utils.data import Dataset

In [None]:
class TextDataset(Dataset):
    """Map-style dataset that pairs tokenizer output with one label per sample.

    Every text is tokenized once, up front, in a single tokenizer call. Each
    item is a dict holding that sample's slice of every encoding plus its
    label under the ``'labels'`` key.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        """Tokenize ``texts`` eagerly and keep ``labels`` as a tensor.

        Args:
            texts: Iterable of raw inputs; each element is coerced with ``str``.
            labels: Anything ``torch.tensor`` accepts; expected to line up
                index-for-index with ``texts``.
            tokenizer: Callable invoked with the list of strings plus the
                ``truncation``, ``padding``, ``max_length`` and
                ``return_tensors`` keywords; its result must offer ``.items()``.
            max_length: Truncation limit forwarded to the tokenizer.
        """
        as_strings = list(map(str, texts))

        tokenizer_options = dict(
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors='pt',
        )
        self.encodings = tokenizer(as_strings, **tokenizer_options)

        self.labels = torch.tensor(labels)

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``idx``-th slice of every encoding plus its label."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample
        

In [None]:
# Tokenizers already loaded by get_tokens, keyed by pretrained model name.
_TOKENIZER_CACHE = {}


def get_tokens(text, model_name='bert-base-uncased'):
    """Split ``text`` into the tokens produced by a pretrained BERT tokenizer.

    The tokenizer is loaded with ``BertTokenizer.from_pretrained`` the first
    time a given ``model_name`` is requested and reused afterwards, instead of
    being reloaded on every call.

    Args:
        text: The string to tokenize.
        model_name: Pretrained tokenizer identifier; defaults to the
            ``'bert-base-uncased'`` vocabulary this function always used.

    Returns:
        Whatever ``tokenizer.tokenize(text)`` returns for the cached tokenizer.
    """
    tokenizer = _TOKENIZER_CACHE.get(model_name)
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained(model_name)
        _TOKENIZER_CACHE[model_name] = tokenizer
    return tokenizer.tokenize(text)

In [None]:
# Read the semicolon-delimited SG2 label file; it becomes the training frame.
train_data = pd.read_csv(
    '/kaggle/input/media-bias/final_labels_SG2.csv',
    sep=';',
)

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line.
train_data.info()

In [None]:
# Keep only the model input ('text') and the target ('type'), and drop any row
# where either of them is missing.
columns = ['text', 'type']
train_data = train_data[columns].dropna()
# info() prints the summary itself and returns None; print() would add "None".
train_data.info()

In [None]:
# Alias only (no copy): `data` and `train_data` refer to the same DataFrame.
data = train_data

In [None]:
    
# Pull the inputs and targets out of the frame as plain Python lists.
X, y = data['text'].tolist(), data['type'].tolist()
# No split is made here: the training lists are the full lists.
X_train, y_train = X, y


In [None]:
# Read the semicolon-delimited SG1 label file; it becomes the test frame.
test_dataset = pd.read_csv(
    '/kaggle/input/media-bias/final_labels_SG1.csv',
    sep=';',
)

In [None]:
# Same preparation as the training frame: keep 'text' and 'type', drop rows
# where either of them is missing.
columns = ['text', 'type']
test_dataset = test_dataset[columns].dropna()
# info() prints the summary itself and returns None; print() would add "None".
test_dataset.info()

In [None]:

# Test inputs and targets as plain Python lists, in column order (text, type).
X_test, y_test = (test_dataset[column].tolist() for column in ('text', 'type'))

In [None]:
# Fit the encoder on the training labels and encode them in the same step.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
# The test labels are encoded with the classes learned from the training labels
# (the encoder is not refitted here).
y_test_encoded = label_encoder.transform(y_test)

In [None]:
# Store the label mapping (class index -> original label) for later use during
# inference. .tolist() converts the numpy scalars in classes_ to native Python
# values so the mapping can be written out with json.dump later.
label_mapping = dict(enumerate(label_encoder.classes_.tolist()))

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
num_labels = len(label_encoder.classes_)

# Single place to tune the batch size used by both loaders.
BATCH_SIZE = 50

train_dataset = TextDataset(X_train, y_train_encoded, tokenizer)
# Use a distinct name for the tokenized test split: `test_dataset` already holds
# the cleaned DataFrame, and overwriting it would break re-running the cell
# that derives X_test / y_test from it.
test_text_dataset = TextDataset(X_test, y_test_encoded, tokenizer)

# Shuffle only the training batches; keep the test order stable for evaluation.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_text_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [None]:
# Ask whether to resume from saved weights. A non-interactive run may have no
# stdin available; in that case fall back to training a new model instead of
# crashing on the prompt.
try:
    load_from_disk = input("Load model from disk? (y/n): ").strip().lower() == 'y'
except (EOFError, NotImplementedError):
    # NotImplementedError covers IPython's StdinNotImplementedError.
    load_from_disk = False

if load_from_disk:
    model_path = os.getenv('MODEL_PATH')
    model_conf = os.getenv('CONFIG_PATH')
    if not model_path or not model_conf:
        raise RuntimeError(
            "Set the MODEL_PATH and CONFIG_PATH environment variables to load a saved model"
        )
    # Read the saved metadata first so the classification head below is built
    # with the same number of labels as the stored weights.
    with open(model_conf, 'r') as f:
        config = json.load(f)
    num_labels = config['num_labels']
    # JSON object keys are always strings; restore the integer class indices.
    label_mapping = {int(idx): label for idx, label in config['label_mapping'].items()}

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels
)

if load_from_disk:
    print(f"Loading model from path {model_path}")
    # map_location='cpu' lets weights saved from a GPU session load on a
    # CPU-only machine; the model is moved to the target device afterwards.
    model.load_state_dict(torch.load(model_path, map_location='cpu'))
else:
    print("Training a new model")


In [None]:
# Use the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)

# AdamW over every model parameter, with a small weight decay.
optimizer = AdamW(params=model.parameters(), lr=2e-4, weight_decay=0.005)

# Put the model in training mode.
model.train()


In [None]:
from torch.amp import GradScaler, autocast
scaler = GradScaler('cuda')

In [None]:
from tqdm import tqdm

In [None]:
# Early-stopping settings: how many non-improving epochs to tolerate, and the
# smallest drop in loss that counts as an improvement.
patience, min_delta = 10, 0.001
# Running state: best loss seen so far and consecutive epochs without improvement.
best_loss, counter = float('inf'), 0
early_stop = False

In [None]:
import os
# Create the checkpoint directory if needed. exist_ok=True makes this a no-op
# when it already exists and avoids the check-then-create race.
os.makedirs('checkpoints', exist_ok=True)

In [None]:

# Train for up to 30 epochs. After each epoch, measure the loss on test_loader,
# checkpoint the model when that loss improves by more than min_delta, and stop
# once `patience` consecutive epochs bring no improvement.
# NOTE(review): early stopping and checkpoint selection use test_loader, the
# same split the final F1/accuracy are computed on, so those scores are
# optimistic; a separate validation split would avoid this.
for epoch in range(30):
    model.train()
    print(f"Epoch {epoch + 1}")

    running_loss = 0.0
    num_batches = 0

    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # One-hot width follows the model head (num_labels) instead of a
        # hard-coded 3.
        # NOTE(review): with float one-hot targets, Hugging Face's
        # BertForSequenceClassification appears to select its multi-label
        # (BCE-with-logits) loss rather than cross-entropy - confirm this is
        # intended, since evaluation takes an argmax over the classes.
        labels = torch.nn.functional.one_hot(
            batch['labels'].long(), num_classes=num_labels
        ).float().to(device)

        outputs = model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        # .item() yields a plain Python float; summing the loss tensor itself
        # kept every batch's autograd graph referenced and made the average
        # below print as a tensor repr.
        running_loss += loss.item()
        num_batches += 1

    print(f"Average loss: {running_loss/num_batches} across {num_batches} batches")

    model.eval()
    test_loss = 0.0
    test_batches = 0

    with torch.no_grad():
        for test_batch in test_loader:
            input_ids = test_batch['input_ids'].to(device)
            attention_mask = test_batch['attention_mask'].to(device)
            labels = torch.nn.functional.one_hot(
                test_batch['labels'].long(), num_classes=num_labels
            ).float().to(device)

            outputs = model(
                input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            test_loss += outputs.loss.item()
            test_batches += 1

    avg_test_loss = test_loss / test_batches
    print(f"Test Loss: {avg_test_loss:.4f}")

    # Early stopping logic
    if avg_test_loss < best_loss - min_delta:
        # There is an improvement
        best_loss = avg_test_loss
        counter = 0

        # Save the best model
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': best_loss,
        }, 'checkpoints/best_model.pt')
        print(f"Model improved! Saved checkpoint at epoch {epoch + 1}")
    else:
        # No improvement
        counter += 1
        print(f"No improvement for {counter} epochs")

        if counter >= patience:
            early_stop = True
            print(f"No improvement after {patience} epochs. Stopping training.")
            break

In [None]:
# Restore the weights from the best checkpoint written by the training loop,
# if such a file exists.
# NOTE(review): a best_model.pt left over from an earlier run would also be
# picked up here - confirm that is acceptable.
if os.path.exists('checkpoints/best_model.pt'):
    # map_location keeps the load working when the checkpoint was written on a
    # different device type than the one used in this session.
    checkpoint = torch.load('checkpoints/best_model.pt', map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    print(f"Loaded best model from epoch {checkpoint['epoch'] + 1} with loss {checkpoint['loss']:.4f}")

In [None]:
# Switch the model to evaluation mode and start with empty accumulators for
# the predicted and true class indices.
model.eval()
all_preds, all_labels = [], []
    

In [None]:
# Start from eval mode and empty accumulators inside this cell, so re-running
# it on its own does not append a second copy of the predictions.
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # The labels are only collected for scoring, so they are not moved to
        # the model's device.
        labels = batch['labels']

        outputs = model(
            input_ids,
            attention_mask=attention_mask
        )

        # Predicted class = index of the largest logit for each sample.
        preds = torch.argmax(outputs.logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())


In [None]:
# Weighted F1 over the collected predictions.
f1 = f1_score(all_labels, all_preds, average='weighted')
print(f"F1 score: {f1}")

# Accuracy: fraction of positions where the prediction equals the true label.
num_correct = 0
for truth, pred in zip(all_labels, all_preds):
    if truth == pred:
        num_correct += 1
accuracy = num_correct / len(all_labels)
print(f"Accuracy: {accuracy}")

# Metadata stored next to the weights: label count, index -> label mapping,
# and the F1 score as a plain float.
config = dict(
    num_labels=num_labels,
    label_mapping=label_mapping,
    f1_score=float(f1),
)


In [None]:
# Persist the weights and their metadata under /kaggle/working. Both filenames
# embed the F1 score formatted to four decimals.
if model is None:
    print("No model to save")
else:
    model_path = f'/kaggle/working/model_{float(config["f1_score"]):.4f}.pt'
    config_path = f'/kaggle/working/model_config_{float(config["f1_score"]):.4f}.json'

    # Weights only (state dict), not the whole module object.
    torch.save(model.state_dict(), model_path)

    # Label mapping and metrics as JSON alongside the weights.
    with open(config_path, 'w') as config_file:
        json.dump(config, config_file)

    print(f"Model saved to {model_path}")
    print(f"Model config saved to {config_path}")