<a href="https://colab.research.google.com/github/Feven0/AAU-CS-PROJECT-2018-2019-car-game/blob/master/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import codecs
import random
import torch
import os
from transformers import pipeline, AutoTokenizer, AutoModelForMaskedLM, RobertaTokenizer, RobertaForTokenClassification
from sklearn.preprocessing import OrdinalEncoder
from torch.utils.data import DataLoader, Dataset
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("uhhlt/am-roberta")
model = AutoModelForMaskedLM.from_pretrained("uhhlt/am-roberta")

# Read the tagged data
tagged_data_path = r'/content/tagged_modified_segments.txt'
with codecs.open(tagged_data_path, 'r', encoding='utf-8') as file:
    tagged_data = file.readlines()

# Split data into train, validation, and test sets
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

num_examples = len(tagged_data)
train_size = int(num_examples * train_ratio)
val_size = int(num_examples * val_ratio)

train_data = tagged_data[:train_size]
val_data = tagged_data[train_size:train_size + val_size]
test_data = tagged_data[train_size + val_size:]

train_file = r'/content/train.txt'
val_file = r'/content/validation.txt'
test_file = r'/content/test.txt'

with codecs.open(train_file, 'w', encoding='utf-8') as file:
    file.writelines(train_data)

with codecs.open(val_file, 'w', encoding='utf-8') as file:
    file.writelines(val_data)

with codecs.open(test_file, 'w', encoding='utf-8') as file:
    file.writelines(test_data)

# Preprocess the data
def preprocess_data(file_path, exclude_labels=None):
    texts = []
    labels = []
    unseen_labels = set()
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
        for line in lines:
            if line != '\n':
                tokens = line.strip().split()
                text = tokens[0]
                if len(tokens) > 1:
                    label = tokens[1]
                else:
                    label = 'unknown'  # Use a default label if no label is provided
                if exclude_labels is None or label not in exclude_labels:
                    texts.append(text)
                    labels.append(label)
                else:
                    unseen_labels.add(label)
    return texts, labels, unseen_labels

train_texts, train_labels, train_unseen_labels = preprocess_data(train_file)
val_texts, val_labels, _ = preprocess_data(val_file, exclude_labels=train_unseen_labels)
test_texts, test_labels, _ = preprocess_data(test_file, exclude_labels=train_unseen_labels)

# Encode the labels
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

train_labels_reshaped = [[label] for label in train_labels]
train_labels_encoded = oe.fit_transform(train_labels_reshaped)

val_labels_reshaped = [[label] for label in val_labels]
test_labels_reshaped = [[label] for label in test_labels]
val_labels_encoded = oe.transform(val_labels_reshaped)
test_labels_encoded = oe.transform(test_labels_reshaped)

test_labels_encoded[test_labels_encoded == oe.categories_[0].size] = -1  # Replace unknown categories with -1

train_labels = torch.tensor(train_labels_encoded, dtype=torch.long)
val_labels = torch.tensor(val_labels_encoded, dtype=torch.long)
test_labels = torch.tensor(test_labels_encoded, dtype=torch.long)

# Tokenize the texts
train_encoded_inputs = tokenizer(train_texts, padding=True, truncation=True, max_length=128, return_tensors='pt')
val_encoded_inputs = tokenizer(val_texts, padding=True, truncation=True, max_length=128, return_tensors='pt')
test_encoded_inputs = tokenizer(test_texts, padding=True, truncation=True, max_length=128, return_tensors='pt')

train_input_ids = train_encoded_inputs['input_ids']
train_attention_mask = train_encoded_inputs['attention_mask']
val_input_ids = val_encoded_inputs['input_ids']
val_attention_mask = val_encoded_inputs['attention_mask']
test_input_ids = test_encoded_inputs['input_ids']
test_attention_mask = test_encoded_inputs['attention_mask']

# TensorFlow setup
os.environ['XLA_PYTHON_CLIENT_NUMA_AWARE'] = '1'
os.environ['XLA_PYTHON_CLIENT_PREALLOCATE'] = '0'

# Prepare datasets for DataLoader
class MyDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels
        self.label_to_idx = {label: idx for idx, label in enumerate(set(list(labels) + ['<PAD>']))}
        self.num_labels = len(self.label_to_idx)
        self.unknown_label_idx = self.label_to_idx['<PAD>']  # Index for unknown labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data[index]
        label = self.labels[index] if index < len(self.labels) else 'unknown'  # Use 'unknown' if no label is available

        encoded_input = tokenizer.encode_plus(text, padding='max_length', max_length=50, return_tensors='pt')

        # Convert the label to a tensor
        if isinstance(label, (list, tuple)):
            # If the label is a sequence, convert it to a tensor
            encoded_label = torch.tensor([self.label_to_idx.get(l, self.unknown_label_idx) for l in label])
        else:
            # If the label is a single value, convert it to a tensor with the same sequence length
            encoded_label = torch.tensor([self.label_to_idx.get(label, self.unknown_label_idx)] * 50)

        return encoded_input, encoded_label

train_dataset = MyDataset(train_texts, train_labels)
valid_dataset = MyDataset(val_texts, val_labels)
test_dataset = MyDataset(test_texts, test_labels)

# Load model for token classification
model = RobertaForTokenClassification.from_pretrained("uhhlt/am-roberta", num_labels=train_dataset.num_labels)

for param in model.roberta.parameters():
    param.requires_grad = False  # Freeze the BERT layers

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Prepare DataLoader
def collate_fn(batch):
    if not batch:
        return None

    data_batch, label_batch = zip(*batch)

    padded_input_ids = torch.nn.utils.rnn.pad_sequence([enc['input_ids'] for enc in data_batch],
                                                       batch_first=True,
                                                       padding_value=tokenizer.pad_token_id)
    padded_attention_mask = torch.nn.utils.rnn.pad_sequence([enc['attention_mask'] for enc in data_batch],
                                                            batch_first=True)

    # Padding the labels
    if label_batch and isinstance(label_batch[0], torch.Tensor):
        # If the labels are already tensors, just pad them
        padded_label_batch = torch.nn.utils.rnn.pad_sequence(label_batch, batch_first=True, padding_value=train_dataset.unknown_label_idx)
    elif label_batch:
        # If the labels are lists, convert them to tensors and pad them
        max_label_len = max([len(labels) for labels in label_batch])
        padded_label_batch = torch.tensor([labels + [train_dataset.unknown_label_idx] * (max_label_len - len(labels)) for labels in label_batch])
    else:
        padded_label_batch = []

    return padded_input_ids, padded_attention_mask, padded_label_batch

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, num_workers=4, pin_memory=True, drop_last=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_fn, num_workers=4, pin_memory=True, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn, num_workers=4, pin_memory=True, drop_last=True)

# Training loop
num_epochs = 50
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

best_valid_loss = float('inf')
for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0
    num_batches = 0
    for batch_input_ids, batch_attention_mask, batch_label_batch in train_loader:
        batch_input_ids = batch_input_ids.squeeze(1)
        batch_attention_mask = batch_attention_mask.squeeze(1)
        batch_label_batch = batch_label_batch

        batch_input_ids = batch_input_ids.to(device)
        batch_attention_mask = batch_attention_mask.to(device)
        batch_label_batch = batch_label_batch.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask, labels=batch_label_batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()
        num_batches += 1

        # Printing the progress
        if (num_batches + 1) % 50 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{num_batches}], Loss: {loss.item():.4f}")

    average_train_loss = total_train_loss / num_batches
    print(f"Epoch {epoch + 1} - Average training loss: {average_train_loss:.4f}")

    model.eval()
    total_valid_loss = 0
    num_valid_batches = 0
    with torch.no_grad():
        for val_input_ids, val_attention_mask, val_label_batch in valid_loader:
            val_input_ids = val_input_ids.squeeze(1)
            val_attention_mask = val_attention_mask.squeeze(1)
            val_label_batch = val_label_batch

            val_input_ids = val_input_ids.to(device)
            val_attention_mask = val_attention_mask.to(device)
            val_label_batch = val_label_batch.to(device)

            outputs = model(input_ids=val_input_ids, attention_mask=val_attention_mask, labels=val_label_batch)
            loss = outputs.loss
            total_valid_loss += loss.item()
            num_valid_batches += 1

    average_valid_loss = total_valid_loss / num_valid_batches
    print(f"Epoch {epoch + 1} - Average validation loss: {average_valid_loss:.4f}")

    # Save the best model based on validation loss
    if average_valid_loss < best_valid_loss:
        best_valid_loss = average_valid_loss
        torch.save(model.state_dict(), 'best_model.pth')

# Load the best model
model.load_state_dict(torch.load('best_model.pth'))

# Inference function
def predict(test_loader, model, device):
    model.eval()
    predictions = []
    with torch.no_grad():
        for test_input_ids, test_attention_mask, test_label_batch in test_loader:
            test_input_ids = test_input_ids.squeeze(1)
            test_attention_mask = test_attention_mask.squeeze(1)
            test_label_batch = test_label_batch

            test_input_ids = test_input_ids.to(device)
            test_attention_mask = test_attention_mask.to(device)
            test_label_batch = test_label_batch.to(device)

            outputs = model(input_ids=test_input_ids, attention_mask=test_attention_mask, labels=test_label_batch)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=2).cpu().numpy()

            # Decode the predictions back to labels
            decoded_preds = [oe.inverse_transform(preds[i].reshape(-1, 1)) for i in range(len(preds))]
            predictions.extend(decoded_preds)

    return predictions

# Perform predictions
test_predictions = predict(test_loader, model, device)

# Post-process predictions
predicted_labels = []
for prediction in test_predictions:
    predicted_labels.extend(prediction.flatten().tolist())

print(predicted_labels[:10])  # Print first 10 predicted labels for inspection


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at uhhlt/am-roberta and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.pid = os.fork()


Epoch [1/50], Batch [49], Loss: 10.2450
Epoch [1/50], Batch [99], Loss: 10.2266
Epoch [1/50], Batch [149], Loss: 10.2114
Epoch [1/50], Batch [199], Loss: 10.1949
Epoch [1/50], Batch [249], Loss: 10.1778
Epoch [1/50], Batch [299], Loss: 10.1619
Epoch [1/50], Batch [349], Loss: 10.1452
Epoch [1/50], Batch [399], Loss: 10.1269
Epoch [1/50], Batch [449], Loss: 10.1126
Epoch [1/50], Batch [499], Loss: 10.0977
Epoch [1/50], Batch [549], Loss: 10.0804
Epoch [1/50], Batch [599], Loss: 10.0523
Epoch [1/50], Batch [649], Loss: 10.0475
Epoch [1/50], Batch [699], Loss: 10.0255


In [5]:
from transformers import RobertaTokenizer, RobertaModel