In [None]:
import pandas as pd

## Data

In [None]:
def read_bioes_file(file_path):
    """Read a two-column BIOES file into parallel sentence and label lists.

    Every non-blank line must contain exactly two whitespace-separated
    fields, ``<word> <tag>``. One or more blank lines separate sentences.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).

    Returns:
        A tuple ``(sentences, labels)`` where ``sentences[i]`` is the list of
        words of the i-th sentence and ``labels[i]`` the matching list of tags.

    Raises:
        ValueError: If a non-blank line does not split into exactly two fields.
    """
    sentences, labels = [], []
    sentence, label = [], []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            fields = line.split()
            if fields:
                word, tag = fields
                sentence.append(word)
                label.append(tag)
            elif sentence:
                # Blank line closes the current sentence. Runs of blank lines
                # are ignored so they do not create empty sentences.
                sentences.append(sentence)
                labels.append(label)
                sentence, label = [], []

    # The file may not end with a blank line: keep the last sentence too.
    if sentence:
        sentences.append(sentence)
        labels.append(label)
    return sentences, labels

# Load each split from its own file: the training variables read the train
# file and the development variables read the dev file (the two paths were
# previously swapped).
train_sentences, train_labels = read_bioes_file('/kaggle/merged_train_bioes.txt')
dev_sentences, dev_labels = read_bioes_file('/kaggle/merged_dev_bioes.txt')

In [None]:
# Report how many sentences / label sequences each split holds.
for _caption, _rows in (
    ("Train sentences: ", train_sentences),
    ("Train labels: ", train_labels),
    ("Dev sentences: ", dev_sentences),
    ("Dev labels: ", dev_labels),
):
    print(_caption, len(_rows))
print()

# Show the first entry of every list as a quick sanity check.
for _caption, _rows in (
    ("Train sentences: \n", train_sentences),
    ("Train labels: \n", train_labels),
    ("Dev sentences: \n", dev_sentences),
    ("Dev labels: \n", dev_labels),
):
    print(_caption, _rows[0])

## Set up the label mapping

In [None]:
# Gather every distinct tag that occurs in either split
all_labels = set()

for labels in train_labels + dev_labels:
    all_labels |= set(labels)

# Alphabetical order fixes the integer id of each tag
label_list = sorted(all_labels)

label2id = dict(zip(label_list, range(len(label_list))))
id2label = dict(enumerate(label_list))

print(label2id)
print(id2label)

In [None]:
# Map every tag of every sentence to its integer id (both splits)
train_labels_ids = [list(map(label2id.__getitem__, tags)) for tags in train_labels]
dev_labels_ids = [list(map(label2id.__getitem__, tags)) for tags in dev_labels]

# Show one sentence before and after the conversion
print("Original first sentence labels:", train_labels[0])
print("Converted first sentence label IDs:", train_labels_ids[0])

# The conversion must keep one id sequence per sentence
print("Valid length: ", len(train_labels) == len(train_labels_ids))

## Filter the sentences and align the label IDs with the tokenizer's sub-word tokens

In [None]:
import re
import unicodedata

def preprocess_sentence_and_labels(sentence, labels):
    """Keep only purely alphabetic words, together with their labels.

    Words are paired with labels positionally; a pair is kept when the word
    consists solely of ASCII letters. Words with digits, punctuation or any
    other character are discarded along with their label.

    Args:
        sentence: List of word strings.
        labels: List of labels, parallel to ``sentence``.

    Returns:
        A tuple ``(processed_sentence, processed_labels)`` of two lists of
        equal length holding the surviving words and labels in original order.
    """
    kept_pairs = [
        (word, label)
        for word, label in zip(sentence, labels)
        if re.match(r'^[a-zA-Z]+$', word)
    ]
    processed_sentence = [word for word, _ in kept_pairs]
    processed_labels = [label for _, label in kept_pairs]
    return processed_sentence, processed_labels

# Output containers for the filtered training split
processed_train_sentences, processed_train_labels_ids = [], []

# Output containers for the filtered development split
processed_dev_sentences, processed_dev_labels_ids = [], []

# Run the same filter over both splits, training first and development second
for _src_sentences, _src_label_ids, _dst_sentences, _dst_label_ids in (
    (train_sentences, train_labels_ids, processed_train_sentences, processed_train_labels_ids),
    (dev_sentences, dev_labels_ids, processed_dev_sentences, processed_dev_labels_ids),
):
    for sentence, labels in zip(_src_sentences, _src_label_ids):
        proc_sentence, proc_labels = preprocess_sentence_and_labels(sentence, labels)
        _dst_sentences.append(proc_sentence)
        _dst_label_ids.append(proc_labels)

In [None]:
# Compare one training sentence before and after filtering
print("Original sentence:", train_sentences[2])
print("Original labels:", train_labels_ids[2])
print("\nProcessed sentence:", processed_train_sentences[2])
print(len(processed_train_sentences[2]))
print("Processed labels:", processed_train_labels_ids[2])
print(len(processed_train_labels_ids[2]))

# Word totals over the whole training split, before vs. after filtering
original_word_count = sum(map(len, train_sentences))
processed_word_count = sum(map(len, processed_train_sentences))
print(f"\nOriginal word count: {original_word_count}")
print(f"Processed word count: {processed_word_count}")
print(f"Removed {original_word_count - processed_word_count} words")

In [None]:
from typing import List, Tuple
from transformers import AutoTokenizer

# Tokenizer of the "crisistransformers/CT-M3-Complete" checkpoint; it is the
# tokenizer handed to tokenize_and_adjust_labels() for both splits below.
CT_M3_Complete_tokenizer = AutoTokenizer.from_pretrained("crisistransformers/CT-M3-Complete")

def tokenize_and_adjust_labels(sentence: List[str], labels: List[int], tokenizer) -> Tuple[List[int], List[int]]:
    """Tokenize a pre-split sentence and expand word labels to token labels.

    Every sub-word token receives the label of the word it belongs to;
    special tokens receive ``-100``.

    Two alignment strategies are used:

    * Fast tokenizers (``tokenizer.is_fast`` is true): the word index of every
      token is read from ``word_ids()`` on the encoding, so the alignment does
      not depend on how the tokenizer marks sub-words.
    * Other tokenizers: a token ending in ``'@@'`` is treated as a non-final
      piece of the current word; any other non-special token closes the word.

    Args:
        sentence: The words of one sentence.
        labels: One integer label per word, parallel to ``sentence``.
        tokenizer: Callable tokenizer accepting ``is_split_into_words=True``
            and returning a mapping with an ``"input_ids"`` entry.

    Returns:
        ``(input_ids, token_labels)``, two lists of equal length.

    Raises:
        IndexError: If the tokens reference more words than ``labels`` holds.
    """
    tokenized_input = tokenizer(sentence, is_split_into_words=True)
    input_ids = tokenized_input["input_ids"]

    if getattr(tokenizer, "is_fast", False):
        # Exact alignment: None marks special tokens, an int is the word index.
        updated_labels = [
            -100 if word_idx is None else labels[word_idx]
            for word_idx in tokenized_input.word_ids()
        ]
        return input_ids, updated_labels

    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    updated_labels = []
    current_label_idx = 0

    for token in tokens:
        if token in ('<s>', '</s>', '<unk>'):
            # NOTE(review): '<unk>' is masked without consuming a word label,
            # exactly as before. If a whole word maps to '<unk>', the labels
            # after it shift by one; confirm this cannot happen for the
            # inputs used here.
            updated_labels.append(-100)
            continue

        if current_label_idx >= len(labels):
            raise IndexError(
                f"token {token!r} belongs to word #{current_label_idx} but only "
                f"{len(labels)} labels were given; tokens and labels are misaligned"
            )

        updated_labels.append(labels[current_label_idx])
        if not token.endswith('@@'):
            # The last piece of a word moves on to the next word's label.
            current_label_idx += 1

    return input_ids, updated_labels

# Tokenize both splits; each pass fills one (inputs, labels) pair of lists
tokenized_train_inputs, adjusted_train_labels = [], []
tokenized_dev_inputs, adjusted_dev_labels = [], []

for _sentences, _label_ids, _inputs_out, _labels_out in (
    (processed_train_sentences, processed_train_labels_ids, tokenized_train_inputs, adjusted_train_labels),
    (processed_dev_sentences, processed_dev_labels_ids, tokenized_dev_inputs, adjusted_dev_labels),
):
    for sentence, labels in zip(_sentences, _label_ids):
        input_ids, adjusted_labels = tokenize_and_adjust_labels(sentence, labels, CT_M3_Complete_tokenizer)
        _inputs_out.append(input_ids)
        _labels_out.append(adjusted_labels)

In [None]:
# Print an example to verify
print("Original sentence:", processed_train_sentences[2])
print("Original labels:", processed_train_labels_ids[2])
print("\nTokenized input:", tokenized_train_inputs[2])
print("Adjusted labels:", adjusted_train_labels[2])

# Verify lengths: every token id must have exactly one label
print("\nLength of tokenized input:", len(tokenized_train_inputs[2]))
print("Length of adjusted labels:", len(adjusted_train_labels[2]))

# Print some statistics.
# The sentence count comes from the training sentences themselves; the
# previous `len(input_ids)` measured the token count of whichever sentence
# the tokenization loop processed last.
original_sentence_count = len(processed_train_sentences)
tokenized_sentence_count = len(tokenized_train_inputs)
print(f"\nNumber of original sentences: {original_sentence_count}")
print(f"Number of tokenized sentences: {tokenized_sentence_count}")

average_original_length = sum(len(s) for s in processed_train_sentences) / original_sentence_count
average_tokenized_length = sum(len(s) for s in tokenized_train_inputs) / tokenized_sentence_count
print(f"\nAverage original sentence length: {average_original_length:.2f}")
print(f"Average tokenized sentence length: {average_tokenized_length:.2f}")

## Build the Hugging Face datasets

In [None]:
from datasets import Dataset

# Wrap the token ids and their aligned labels into one Dataset per split
tokenized_train = Dataset.from_dict(
    dict(input_ids=tokenized_train_inputs, labels=adjusted_train_labels)
)
tokenized_dev = Dataset.from_dict(
    dict(input_ids=tokenized_dev_inputs, labels=adjusted_dev_labels)
)

# Set up label mapping: collect every distinct tag used in either split
all_labels = set()

for labels in train_labels + dev_labels:
    all_labels.update(labels)

# Sort first, then derive both mappings from that same list, so this cell no
# longer depends on a `label_list` left over from an earlier cell.
label_list = sorted(all_labels)

label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for l, i in label2id.items()}

# Kept for backward compatibility: this cell has always exposed the sorted
# tags under the name `labels` as well.
labels = list(label_list)

# Model

## Model configuration

In [None]:
from transformers import AutoConfig, AutoModel
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification, AutoModelForMaskedLM

# Checkpoint used for both the model weights and the tokenizer
model_name = "crisistransformers/CT-M3-Complete"

# Update model configuration: size the classification head to the tag set and
# attach the tag <-> id mappings built above.
# NOTE(review): keep `num_labels` before the two mappings; presumably assigning
# it regenerates default id2label/label2id entries. Confirm against the
# installed transformers version before reordering.
config = AutoConfig.from_pretrained(model_name)
config.num_labels = len(label_list)
config.id2label = id2label
config.label2id = label2id

# Token-classification model created from the checkpoint with the config above
CT_M3_Complete_model = AutoModelForTokenClassification.from_pretrained(model_name, config=config)
# Reloads the tokenizer from the same checkpoint, rebinding the name that an
# earlier cell already assigned.
CT_M3_Complete_tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
from sklearn.metrics import precision_recall_fscore_support

def compute_metrics(p):
    """Compute token-level weighted precision, recall and F1.

    Positions whose gold label is ``-100`` are ignored. The remaining gold and
    predicted ids are mapped to tag strings through the module-level
    ``label_list`` and scored over all sequences together.

    Args:
        p: A pair ``(predictions, labels)``. ``predictions`` holds per-class
            scores and is reduced with ``argmax`` over axis 2; ``labels``
            holds the gold label ids, one row per sequence.

    Returns:
        A dict with the keys ``"precision"``, ``"recall"`` and ``"f1"``
        (weighted averages from ``precision_recall_fscore_support``).
    """
    scores, gold_rows = p
    predicted_rows = np.argmax(scores, axis=2)

    # Build the two flat tag lists in one linear pass. The previous
    # `sum(list_of_lists, [])` flattening copied the accumulated list for
    # every sequence, which is quadratic in the number of tokens.
    flat_labels = []
    flat_predictions = []
    for predicted_row, gold_row in zip(predicted_rows, gold_rows):
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            if gold_id != -100:
                flat_labels.append(label_list[gold_id])
                flat_predictions.append(label_list[predicted_id])

    results = precision_recall_fscore_support(flat_labels, flat_predictions, average='weighted')
    return {
        "precision": results[0],
        "recall": results[1],
        "f1": results[2],
    }

## Training

In [None]:
import numpy as np
import pandas as pd

import torch
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)

# Check GPU availability: pick "cuda" when torch reports a usable GPU, else "cpu".
# In the cells shown here `device` is only printed.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Set up model and tokenizer
# NOTE(review): this repeats the configuration from the "Model configuration"
# cell; the `config` built here is not passed to any later call visible in
# this notebook. Confirm whether it is still needed.
model_name = "crisistransformers/CT-M3-Complete"
config = AutoConfig.from_pretrained(model_name)
config.num_labels = len(label_list)
config.id2label = id2label
config.label2id = label2id

In [None]:
# Batch collator for token classification, built on the checkpoint's tokenizer
data_collator = DataCollatorForTokenClassification(tokenizer=CT_M3_Complete_tokenizer)

# Hyper-parameters and bookkeeping options for the run
training_args = TrainingArguments(
    # where checkpoints are written
    output_dir="content/drive/MyDrive/CrisisTransformers",
    save_strategy="epoch",
    save_total_limit=2,  # keep only the last 2 checkpoints
    # optimisation
    optim="adamw_torch",  # PyTorch's AdamW implementation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=10,
    # batching: reduced batch size, with gradients accumulated over 2 steps
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    # evaluation and logging
    evaluation_strategy="epoch",
    logging_steps=100,  # reduce logging frequency
    report_to='none',  # disable logging to wandb
)

# Wire model, data, collator and metric function into the Trainer
trainer = Trainer(
    model=CT_M3_Complete_model,
    tokenizer=CT_M3_Complete_tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
    compute_metrics=compute_metrics,
)

In [None]:
# Start training: runs the Trainer configured in the previous cell
# (model, training_args, train/dev datasets, collator, compute_metrics).
trainer.train()

## Evaluate the model

In [None]:
# Evaluate the model with the Trainer and print the metrics it returns
# (presumably computed on the eval_dataset given to the Trainer, i.e. tokenized_dev).
eval_results = trainer.evaluate()
print(eval_results)

In [None]:
import json
import pandas as pd
import torch
import re
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Folder that receives everything needed to reload the fine-tuned model
output_dir = "/kaggle/working/results"

# Model first, then the tokenizer files, into the same folder
trainer.save_model(output_dir)
CT_M3_Complete_tokenizer.save_pretrained(output_dir)

# Training arguments, serialised as JSON
with open(f"{output_dir}/training_args.json", 'w') as f:
    f.write(json.dumps(training_args.to_dict()))

# Both label mappings, serialised as JSON (integer keys become strings in the file)
with open(f"{output_dir}/label_mappings.json", 'w') as f:
    f.write(json.dumps({"label2id": label2id, "id2label": id2label}))

print(f"Model and associated files saved to {output_dir}")

In [None]:
# Load the model and tokenizer back from the folder written by the save cell
model = AutoModelForTokenClassification.from_pretrained("/kaggle/working/results")
tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/results")

# Load label mappings
with open("/kaggle/working/results/label_mappings.json", 'r') as f:
    label_mappings = json.load(f)

# Rebinds the global `id2label`. After the JSON round trip its keys are
# strings ("0", "1", ...), which is why predict() looks labels up with str(...).
id2label = label_mappings["id2label"]
print(id2label)

# Submission

In [None]:
def merge_subwords_and_locations(tokens_and_labels):
    """Merge '@@' sub-word pieces into words and LOC-tagged words into spans.

    Pieces ending in ``'@@'`` are glued to the pieces that follow them until a
    piece without the marker closes the word. A word whose pieces disagree on
    the label takes the first label found in the priority order
    ``B-LOC, I-LOC, E-LOC, S-LOC, O``; if none of those is present, the label
    of its first piece is used. Consecutive location words are then joined
    with spaces into a single entry labelled ``'B-LOC'``:

    * ``B-LOC`` / ``S-LOC`` close any open span and start a new one,
    * ``I-LOC`` extends the open span, ``E-LOC`` extends and closes it,
    * any label not ending in ``-LOC`` closes the open span and is emitted
      unchanged with its word.

    Args:
        tokens_and_labels: Iterable of ``(token, label)`` pairs.

    Returns:
        List of ``(text, label)`` pairs; location spans carry the label
        ``'B-LOC'``.
    """
    priority_order = ['B-LOC', 'I-LOC', 'E-LOC', 'S-LOC', 'O']

    merged_words = []
    merged_labels = []
    location_buffer = []

    def flush_location():
        # Emit the location span collected so far, if any.
        if location_buffer:
            merged_words.append(' '.join(location_buffer))
            merged_labels.append('B-LOC')
            location_buffer.clear()

    def emit_word(pieces, piece_labels):
        # Turn the pieces of one word into a word + label and route it.
        merged_word = ''.join(pieces)

        if len(set(piece_labels)) == 1:
            merged_label = piece_labels[0]
        else:
            # Fall back to the first piece's label when none of the known
            # labels is present (a bare next() would raise StopIteration).
            merged_label = next(
                (candidate for candidate in priority_order if candidate in piece_labels),
                piece_labels[0],
            )

        if merged_label.endswith('-LOC'):
            if merged_label in ('B-LOC', 'S-LOC'):
                flush_location()
                location_buffer.append(merged_word)
            elif merged_label in ('I-LOC', 'E-LOC'):
                location_buffer.append(merged_word)
                if merged_label == 'E-LOC':
                    flush_location()
            # Any other '*-LOC' label is ignored, as before.
        else:
            flush_location()
            merged_words.append(merged_word)
            merged_labels.append(merged_label)

    current_word = []
    current_labels = []

    for token, label in tokens_and_labels:
        is_continuation = token.endswith('@@')
        current_word.append(token[:-2] if is_continuation else token)
        current_labels.append(label)

        if not is_continuation:
            emit_word(current_word, current_labels)
            current_word = []
            current_labels = []

    # A trailing piece that still ends in '@@' (e.g. truncated input) used to
    # be dropped silently; emit it as a word instead.
    if current_word:
        emit_word(current_word, current_labels)

    # Handle any remaining location in the buffer
    flush_location()

    return list(zip(merged_words, merged_labels))

# # Usage
# merged_result = merge_subwords_and_locations(predicted_tokens)

# # Extract locations
# locations = [word for word, label in merged_result if label == 'B-LOC']
# print("\nExtracted locations:", locations)

In [None]:
def predict(text):
    """Tag one text with the fine-tuned model and extract location mentions.

    Uses the module-level ``tokenizer`` and ``model`` and delegates sub-word
    and span merging to ``merge_subwords_and_locations``. Label names are read
    from ``model.config.id2label``, so the function does not depend on the
    key type of the global ``id2label`` dict.

    Args:
        text: The text to tag.

    Returns:
        A 4-tuple ``(unique_locations, tokens, predictions, predicted_tokens)``:

        * ``unique_locations``: de-duplicated, alphabetically sorted location
          strings,
        * ``tokens``: cleaned token strings with special tokens removed,
        * ``predictions``: the argmax class-id tensor over ``logits`` (dim 2),
        * ``predicted_tokens``: the ``(token, label)`` pairs given to the merger.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(**inputs).logits

    predictions = torch.argmax(logits, dim=2)
    predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]]
    token_strings = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    tokens = []
    predicted_tokens = []

    # The earlier hand-rolled span tracking (hard-coded class ids 0/1/2/4) was
    # removed: its result was always overwritten by the merger output below.
    for token, token_label in zip(token_strings, predicted_token_class):
        # Remove special tokens and clean up the text
        if token in ['<s>', '</s>', '<unk>']:
            continue

        cleaned_token = token[1:] if token.startswith('Ġ') else token

        if token.startswith('##'):
            # WordPiece continuation: glue it to the previous token without
            # the '##' marker (the marker used to end up in the merged text).
            if predicted_tokens:
                previous_text, previous_label = predicted_tokens[-1]
                predicted_tokens[-1] = (previous_text + cleaned_token[2:], previous_label)
            continue

        tokens.append(cleaned_token)
        predicted_tokens.append((cleaned_token, token_label))

    merged_result = merge_subwords_and_locations(predicted_tokens)

    # Extract locations
    locations = [word for word, merged_label in merged_result if merged_label == 'B-LOC']

    # Extract unique locations and sort alphabetically
    unique_locations = sorted(set(locations))

    return unique_locations, tokens, predictions, predicted_tokens

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Competition test set; preprocess_text() below is applied to its 'text' column.
test = pd.read_csv("/kaggle/input/zindi-learn-location-mention-recognition-challenge/Test.csv")

# NLTK resources: 'punkt' and 'stopwords' back word_tokenize / stopwords used
# in preprocess_text(); 'wordnet' is only needed by the commented-out
# lemmatization step.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# NOTE(review): `nltk` is already imported at the top of this cell, and this
# search path is appended only after the downloads above. Presumably it points
# at a pre-installed corpus folder; confirm it is still needed.
import nltk
nltk.data.path.append('/usr/share/nltk_data/')

def preprocess_text(text):
    """Clean one raw text before it is passed to the tagger.

    Steps, in order: drop URLs, drop ``@mentions``, drop every character that
    is not an ASCII letter, digit, whitespace or one of ``. / - _``, tokenize
    with ``word_tokenize``, drop English stopwords (case-sensitive match, so
    capitalised tokens are kept) and join the remaining tokens with single
    spaces.

    Args:
        text: The raw text. Non-string values (e.g. NaN from a pandas column)
            are treated as empty.

    Returns:
        The cleaned text as a single string; ``''`` for empty or non-string
        input.
    """
    if not isinstance(text, str):
        return ''

    # Remove URLs. The former '<URL>' placeholder lost its angle brackets in
    # the character filter below and left a stray "URL" word in every text.
    text = re.sub(r'http\S+|www\S+|https\S+', ' ', text, flags=re.MULTILINE)

    # Remove user mentions
    text = re.sub(r'@\w+', '', text)

    # Remove special characters (letters, digits, whitespace and . / - _ are kept)
    text = re.sub(r'[^a-zA-Z0-9\s\./\-_]', '', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords. The set is built once and cached on the function
    # instead of being rebuilt for every row.
    stop_words = getattr(preprocess_text, "_stop_words", None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        preprocess_text._stop_words = stop_words
    tokens = [token for token in tokens if token not in stop_words]

    # Join tokens back into a string
    processed_text = ' '.join(tokens)

    return processed_text

# Clean every entry of the 'text' column and store the result in a new
# 'processed_text' column of the test frame.
test['processed_text'] = test['text'].apply(preprocess_text)

In [None]:
# Preview the first rows of the submission table.
# NOTE(review): `submission_df` is not assigned in any cell shown here; confirm
# that the cell building it still exists, otherwise this raises NameError on a
# fresh kernel.
submission_df.head()

In [None]:
# Show the first five lines of the submission file on disk.
# NOTE(review): no cell shown here writes /kaggle/submission.csv; TODO confirm
# where the file is produced.
!head -n 5 /kaggle/submission.csv