# IMPORTS AND CHECKS

### IMPORTS

In [1]:
import os
import re
import torch
import json
import nltk
import unicodedata
import pandas as pd
import numpy as np

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from datasets import Dataset
from typing import List, Tuple
from transformers import AutoTokenizer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split

from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)

  from .autonotebook import tqdm as notebook_tqdm


### CHECKS

In [2]:
# Select the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Smoke test: create a small random tensor and move it to the selected device.
x = torch.rand(3, 3).to(device)
print(f'Tensor is on: {x.device}')

Tensor is on: cuda:0


In [3]:
# Print the CUDA version string exposed by torch.version.cuda.
print(torch.version.cuda)

12.1


In [4]:
# Print the working directory the kernel started in (this runs BEFORE the chdir below,
# so the printed value is the old directory).
print(os.getcwd())
# Switch to the project root so the relative 'data/kaggle/...' paths used later resolve.
# NOTE(review): machine-specific absolute path; it has to be edited on any other machine.
os.chdir('C:/Users/abelm/OneDrive/Documents/GitHub/Microsoft-Learn-Location-Mention-Recognition-Challenge')

C:\Users\abelm


# DATA PREPARATION AND CLEANING 

### Explore Train Data

In [5]:
# Load the raw training CSV into a DataFrame and preview the first rows.
trainData = pd.read_csv('data/kaggle/Train_1-new.csv')
trainData.head()

Unnamed: 0,tweet_id,text,location
0,ID_1001136212718088192,,EllicottCity
1,ID_1001136696589631488,"Flash floods struck a Maryland city on Sunday,...",Maryland
2,ID_1001136950345109504,State of emergency declared for Maryland flood...,Maryland
3,ID_1001137334056833024,Other parts of Maryland also saw significant d...,Baltimore Maryland
4,ID_1001138374923579392,"Catastrophic Flooding Slams Ellicott City, Mar...",Ellicott City Maryland


In [6]:
# Shape of the raw data before any filtering
print(f'The shape of the data is: {trainData.shape}')

# Number of missing values per column (the Series is interpolated into the message)
print(f'The number of missing values in the data are: {trainData.isnull().sum()}')

# Drop every row that has a missing value in any column.
# NOTE: this rebinds trainData, so the raw frame is no longer available afterwards.
trainData = trainData.dropna()

# Missing values per column after the drop (expected to be zero everywhere)
print(f'The number of missing values in the data are: {trainData.isnull().sum()}')

# Shape of the data after dropping the incomplete rows
print(f'The shape of the data is: {trainData.shape}')

The shape of the data is: (73072, 3)
The number of missing values in the data are: tweet_id        0
text        56624
location    29612
dtype: int64
The number of missing values in the data are: tweet_id    0
text        0
location    0
dtype: int64
The shape of the data is: (11849, 3)


### CLEAN DATA

In [7]:
# Build the English stop-word set once; clean_text() reads this module-level name.
# NOTE(review): this needs the NLTK 'stopwords' corpus to be installed already; the
# nltk.download('stopwords') call only appears in a much later cell.
stop_words = set(stopwords.words('english'))

# text clean helper function
def clean_text(text):
    """
    Strip URLs, mentions, hashtags and every non-letter character from a tweet,
    then drop stop words.

    Args:
    - text: The raw tweet text. Anything that is not a string (for example the
      float NaN pandas uses for a missing value) is treated as empty text.

    Returns:
    - The cleaned text: the surviving words joined by single spaces, or '' when
      nothing is left.
    """
    # A missing value arrives as float NaN; re.sub would raise
    # "TypeError: expected string or bytes-like object" on it.
    if not isinstance(text, str):
        return ''

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove mentions (@user) and hashtags (#hashtag)
    text = re.sub(r'@\w+|#\w+', '', text)

    # Remove special characters, numbers, and punctuation, keeping whitespace
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Remove stop words (looked up in the module-level `stop_words` set).
    # The membership test is case-sensitive: a word is only dropped when it
    # appears in the set with exactly the same casing.
    return ' '.join(word for word in text.split() if word not in stop_words)

In [8]:
# Clean every tweet with clean_text (overwrites the 'text' column in place)
trainData['text'] = trainData['text'].apply(clean_text)

# Convert text and location to string dtype
trainData['text'] = trainData['text'].astype(str)
trainData['location'] = trainData['location'].astype(str)

# Missing values per column after cleaning.
# NOTE: a tweet that cleaning reduced to nothing is the empty string '' here, not NaN,
# so it is not counted by isnull().
print(f'The number of missing values in the data are: {trainData.isnull().sum()}')

# Shape of the data after cleaning (cleaning does not remove rows)
print(f'The shape of the data is: {trainData.shape}')

trainData.head()

The number of missing values in the data are: tweet_id    0
text        0
location    0
dtype: int64
The shape of the data is: (11849, 3)


Unnamed: 0,tweet_id,text,location
1,ID_1001136696589631488,Flash floods struck Maryland city Sunday washi...,Maryland
2,ID_1001136950345109504,State emergency declared Maryland flooding via,Maryland
3,ID_1001137334056833024,Other parts Maryland also saw significant dama...,Baltimore Maryland
4,ID_1001138374923579392,Catastrophic Flooding Slams Ellicott City Mary...,Ellicott City Maryland
5,ID_1001138377717157888,WATCH missing flash devastates Ellicott City M...,Ellicott City Maryland


### CREATE BIOES FILES

In [9]:
# Split the cleaned data 80/20 into train and dev sets (fixed seed for reproducibility).
# NOTE: trainData is rebound to the 80% split here, so re-running this cell shrinks it again.
trainData, devData = train_test_split(trainData, test_size=0.2, random_state=42)

# Save the train split to CSV (without the index column)
trainData.to_csv('data/kaggle/Train-dropna.csv', index=False)

# Save the dev split to CSV (without the index column)
devData.to_csv('data/kaggle/Dev-dropna.csv', index=False)

In [10]:
# tokenize and label helper function
def tokenize_and_label(text, location):
    """
    Tokenize the tweet text and label tokens using BIOES format for location entities.

    Only the FIRST exact, case-sensitive occurrence of the tokenized location
    inside the tokenized text is tagged; every other token keeps the 'O' tag.

    Args:
    - text: A string representing the tweet.
    - location: A string representing the location entity to tag in the text.

    Returns:
    - List of tuples: Each tuple contains a token and its corresponding BIOES tag.
      When the location is empty or does not occur in the text, all tags are 'O'.
    """
    tokens = nltk.word_tokenize(text)  # Tokenize the tweet text
    labels = ['O'] * len(tokens)  # Start with 'O' tags for all tokens

    location_tokens = nltk.word_tokenize(location)  # Tokenize the location string
    span = len(location_tokens)

    # An empty location would trivially "match" at index 0 with a span of zero
    # and the tagging below would then write through index -1, so bail out.
    if span == 0:
        return list(zip(tokens, labels))

    # Index of the first window of `span` tokens equal to the location, if any
    start_idx = next(
        (i for i in range(len(tokens) - span + 1) if tokens[i:i + span] == location_tokens),
        None,
    )

    if start_idx is not None:
        if span == 1:
            labels[start_idx] = 'S-LOC'  # Single-word location
        else:
            end_idx = start_idx + span - 1
            labels[start_idx] = 'B-LOC'  # Beginning of multi-word location
            labels[start_idx + 1:end_idx] = ['I-LOC'] * (span - 2)  # Inside tokens
            labels[end_idx] = 'E-LOC'  # End of multi-word location

    return list(zip(tokens, labels))

In [11]:
# convert to bioes helper function
def convert_to_bioes(csv_file, output_file):
    """
    Convert the text and location data from the CSV to BIOES format and save to a file.

    The output has one "token label" pair per line and a blank line after each tweet.
    Rows whose "text" or "location" is not a string (e.g. NaN read back from an
    empty CSV cell) are reported and skipped.

    Args:
    - csv_file: Path to the input CSV file containing "text" and "location" columns.
    - output_file: Path to the output file where the BIOES formatted data will be saved.
      Missing parent directories are created.
    """
    # Load the CSV file
    df = pd.read_csv(csv_file)

    # One list of (token, label) pairs per usable tweet
    tagged_tweets = []
    for index, row in df.iterrows():
        tweet_text = row['text']
        location = row['location']  # A single string, not a list

        # Skip rows where text or location is NaN or otherwise not a string
        if not isinstance(tweet_text, str) or not isinstance(location, str):
            print(f"Skipping row {index} due to invalid data.")
            continue

        tagged_tweets.append(tokenize_and_label(tweet_text, location))

    # Create the output directory when the path has one; dirname() returns ''
    # for a bare file name and os.makedirs('') would raise.
    directory = os.path.dirname(output_file)
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Save the BIOES formatted data to the output file
    with open(output_file, 'w', encoding='utf-8') as f:
        for token_labels in tagged_tweets:
            for token, label in token_labels:
                # An empty token would produce a blank line and be read back
                # as a sentence break, so it is not written.
                if token:
                    f.write(f"{token} {label}\n")
            f.write("\n")  # Blank line between tweets


In [12]:
# Create BIOES formatted data from the saved train/dev splits.
# The split cell writes 'Train-dropna.csv' and 'Dev-dropna.csv'; the names are spelled
# with the same casing here because file lookups are case-sensitive on many systems.
convert_to_bioes('data/kaggle/Train-dropna.csv', 'data/kaggle/BIOES/train_bioes_file.txt')
convert_to_bioes('data/kaggle/Dev-dropna.csv', 'data/kaggle/BIOES/dev_bioes_file.txt')

TypeError: expected string or bytes-like object, got 'float'

In [11]:
def read_bioes_file(file_path):
    """
    Read a BIOES file (one "token label" pair per line, blank line between sentences).

    Lines that do not split into exactly two whitespace-separated fields are
    reported and skipped.

    Args:
    - file_path: Path to the BIOES file.

    Returns:
    - (sentences, labels): two parallel lists; sentences[i] is the list of tokens
      of sentence i and labels[i] the list of its tags. If the file cannot be
      read, the problem is printed and two empty lists are returned so callers
      can always unpack the result.
    """
    sentences, labels = [], []
    sentence, label = [], []
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                stripped = line.strip()
                if not stripped:
                    # Blank line: close the current sentence, if any
                    if sentence and label:
                        sentences.append(sentence)
                        labels.append(label)
                    sentence, label = [], []
                    continue

                parts = stripped.split()
                if len(parts) != 2:
                    print(f"Skipping malformed line: {stripped}")
                    continue
                word, tag = parts
                sentence.append(word)
                label.append(tag)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return [], []
    except UnicodeDecodeError as e:
        print(f"Unicode decoding error: {e}")
        return [], []
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
        return [], []

    # Keep the final sentence when the file does not end with a blank line
    if sentence and label:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels
        
# Read the BIOES files back as parallel lists of token lists and tag lists.
train_sentences, train_labels = read_bioes_file('data/kaggle/BIOES/train_bioes_file.txt')
dev_sentences, dev_labels = read_bioes_file('data/kaggle/BIOES/dev_bioes_file.txt')

In [None]:
# Sanity check: the sentence and label lists of each split should have equal length.
print("Train sentences: ", len(train_sentences))
print("Train labels: ", len(train_labels))
print("Dev sentences: ", len(dev_sentences))
print("Dev labels: ", len(dev_labels))
print()

# Show the first example of each split (tokens and their tags).
print("Train sentences: \n", train_sentences[0])
print("Train labels: \n", train_labels[0])
print("Dev sentences: \n", dev_sentences[0])
print("Dev labels: \n", dev_labels[0])

## Set up the label mapping

In [None]:
# Set up label mapping: collect every distinct tag seen in the train and dev splits
all_labels = set()

for labels in train_labels + dev_labels:
    all_labels.update(labels)

# Sorted so that the tag -> id assignment is deterministic across runs
label_list = sorted(list(all_labels))

# Forward (tag -> integer id) and reverse (integer id -> tag) lookups
label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for l, i in label2id.items()}

print(label2id)
print(id2label)

In [None]:
# Convert the string tags of every sentence to integer ids via label2id
train_labels_ids = [[label2id[label] for label in sentence_labels] for sentence_labels in train_labels]
dev_labels_ids = [[label2id[label] for label in sentence_labels] for sentence_labels in dev_labels]

# Show the first training sentence before and after conversion
print("Original first sentence labels:", train_labels[0])
print("Converted first sentence label IDs:", train_labels_ids[0])

# The conversion must keep one id list per sentence
print("Valid length: ", len(train_labels) == len(train_labels_ids))

## Preprocess sentence and label from BIOES to index

In [15]:
def preprocess_sentence_and_labels(sentence, labels):
    """
    Keep only the purely alphabetic words of a sentence, together with their labels.

    Args:
    - sentence: List of word strings.
    - labels: List of labels, parallel to `sentence`.

    Returns:
    - (processed_sentence, processed_labels): the words made up solely of the
      letters a-z / A-Z and the labels that belonged to them, in original order.
    """
    # Pair each word with its label and drop pairs whose word contains anything
    # other than letters (digits, punctuation, empty strings, ...).
    kept_pairs = [
        (word, label)
        for word, label in zip(sentence, labels)
        if re.match(r'^[a-zA-Z]+$', word)
    ]

    processed_sentence = [word for word, _ in kept_pairs]
    processed_labels = [label for _, label in kept_pairs]
    return processed_sentence, processed_labels

# Containers for the filtered training data
processed_train_sentences = []
processed_train_labels_ids = []

# Containers for the filtered dev data
processed_dev_sentences = []
processed_dev_labels_ids = []

# Filter every training sentence; one (possibly empty) entry is appended per input
# sentence, so the processed lists stay aligned with the originals.
for sentence, labels in zip(train_sentences, train_labels_ids):
    proc_sentence, proc_labels = preprocess_sentence_and_labels(sentence, labels)
    processed_train_sentences.append(proc_sentence)
    processed_train_labels_ids.append(proc_labels)

# Same filtering for the dev split
for sentence, labels in zip(dev_sentences, dev_labels_ids):
    proc_sentence, proc_labels = preprocess_sentence_and_labels(sentence, labels)
    processed_dev_sentences.append(proc_sentence)
    processed_dev_labels_ids.append(proc_labels)

In [None]:
# Compare one training example (index 2) before and after filtering
print("Original sentence:", train_sentences[2])
print("Original labels:", train_labels_ids[2])
print("\nProcessed sentence:", processed_train_sentences[2])
print(len(processed_train_sentences[2]))
print("Processed labels:", processed_train_labels_ids[2])
print(len(processed_train_labels_ids[2]))

# Corpus-level statistics: how many words the filtering removed from the training split
original_word_count = sum(len(sentence) for sentence in train_sentences)
processed_word_count = sum(len(sentence) for sentence in processed_train_sentences)
print(f"\nOriginal word count: {original_word_count}")
print(f"Processed word count: {processed_word_count}")
print(f"Removed {original_word_count - processed_word_count} words")

In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint used for fine-tuning
CT_M3_Complete_tokenizer = AutoTokenizer.from_pretrained("crisistransformers/CT-M3-Complete")

def tokenize_and_adjust_labels(sentence: List[str], labels: List[int], tokenizer, max_length: int) -> Tuple[List[int], List[int]]:
    """
    Tokenize a pre-split sentence and expand its word-level labels to sub-word tokens.

    Every sub-word piece of a word receives that word's label. A piece ending in
    '@@' is treated as "word continues", so the label pointer only advances on a
    piece that does not end in '@@'. Special tokens, and any token that appears
    after the labels are exhausted, get -100.

    Args:
    - sentence: The words of the sentence.
    - labels: One integer label per word, parallel to `sentence`.
    - tokenizer: Tokenizer called with is_split_into_words=True; it must provide
      convert_ids_to_tokens(). Its `all_special_tokens`, when present, are
      treated as special in addition to '<s>', '</s>' and '<unk>'.
    - max_length: Length every sequence is padded / truncated to.

    Returns:
    - (input_ids, labels): two lists of equal length.
    """
    tokenized_input = tokenizer(
        sentence,
        is_split_into_words=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    input_ids = tokenized_input["input_ids"][0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Besides the three hard-coded markers, honour whatever the tokenizer itself
    # declares as special (padding in particular).
    special_tokens = {'<s>', '</s>', '<unk>'} | set(getattr(tokenizer, 'all_special_tokens', ()))

    updated_labels = []
    current_label_idx = 0

    for token in tokens:
        # Exactly one label is emitted per token, so the result always has the
        # same length as input_ids.
        if token in special_tokens or current_label_idx >= len(labels):
            updated_labels.append(-100)
            continue

        updated_labels.append(labels[current_label_idx])
        if not token.endswith('@@'):  # Last piece of the word: move to the next label
            current_label_idx += 1

    return input_ids, updated_labels

# Set the maximum length for tokenization.
# tokenize_and_adjust_labels pads with padding='max_length', so every stored example
# is padded to this many ids.
# NOTE(review): confirm 2000 does not exceed the maximum sequence length the model supports.
max_length = 2000  # Adjust this as needed

# Outputs for the training split
tokenized_train_inputs = []
adjusted_train_labels = []

# Outputs for the dev split
tokenized_dev_inputs = []
adjusted_dev_labels = []

# Tokenize every filtered training sentence and align its labels to the tokens
for sentence, labels in zip(processed_train_sentences, processed_train_labels_ids):
    input_ids, adjusted_labels = tokenize_and_adjust_labels(sentence, labels, CT_M3_Complete_tokenizer, max_length)
    tokenized_train_inputs.append(input_ids)
    adjusted_train_labels.append(adjusted_labels)

# Same for the dev split
for sentence, labels in zip(processed_dev_sentences, processed_dev_labels_ids):
    input_ids, adjusted_labels = tokenize_and_adjust_labels(sentence, labels, CT_M3_Complete_tokenizer, max_length)
    tokenized_dev_inputs.append(input_ids)
    adjusted_dev_labels.append(adjusted_labels)

In [None]:
# Print an example to verify
print("Original sentence:", processed_train_sentences[2])
print("Original labels:", processed_train_labels_ids[2])
print("\nTokenized input:", tokenized_train_inputs[2])
print("Adjusted labels:", adjusted_train_labels[2])

# Verify lengths
print("\nLength of tokenized input:", len(tokenized_train_inputs[2]))
print("Length of adjusted labels:", len(adjusted_train_labels[2]))

# Print some statistics.
# The sentence count comes from the list of sentences; `input_ids` only holds the
# ids of the last sentence tokenized, so its length is a token count.
original_sentence_count = len(processed_train_sentences)
tokenized_sentence_count = len(tokenized_train_inputs)
print(f"\nNumber of original sentences: {original_sentence_count}")
print(f"Number of tokenized sentences: {tokenized_sentence_count}")

# Average number of words per sentence vs. average number of ids per tokenized sentence
average_original_length = sum(len(s) for s in processed_train_sentences) / original_sentence_count
average_tokenized_length = sum(len(s) for s in tokenized_train_inputs) / tokenized_sentence_count
print(f"\nAverage original sentence length: {average_original_length:.2f}")
print(f"Average tokenized sentence length: {average_tokenized_length:.2f}")

In [None]:
# input_ids / adjusted_labels are whatever the most recent tokenization loop left behind,
# i.e. the values for the last sentence processed.
print(len(input_ids), len(adjusted_labels))  # Check lengths here

In [None]:
# Debug loop: re-tokenizes the whole training split only to print the two lengths per
# sentence. Nothing is stored; input_ids / adjusted_labels are overwritten on each pass.
for sentence, labels in zip(processed_train_sentences, processed_train_labels_ids):
    input_ids, adjusted_labels = tokenize_and_adjust_labels(sentence, labels, CT_M3_Complete_tokenizer, max_length=2000)
    print(f"Input IDs shape: {len(input_ids)}")  # Print length of input IDs
    print(f"Adjusted labels shape: {len(adjusted_labels)}")  # Print length of adjusted labels

## Load the dataset

In [22]:
# Convert the tokenized ids and aligned labels to datasets.
# NOTE(review): only input_ids and labels are stored, with no attention_mask column;
# confirm the padded positions are handled as intended downstream.
tokenized_train = Dataset.from_dict({
    "input_ids": tokenized_train_inputs,
    "labels": adjusted_train_labels
})
tokenized_dev = Dataset.from_dict({
    "input_ids": tokenized_dev_inputs,
    "labels": adjusted_dev_labels
})

# Set up label mapping: collect every distinct tag seen in the train and dev splits
all_labels = set()

for labels in train_labels + dev_labels:
    all_labels.update(labels)

# Build the sorted tag list FIRST so the two mappings below are derived from the
# values computed in this cell rather than from a `label_list` left over from an
# earlier cell.
label_list = sorted(all_labels)
labels = sorted(all_labels)

label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for l, i in label2id.items()}

# Model

## Model configuration

In [None]:
# Hub identifier of the pretrained checkpoint to fine-tune
model_name = "crisistransformers/CT-M3-Complete"

# Update model configuration: size the classification head to our tag set and
# attach the tag <-> id mappings
config = AutoConfig.from_pretrained(model_name)
config.num_labels = len(label_list)
config.id2label = id2label
config.label2id = label2id

# Load the model with a token-classification head, plus its tokenizer
# (this rebinds CT_M3_Complete_tokenizer, which was already loaded in an earlier cell)
CT_M3_Complete_model = AutoModelForTokenClassification.from_pretrained(model_name, config=config)
CT_M3_Complete_tokenizer = AutoTokenizer.from_pretrained(model_name)

In [26]:
def compute_metrics(p):
    """
    Compute weighted precision, recall and F1 over all labelled tokens.

    Args:
    - p: A (predictions, labels) pair. `predictions` holds per-class scores with
      the class axis at index 2; `labels` holds integer label ids, where -100
      marks positions to ignore.

    Returns:
    - Dict with the keys "precision", "recall" and "f1".

    NOTE(review): average='weighted' is computed over every tag present,
    presumably including 'O' — confirm this is the intended headline metric.
    """
    predictions, labels = p
    print(f"Predictions shape: {predictions.shape}, Labels shape: {labels.shape}")
    predictions = np.argmax(predictions, axis=2)

    # Flatten in a single pass, skipping ignored positions. Appending to two flat
    # lists avoids the quadratic cost of concatenating per-sentence lists.
    true_labels = []
    true_predictions = []
    for prediction, label in zip(predictions, labels):
        for predicted_id, label_id in zip(prediction, label):
            if label_id != -100:
                true_labels.append(label_list[label_id])
                true_predictions.append(label_list[predicted_id])

    results = precision_recall_fscore_support(true_labels, true_predictions, average='weighted')
    return {
        "precision": results[0],
        "recall": results[1],
        "f1": results[2],
    }

## Training

In [27]:
# Set up model and tokenizer configuration.
# NOTE: this repeats the configuration built in the "Model configuration" cell; the
# `config` object created here is not passed to anything else in this cell.
model_name = "crisistransformers/CT-M3-Complete"
config = AutoConfig.from_pretrained(model_name)
config.num_labels = len(label_list)
config.id2label = id2label
config.label2id = label2id

In [None]:
# Set up data collator
data_collator = DataCollatorForTokenClassification(tokenizer=CT_M3_Complete_tokenizer, padding=True)


class ShapeLoggingTrainer(Trainer):
    """
    Debugging aid: a Trainer that prints the batch tensor shapes before each training step.

    It is kept under its own name so that the trainer *instance* below can be called
    `CustomTrainer` without overwriting this class. To enable the shape logging,
    build the trainer from ShapeLoggingTrainer instead of Trainer.
    """

    def training_step(self, model, inputs, *args, **kwargs):
        # Print input shapes here
        print(f"Inputs: {inputs['input_ids'].shape}, Labels: {inputs['labels'].shape}")
        # Extra positional/keyword arguments are forwarded untouched so the override
        # stays compatible with whatever the installed Trainer passes.
        return super().training_step(model, inputs, *args, **kwargs)


# Set up training arguments
training_args = TrainingArguments(
    output_dir="data/kaggle/CrisisTransformers",
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # Reduced batch size
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,  # Accumulate gradients
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    optim="adamw_torch",  # Use PyTorch's AdamW implementation
    logging_steps=100,  # Reduce logging frequency
    save_total_limit=2,  # Keep only the last 2 checkpoints
    report_to='none',  # Disable logging to wandb
)

# Set up trainer.
# The instance is named `CustomTrainer` because the following cells call
# CustomTrainer.train(), CustomTrainer.evaluate() and CustomTrainer.save_model().
CustomTrainer = Trainer(
    model=CT_M3_Complete_model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
    tokenizer=CT_M3_Complete_tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Sanity check before training: number of examples, and the id/label lengths of the
# first example (the two lengths should be equal).
print(f"Training dataset size: {len(tokenized_train_inputs)}")
print(f"First training input shape: {len(tokenized_train_inputs[0])}")
print(f"First training label shape: {len(adjusted_train_labels[0])}")

In [None]:
# Start training (CustomTrainer is the trainer instance built in the previous code cell)
CustomTrainer.train()

## Evaluate the model

In [None]:
# Evaluate the model on the eval dataset given to the trainer and print the metrics
eval_results = CustomTrainer.evaluate()
print(eval_results)

In [None]:
# After training: directory that receives the model and everything needed to reload it
output_dir = "data/kaggle/working/results"

# Save the model
CustomTrainer.save_model(output_dir)

# Save the tokenizer
CT_M3_Complete_tokenizer.save_pretrained(output_dir)

# Save training arguments
with open(f"{output_dir}/training_args.json", 'w') as f:
    json.dump(training_args.to_dict(), f)

# Save label mappings.
# NOTE: JSON object keys are always strings, so the integer keys of id2label are
# written as strings and come back as strings when the file is loaded.
with open(f"{output_dir}/label_mappings.json", 'w') as f:
    json.dump({"label2id": label2id, "id2label": id2label}, f)

print(f"Model and associated files saved to {output_dir}")

In [None]:
# Load the fine-tuned model and tokenizer back from disk; predict() uses these
# module-level `model` and `tokenizer` names.
model = AutoModelForTokenClassification.from_pretrained("data/kaggle/working/results")
tokenizer = AutoTokenizer.from_pretrained("data/kaggle/working/results")

# Load label mappings
with open("data/kaggle/working/results/label_mappings.json", 'r') as f:
    label_mappings = json.load(f)

# NOTE: this rebinds the global id2label to the JSON version, whose keys are strings
# ("0", "1", ...); predict() looks labels up with str(...) accordingly.
id2label = label_mappings["id2label"]
print(id2label)

# Submission

In [38]:
def merge_subwords_and_locations(tokens_and_labels):
    """
    Merge '@@'-suffixed sub-word pieces into words and consecutive location words
    into single location phrases.

    Args:
    - tokens_and_labels: Iterable of (token, label) pairs. A token ending in '@@'
      continues into the next token.

    Returns:
    - List of (text, label) tuples. Each assembled location phrase (words joined by
      spaces) is emitted with the label 'B-LOC'; non-location words keep their own
      label. Pieces left over after a trailing '@@' token are discarded.
    """
    merged = []            # (text, label) pairs in output order
    pieces = []            # sub-word pieces of the word being assembled
    piece_labels = []      # labels of those pieces
    pending_location = []  # words of the location phrase being assembled
    priority_order = ['B-LOC', 'I-LOC', 'E-LOC', 'S-LOC', 'O']

    def flush_location():
        # Emit the phrase collected so far (if any) as one 'B-LOC' entry.
        if pending_location:
            merged.append((' '.join(pending_location), 'B-LOC'))
            pending_location.clear()

    for token, label in tokens_and_labels:
        continues = token.endswith('@@')
        pieces.append(token[:-2] if continues else token)
        piece_labels.append(label)
        if continues:
            continue

        # The word is complete: join its pieces and settle on one label.
        word = ''.join(pieces)
        if len(set(piece_labels)) == 1:
            word_label = piece_labels[0]
        else:
            # Pieces disagree: take the first label of the priority list that occurs.
            word_label = next(candidate for candidate in priority_order if candidate in piece_labels)
        pieces, piece_labels = [], []

        if not word_label.endswith('-LOC'):
            # A non-location word closes any open phrase and is emitted as-is.
            flush_location()
            merged.append((word, word_label))
        elif word_label in ('B-LOC', 'S-LOC'):
            # Starts a new phrase; an already open one is emitted first.
            flush_location()
            pending_location.append(word)
        elif word_label in ('I-LOC', 'E-LOC'):
            pending_location.append(word)
            if word_label == 'E-LOC':
                flush_location()

    # Emit a phrase that was still open at the end of the sequence.
    flush_location()

    return merged

# # Usage
# merged_result = merge_subwords_and_locations(predicted_tokens)

# # Extract locations
# locations = [word for word, label in merged_result if label == 'B-LOC']
# print("\nExtracted locations:", locations)

In [39]:
def predict(text):
    """
    Run the fine-tuned token classifier on one text and extract the predicted locations.

    Uses the module-level `tokenizer`, `model` and `id2label` (string-keyed, as
    loaded from the saved JSON mapping).

    Args:
    - text: The (already preprocessed) tweet text.

    Returns:
    - unique_locations: Alphabetically sorted list of distinct location phrases.
    - tokens: The non-special tokens, with a leading 'Ġ' stripped.
    - predictions: Tensor of predicted class ids for every token of the input.
    - predicted_tokens: List of (token, label) pairs for the non-special tokens.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(**inputs).logits

    predictions = torch.argmax(logits, dim=2)

    tokens = []
    predicted_tokens = []

    for token, prediction in zip(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0]), predictions[0]):
        # Skip special tokens
        if token in ['<s>', '</s>', '<unk>']:
            continue

        cleaned_token = token[1:] if token.startswith('Ġ') else token

        # A '##' piece is glued onto the previous token and keeps that token's label.
        # NOTE(review): the '##' prefix itself is kept in the glued text — confirm
        # whether it should be stripped if this branch is ever taken.
        if token.startswith('##'):
            if predicted_tokens:
                predicted_tokens[-1] = (predicted_tokens[-1][0] + cleaned_token, predicted_tokens[-1][1])
            continue

        tokens.append(cleaned_token)
        predicted_tokens.append((cleaned_token, id2label[str(prediction.item())]))

    # Merge sub-word pieces and multi-word locations; assembled locations carry 'B-LOC'
    merged_result = merge_subwords_and_locations(predicted_tokens)

    # Extract locations
    locations = [word for word, label in merged_result if label == 'B-LOC']

    # Extract unique locations and sort alphabetically
    unique_locations = sorted(set(locations))

    return unique_locations, tokens, predictions, predicted_tokens

In [None]:
# Load the test set that predictions will be generated for
test = pd.read_csv("data/kaggle/test-new.csv")

# Download the NLTK resources used by the notebook.
# NOTE(review): stopwords.words(...) and nltk.word_tokenize(...) are already called in
# earlier cells, so on a fresh environment these downloads come too late for them.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
# Additional search location for NLTK data
nltk.data.path.append('/usr/share/nltk_data/')

def preprocess_text(text):
    """
    Normalise a test tweet before prediction.

    Replaces URLs with a placeholder, removes user mentions and most special
    characters, tokenizes, and drops English stop words.

    NOTE(review): this differs from clean_text(), which is applied to the training
    data (that function deletes URLs, hashtags and digits) — confirm the mismatch
    between training and inference preprocessing is intended.

    Args:
    - text: The raw tweet text. Anything that is not a string (e.g. the float NaN
      pandas uses for a missing value) is treated as empty text.

    Returns:
    - The remaining tokens joined by single spaces ('' when nothing is left).
    """
    # A missing value arrives as float NaN and would make re.sub raise a TypeError.
    if not isinstance(text, str):
        return ''

    # Replace URLs with a placeholder. The angle brackets are removed again by the
    # special-character filter below, leaving the bare word "URL".
    text = re.sub(r'http\S+|www\S+|https\S+', '<URL>', text, flags=re.MULTILINE)

    # Remove user mentions
    text = re.sub(r'@\w+', '', text)

    # Remove special characters, keeping letters, digits, whitespace and . / - _
    text = re.sub(r'[^a-zA-Z0-9\s\./\-_]', '', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords (case-sensitive comparison)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Join tokens back into a string
    return ' '.join(tokens)

# Apply preprocessing to every test tweet; the result is stored in a new column so
# the original 'text' column is kept unchanged.
test['processed_text'] = test['text'].apply(preprocess_text)

In [None]:
# One {'ID', 'Locations'} record per test tweet
submission = []

for index, row in test.iterrows():
    # Lightweight progress indicator
    if index % 100 == 0:
        print(f"Processing row {index}")

    # Named tweet_id rather than `id` so the builtin id() is not shadowed for the
    # rest of the kernel session.
    tweet_id = row['tweet_id']
    processed_text = row['processed_text']

    unique_locations, tokens, predictions, predicted_tokens = predict(processed_text)

    # Join locations with space, or use a single space if no locations
    locations_string = ' '.join(unique_locations) if unique_locations else ' '

    submission.append({'ID': tweet_id, 'Locations': locations_string})

# Create DataFrame from submission list
submission_df = pd.DataFrame(submission)

# Save to CSV
submission_df.to_csv('data/kaggle/submission.csv', index=False)

In [None]:
# Preview the first rows of the submission that was just written to disk
submission_df.head()