In [1]:
import os
import spacy
import torch
import re
import random
import pandas as pd

from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

from spacy.training import Example
from spacy.util import minibatch, compounding


Cuda and stuff

In [2]:
# Check whether CUDA is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
x = torch.rand(3, 3).to(device)
print(f'Tensor is on: {x.device}')

Tensor is on: cuda:0


In [3]:
# print cuda version
print(torch.version.cuda)

12.1


In [4]:
# check the current working directory
#os.chdir('C:/Users/lamem/OneDrive/Documents/GHD/Microsoft-Learn-Location-Mention-Recognition-Challenge')
os.chdir('C:/Users/abelm/OneDrive/Documents/GitHub/Microsoft-Learn-Location-Mention-Recognition-Challenge')
print(os.getcwd())

C:\Users\abelm\OneDrive\Documents\GitHub\Microsoft-Learn-Location-Mention-Recognition-Challenge


# DATA

In [5]:
# get stop words
stop_words = set(stopwords.words('english'))

In [6]:
# helper function to clean text
def clean_text(text):
    
    # remove links
    text = re.sub(r'http\S+', '', text)
    # remove mentions
    text = re.sub(r'@\w+', '', text)
    # remove hashtags
    text = re.sub(r'#\w+', '', text)
    # remove digits
    text = re.sub(r'\d+', '', text)
    # remove html tags
    text = re.sub('r<.*?>', '', text)
    # remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # remove stopwords
    text = ' '.join(word for word in text.split() if word.lower() not in stop_words)
    # remove symbols
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    
    # return cleaned text
    return text

In [7]:
# load train data
train = pd.read_csv("lewa/Train_1.csv")

# drop empty text columns
train = train.dropna(subset=['text'])

# clean text
train['text'] = train['text'].apply(lambda x: clean_text(x))

# add an empty string to the location column with missing values (Nan)
train['location'] = train['location'].fillna(' ')

train.head()

Unnamed: 0,tweet_id,text,location
1,ID_1001136696589631488,Flash floods struck Maryland city Sunday washi...,Maryland
2,ID_1001136950345109504,State emergency declared Maryland flooding via,Maryland
3,ID_1001137334056833024,parts Maryland also saw significant damage Sun...,Baltimore Maryland
4,ID_1001138374923579392,Catastrophic Flooding Slams Ellicott City Mary...,Ellicott City Maryland
5,ID_1001138377717157888,WATCH missing flash devastates Ellicott City M...,Ellicott City Maryland


In [8]:
# Convert DataFrame to spaCy's training format
TRAIN_DATA = []
for index, row in train.iterrows():
    text = row['text']
    location_str = row['location']
    
    if location_str:
        locations = location_str.split()
        entities = []
        used_indices = set()  # Track used indices to avoid overlaps
        
        for location in locations:
            start_idx = text.find(location)
            end_idx = start_idx + len(location)
            if start_idx != -1 and not any(idx in used_indices for idx in range(start_idx, end_idx)):
                entities.append((start_idx, end_idx, "GPE"))
                used_indices.update(range(start_idx, end_idx))  # Mark indices as used

        if entities:
            TRAIN_DATA.append((text, {"entities": entities}))
    else:
        TRAIN_DATA.append((text, {"entities": []}))

# SPACY MODEL

In [9]:
# Step 1: Load the pre-trained model
nlp = spacy.load("en_core_web_lg")

In [None]:
# Step 2: Create the training pipeline
if "ner" not in nlp.pipe_names:
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)
else:
    ner = nlp.get_pipe("ner")

ner.add_label("GPE")  # Add your custom labels here

# Step 3: Split the data into training and validation sets
train_data, val_data = train_test_split(TRAIN_DATA, test_size=0.2, random_state=42)

# Step 4: Fine-tune the model with monitoring
# Set parameters for the training
n_iter = 10  # Number of training iterations
batch_sizes = compounding(1.0, 4.0, 1.001)

# Track training losses and evaluation metrics
train_losses = []
val_losses = []
val_correct = 0
val_total = 0
val_gold_total = 0

# Start training
for epoch in range(n_iter):
    random.shuffle(train_data)  # Shuffle the training data

    losses = {}
    # Create minibatches for training
    for batch in minibatch(train_data, size=8):  # Adjust size as needed
        for text, annotations in batch:
            # Create an Example object
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            
            # Update the model and collect losses
            nlp.update([example], drop=0.5, losses=losses)  # Dropout for regularization
    
    # Append the training loss for monitoring
    train_losses.append(losses["ner"])  # Append NER loss

    # Validation step
    val_loss = 0  # Reset validation loss for each epoch
    for text, annotations in val_data:
        doc = nlp(text)  # Process the document
        predicted_entities = {(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents}
        gold_entities = {(start, end, label) for start, end, label in annotations.get("entities")}

        val_correct += len(predicted_entities & gold_entities)  # Count the correctly predicted entities
        val_total += len(predicted_entities)  # Count the number of entities detected by the model
        val_gold_total += len(gold_entities)  # Count the number of gold-standard entities

    # Calculate precision, recall, and F1-score for the validation set
    precision = val_correct / val_total if val_total > 0 else 0
    recall = val_correct / val_gold_total if val_gold_total > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # Print epoch results
    print(f"Epoch {epoch + 1}/{n_iter}, Train Loss: {losses.get('ner', 0):.3f}, "
        f"Val Precision: {precision:.2f}, Val Recall: {recall:.2f}, Val F1 Score: {f1_score:.2f}")
    
# Step 5: Save the fine-tuned model
nlp.to_disk("spacy/fine_tuned_model")

print("Model training complete and saved as 'fine_tuned_model'.")



Epoch 1/10, Train Loss: 7399.740, Val Precision: 0.83, Val Recall: 0.87, Val F1 Score: 0.85


## SUBMISSION BASELINE

In [None]:
# load the test data
test = pd.read_csv("lewa/Test.csv")

In [None]:
# clean text
test['text'] = test['text'].apply(lambda x: clean_text(x))

In [38]:
# Function to extract locations from text using spaCy
def extract_locations(text):
    doc = nlp(text)
    locations = [ent.text for ent in doc.ents if ent.label_ == 'GPE']  # GPE stands for Geopolitical Entity (locations)
    return ' '.join(locations) if locations else ' ' # join locations with space if many

In [39]:
# Create a new dataframe with extracted locations
submission = pd.DataFrame({
    'tweet_id': test['tweet_id'],
    'locations': test['text'].apply(extract_locations)
})

In [40]:
submission.head()
# save the submission to a csv
submission.to_csv('lewa/spacy-baseline-submission.csv', index=False)

## FINED TUNED SUBMISSION

In [None]:
# load fine tuned model
nlp = spacy.load("spacy/fine_tuned_model")

# Function to extract locations from text using spaCy
def extract_locations(text):
    doc = nlp(text)
    locations = [ent.text for ent in doc.ents if ent.label_ == 'GPE']  # GPE stands for Geopolitical Entity (locations)
    return ' '.join(locations) if locations else ' ' # join locations with space if many

# Create a new dataframe with extracted locations
submission = pd.DataFrame({
    'tweet_id': test['tweet_id'],
    'locations': test['text'].apply(extract_locations)
})  

submission.head()
# save the submission to a csv
submission.to_csv('lewa/spacy-fine-tuned-submission.csv', index=False)