In [6]:
from gensim.models import FastText

In [1]:
from datasets import load_dataset

# Load the CoNLL 2003 dataset
dataset = load_dataset('conll2003')


  from .autonotebook import tqdm as notebook_tqdm
Downloading builder script: 100%|█████████████████████████████████████████████████████████| 9.57k/9.57k [00:00<?, ?B/s]
Downloading metadata: 100%|███████████████████████████████████████████████████████████████| 3.73k/3.73k [00:00<?, ?B/s]
Downloading readme: 100%|█████████████████████████████████████████████████████████████████| 12.3k/12.3k [00:00<?, ?B/s]
Downloading data: 100%|██████████████████████████████████████████████████████████████| 983k/983k [00:01<00:00, 856kB/s]
Generating train split: 100%|███████████████████████████████████████████| 14041/14041 [00:01<00:00, 9789.68 examples/s]
Generating validation split: 100%|████████████████████████████████████████| 3250/3250 [00:00<00:00, 8976.28 examples/s]
Generating test split: 100%|██████████████████████████████████████████████| 3453/3453 [00:00<00:00, 9776.00 examples/s]


In [13]:
# Create a FastText model
fastmodel = FastText(dataset['train']['tokens'])


In [118]:
import torch
from torch import nn
from torch.optim import Adam

class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleRNN, self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        # Create an initial hidden state of zeros
        h0 = torch.zeros(1, x.size(0), self.hidden_size).to(x.device) 
        out, _ = self.rnn(x.unsqueeze(1), h0)
        out = self.fc(out.squeeze(1))
        return out

flat_labels = [label for sublist in dataset['train']['ner_tags'] for label in sublist]

# Get the unique ner_tags
unique_labels = set(flat_labels)
# Initialize the model
model = SimpleRNN(input_size=100, hidden_size=32, output_size=len(unique_labels))

# Define a loss function and an optimizer
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)

# Convert words to embeddings
inputs_train = [fastmodel.wv[word] for word in dataset['train']['tokens']]
inputs_train = [torch.tensor(sentence).float() for sentence in inputs_train]
labels_train = dataset['train']['ner_tags']
labels_train = [torch.tensor(label) for label in labels_train]


inputs_valid = [fastmodel.wv[word] for word in dataset['validation']['tokens']]
inputs_valid = [torch.tensor(sentence).float() for sentence in inputs_valid]
labels_valid = dataset['validation']['ner_tags']
labels_valid = [torch.tensor(label) for label in labels_valid]

inputs_test = [fastmodel.wv[word] for word in dataset['test']['tokens']]
inputs_test = [torch.tensor(sentence).float() for sentence in inputs_test]
labels_test = dataset['test']['ner_tags']
labels_test = [torch.tensor(label) for label in labels_test]

In [120]:
from sklearn.metrics import classification_report

# Train the model
for epoch in range(100):  # number of epochs
    true_labels = []
    pred_labels = []
    for i in range(len(inputs_train)):  # number of sentences in the dataset
        # Forward pass
        outputs = model.forward(inputs_train[i])
        
        # Calculate the loss
        loss = criterion(outputs, labels_train[i])
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    if (epoch+1) % 10 == 0:
        print(f'Epoch {epoch+1}/{100}, Loss: {loss.item()}')
        true_labels = []
        pred_labels = []
        for i in range(len(inputs_valid)):  # number of sentences in the dataset
            # Forward pass
            outputs = model.forward(inputs_valid[i])
            _, predicted = torch.max(outputs, 1)
            # Add the true and predicted labels to their respective lists
            true_labels.append(labels_valid[i].tolist())
            pred_labels.append(predicted.tolist())

        true_labels_flat = [label for sublist in true_labels for label in sublist]
        pred_labels_flat = [label for sublist in pred_labels for label in sublist]
        
        print(classification_report(true_labels_flat, pred_labels_flat, zero_division=1))
        

Epoch 10/100, Loss: 0.8844480514526367
              precision    recall  f1-score   support

           0       0.93      0.95      0.94     42759
           1       0.30      0.26      0.28      1842
           2       0.14      0.52      0.22      1307
           3       0.33      0.02      0.04      1341
           4       1.00      0.00      0.00       751
           5       0.59      0.32      0.41      1837
           6       1.00      0.00      0.00       257
           7       0.85      0.02      0.05       922
           8       1.00      0.00      0.00       346

    accuracy                           0.82     51362
   macro avg       0.68      0.23      0.22     51362
weighted avg       0.86      0.82      0.81     51362

Epoch 20/100, Loss: 0.9995336532592773
              precision    recall  f1-score   support

           0       0.93      0.95      0.94     42759
           1       0.30      0.27      0.29      1842
           2       0.15      0.56      0.24      1307


In [123]:
true_labels = []
pred_labels = []
for i in range(len(inputs_test)):  # number of sentences in the dataset
    # Forward pass
    outputs = model.forward(inputs_test[i])
    _, predicted = torch.max(outputs, 1)
    # Add the true and predicted labels to their respective lists
    true_labels.append(labels_test[i].tolist())
    pred_labels.append(predicted.tolist())
            
true_labels_flat = [label for sublist in true_labels for label in sublist]
pred_labels_flat = [label for sublist in pred_labels for label in sublist]
        
print(classification_report(true_labels_flat, pred_labels_flat, zero_division=1))

              precision    recall  f1-score   support

           0       0.94      0.95      0.94     38323
           1       0.31      0.29      0.30      1617
           2       0.15      0.61      0.24      1156
           3       0.42      0.09      0.15      1661
           4       0.33      0.05      0.08       835
           5       0.69      0.39      0.50      1668
           6       0.86      0.07      0.14       257
           7       0.44      0.10      0.17       702
           8       0.19      0.02      0.03       216

    accuracy                           0.83     46435
   macro avg       0.48      0.29      0.28     46435
weighted avg       0.85      0.83      0.82     46435

