# no use

# new test

**Working code:**

In [26]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import numpy as np
import pickle
from gensim.models import KeyedVectors
from sklearn.metrics import classification_report
from datasets import load_dataset
from torch.utils.data import TensorDataset
import gensim.downloader as api

# Define the RNNModel class
class RNNModel(nn.Module):
    """Sequence tagger: an ``nn.RNN`` followed by a per-time-step linear layer.

    Args:
        input_size: Size of the last dimension of the input vectors.
        hidden_dim: Number of features in the RNN hidden state.
        output_dim: Number of scores produced for every time step.
        n_layers: Number of stacked recurrent layers.
        dropout: Dropout probability applied between stacked recurrent
            layers. ``nn.RNN`` never applies dropout after the last layer,
            so the value is only forwarded when ``n_layers > 1``.
    """

    def __init__(self, input_size, hidden_dim, output_dim, n_layers=1, dropout=0.3):
        super().__init__()
        # With a single layer the dropout argument has no effect and only
        # makes nn.RNN emit a UserWarning, so forward it only when stacked.
        effective_dropout = dropout if n_layers > 1 else 0.0
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=effective_dropout,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, input_vectors):
        """Return per-time-step scores for a batch of sequences.

        Args:
            input_vectors: Batch-first input, i.e. laid out as
                (batch, sequence, input_size) because the RNN is built
                with ``batch_first=True``.

        Returns:
            Tensor whose last dimension has ``output_dim`` entries, one
            score vector per time step.
        """
        # The final hidden state is not needed; every time step is scored.
        rnn_out, _ = self.rnn(input_vectors)
        return self.fc(rnn_out)

# Corrected function for preparing data
def prepare_data(dataset, word2vec_model, label_encoding, max_len=128):
    """Turn tokenised examples into a fixed-length ``TensorDataset``.

    Every example is padded or truncated to ``max_len`` positions.

    Args:
        dataset: Iterable of mappings with a ``'tokens'`` sequence and a
            ``'ner_tags'`` sequence.
        word2vec_model: Embedding lookup supporting ``token in model``,
            ``model[token]`` and a ``vector_size`` attribute.
        label_encoding: Mapping from tag to integer class id. Tags missing
            from the mapping are encoded as -1.
        max_len: Number of positions kept per example.

    Returns:
        ``TensorDataset`` of two tensors: float32 features shaped
        (examples, max_len, vector_size) and int64 labels shaped
        (examples, max_len). Out-of-vocabulary tokens and padding
        positions are all-zero vectors; padding labels are -1. An empty
        ``dataset`` yields an empty dataset with the same trailing shape.
    """
    vector_size = word2vec_model.vector_size
    feature_rows = []
    label_rows = []

    for example in dataset:
        # Start from zeros so OOV tokens and padding need no extra work,
        # and keep everything float32 instead of mixing in float64 zeros.
        vectors = np.zeros((max_len, vector_size), dtype=np.float32)
        for position, token in enumerate(example['tokens'][:max_len]):
            if token in word2vec_model:
                vectors[position] = word2vec_model[token]
        feature_rows.append(vectors)

        # -1 marks both padding and tags absent from label_encoding.
        tag_ids = np.full(max_len, -1, dtype=np.int64)
        for position, tag in enumerate(example['ner_tags'][:max_len]):
            tag_ids[position] = label_encoding.get(tag, -1)
        label_rows.append(tag_ids)

    if feature_rows:
        # Stack once in NumPy; building a tensor from a list of arrays
        # is the slow path PyTorch warns about.
        features = np.stack(feature_rows)
        targets = np.stack(label_rows)
    else:
        features = np.zeros((0, max_len, vector_size), dtype=np.float32)
        targets = np.zeros((0, max_len), dtype=np.int64)

    return TensorDataset(torch.from_numpy(features), torch.from_numpy(targets))

def train_model(model, train_loader, num_epochs=10, lr=0.001):
    """Fit ``model`` in place with Adam and token-level cross-entropy.

    Args:
        model: Module called on each input batch; its output's last
            dimension is treated as the class scores.
        train_loader: Iterable yielding ``(input_vectors, labels)`` batches.
        num_epochs: Number of passes over ``train_loader``.
        lr: Learning rate handed to Adam.

    After every epoch the sum of the batch losses is printed. Positions
    labelled -1 do not contribute to the loss. Returns ``None``.
    """
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss(ignore_index=-1)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch_inputs, batch_targets in train_loader:
            optimizer.zero_grad()
            scores = model(batch_inputs)
            # Collapse every leading dimension so each position is one sample.
            num_classes = scores.shape[-1]
            batch_loss = loss_fn(
                scores.view(-1, num_classes),
                batch_targets.view(-1),
            )
            batch_loss.backward()
            optimizer.step()
            total_loss += batch_loss.item()
        print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

def evaluate_model(model, test_loader, label_encoding):
    """Print a per-label classification report for ``model``.

    Args:
        model: Module whose output's last dimension holds class scores.
        test_loader: Iterable yielding ``(input_vectors, labels)`` batches.
        label_encoding: Mapping from label name to integer class id; it is
            inverted to turn ids back into names for the report.

    Positions whose ground-truth label is -1 are left out of the report.
    """
    model.eval()
    predicted_batches = []
    target_batches = []
    with torch.no_grad():
        for batch_inputs, batch_targets in test_loader:
            scores = model(batch_inputs)
            # Highest-scoring class along the last dimension.
            predicted_batches.append(scores.argmax(dim=-1).cpu().numpy())
            target_batches.append(batch_targets.cpu().numpy())

    all_predictions = np.concatenate([batch.ravel() for batch in predicted_batches])
    all_targets = np.concatenate([batch.ravel() for batch in target_batches])

    # Keep only positions that carry a real label.
    keep = all_targets != -1
    id_to_label = {class_id: name for name, class_id in label_encoding.items()}
    predicted_names = [id_to_label[class_id] for class_id in all_predictions[keep]]
    target_names = [id_to_label[class_id] for class_id in all_targets[keep]]

    report = classification_report(target_names, predicted_names, zero_division=0)
    print(report)

# Load the pretrained 'word2vec-google-news-300' vectors through
# gensim's downloader (presumably fetched on first use -- confirm).
word2vec_model = api.load('word2vec-google-news-300')

# Load the dataset; only the 'train' split is used in this cell, the
# validation/test splits are left commented out.
dataset = load_dataset("surrey-nlp/PLOD-CW")
train_dataset = dataset['train']
#validation_dataset = dataset['validation']
#test_dataset = dataset['test']

# Tag -> class id. prepare_data encodes any tag missing from this mapping
# as -1, and train_model's loss ignores label -1.
label_encoding = {"B-O": 0, "B-AC": 1, "B-LF": 2, "I-LF": 3}

# Convert the split into fixed-length tensors (max_len defaults to 128).
train_data = prepare_data(train_dataset, word2vec_model, label_encoding)
#validation_data = prepare_data(validation_dataset, word2vec_model, label_encoding)
#test_data = prepare_data(test_dataset, word2vec_model, label_encoding)

# Batch the training data; it is reshuffled every epoch.
train_loader = data.DataLoader(train_data, batch_size=16, shuffle=True)
#validation_loader = data.DataLoader(validation_data, batch_size=16, shuffle=False)
#test_loader = data.DataLoader(test_data, batch_size=16, shuffle=False)

# Model dimensions: the input width follows the embedding size and there
# is one output score per label.
input_size = word2vec_model.vector_size
hidden_dim = 128
output_dim = len(label_encoding)

# Create and train the model (single RNN layer by default).
model = RNNModel(input_size, hidden_dim, output_dim)
train_model(model, train_loader, num_epochs=10)

# Evaluation is disabled here because test_loader is commented out above.
#evaluate_model(model, test_loader, label_encoding)

Epoch 1, Loss: 44.9711
Epoch 2, Loss: 32.7199
Epoch 3, Loss: 29.2735
Epoch 4, Loss: 27.8323
Epoch 5, Loss: 26.8528
Epoch 6, Loss: 25.8607
Epoch 7, Loss: 25.2012
Epoch 8, Loss: 24.5287
Epoch 9, Loss: 23.5397
Epoch 10, Loss: 23.2615
              precision    recall  f1-score   support

        B-AC       0.72      0.54      0.62       270
        B-LF       0.27      0.11      0.16       150
         B-O       0.90      0.95      0.93      4292
        I-LF       0.39      0.27      0.32       288

    accuracy                           0.87      5000
   macro avg       0.57      0.47      0.51      5000
weighted avg       0.84      0.87      0.85      5000



In [27]:
def save_model(model, word2vec_model, label_encoding, file_path):
    """Pickle the model's hyper-parameters, weights and lookup tables.

    Args:
        model: Object exposing an ``rnn`` sub-module (read for input size,
            hidden size, layer count and dropout), an ``fc`` layer (read
            for ``out_features``) and ``state_dict()``.
        word2vec_model: Embedding model, stored as-is in the file.
        label_encoding: Label mapping, stored as-is in the file.
        file_path: Destination path; the file is opened in binary write
            mode, so an existing file is overwritten.
    """
    recurrent = model.rnn

    payload = {}
    payload['input_size'] = recurrent.input_size
    payload['hidden_dim'] = recurrent.hidden_size
    payload['output_dim'] = model.fc.out_features
    payload['n_layers'] = recurrent.num_layers
    payload['dropout'] = recurrent.dropout
    payload['state_dict'] = model.state_dict()
    # The embedding model and label map travel with the weights so the
    # file is self-contained.
    payload['word2vec_model'] = word2vec_model
    payload['label_encoding'] = label_encoding

    with open(file_path, 'wb') as out_file:
        pickle.dump(payload, out_file)

In [28]:
# Pickle the trained model's settings and weights, together with the
# embedding model and label map, into 'model.pkl'.
save_model(model, word2vec_model, label_encoding, 'model.pkl')

In [None]:
#def load_model(file_path):
 #   with open(file_path, 'rb') as f:
  #      model_data = pickle.load(f)
#
 #   # Create the RNNModel instance
  #  model = RNNModel(model_data['input_size'], model_data['hidden_dim'], model_data['output_dim'])
#
 #   # Load the state_dict
  #  model.load_state_dict(model_data['state_dict'])
#
 #   # Retrieve word2vec_model and label_encoding
  #  word2vec_model = model_data['word2vec_model']
   # label_encoding = model_data['label_encoding']
#
 #   return model, word2vec_model, label_encoding


In [30]:
#model, word2vec_model, label_encoding = load_model('model.pkl')
#example_tokens = [ "KO", ",", "knockout", ";", "PSD", ",", "postsynaptic", "density", "." ]
#predicted_labels = predict(example_tokens, model, word2vec_model, label_encoding)
#print(predicted_labels)

['B-AC', 'B-O', 'B-LF', 'B-O', 'B-AC', 'B-O', 'B-LF', 'I-LF', 'B-O']
