In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
from sklearn.metrics import classification_report





In [2]:
# Load the embeddings from the .npy file
sentence_embeddings_array = np.load('/Users/jinlinchen/Documents/Study/HWR Berlin/Semester 2/Analytics Lab/Analytics Project/Database Part/data preprocessing for manual data/sentence_embeddings.npy')
entity_embeddings_array = np.load('/Users/jinlinchen/Documents/Study/HWR Berlin/Semester 2/Analytics Lab/Analytics Project/Database Part/data preprocessing for manual data/entity_embeddings.npy')

# Load the labels
df = pd.read_csv('/Users/jinlinchen/Documents/Study/HWR Berlin/Semester 2/Analytics Lab/Analytics Project/Database Part/manual_label - consolidated.csv')  # Adjust this line based on your actual data format
labels = df['class_ID'].tolist()

# Convert labels to be 0-indexed instead of 1-indexed because of the model's requirements
labels = [label - 1 for label in labels]

In [3]:
# Set random seed for reproducibility
random_seed = 42
np.random.seed(random_seed)
import torch
torch.manual_seed(random_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)
    
# Split the dataset into training and test sets
train_sentence_embeddings, test_sentence_embeddings, train_entity_embeddings, test_entity_embeddings, train_labels, test_labels = train_test_split( 
    sentence_embeddings_array, entity_embeddings_array, labels, test_size=0.2, random_state=random_seed)

In [4]:
# Define a Dataset class for creating the datasets for training and testing
class EntityDataset(Dataset):
    def __init__(self, sentence_embeddings, entity_embeddings, labels):
        self.sentence_embeddings = sentence_embeddings
        self.entity_embeddings = entity_embeddings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sentence_embedding = torch.tensor(self.sentence_embeddings[idx], dtype=torch.float)
        entity_embedding = torch.tensor(self.entity_embeddings[idx], dtype=torch.float)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return {
            "sentence_embedding": sentence_embedding,
            "entity_embedding": entity_embedding,
            "label": label
        }

# Create datasets
train_dataset = EntityDataset(train_sentence_embeddings, train_entity_embeddings, train_labels)
test_dataset = EntityDataset(test_sentence_embeddings, test_entity_embeddings, test_labels)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=0)

In [5]:
# Define a neural network model using PyTorch. 
# The model is designed to combine sentence embeddings and entity embeddings and then classify the class of the entity.

class EntityClassifier(nn.Module):
    def __init__(self, input_dim, num_labels):
        super(EntityClassifier, self).__init__()
        self.fc = nn.Linear(input_dim * 2, num_labels)

    def forward(self, sentence_embedding, entity_embedding):
        combined_embedding = torch.cat((sentence_embedding, entity_embedding), dim=1)
        logits = self.fc(combined_embedding)
        return logits


input_dim = sentence_embeddings_array.shape[1]
model = EntityClassifier(input_dim=input_dim, num_labels=len(np.unique(labels)))

# Determine to use CPU or GPU, sometimes GPU is not available or the memory is not enough. 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

EntityClassifier(
  (fc): Linear(in_features=1536, out_features=3, bias=True)
)

In [6]:
# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=2e-5)

# Define a training function
def train(model, dataloader, optimizer, criterion, device):
    model.train()
    for batch in dataloader:
        sentence_embedding = batch["sentence_embedding"].to(device)
        entity_embedding = batch["entity_embedding"].to(device)
        labels = batch["label"].to(device)
        
        optimizer.zero_grad()
        
        outputs = model(sentence_embedding, entity_embedding)
        loss = criterion(outputs, labels)
        
        loss.backward()
        optimizer.step()

# Traine model
for epoch in range(3):  # Train for 3 epochs
    train(model, train_loader, optimizer, criterion, device)

In [7]:
# Difine function for evaluation
def evaluate(model, dataloader, device):
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            sentence_embedding = batch["sentence_embedding"].to(device)
            entity_embedding = batch["entity_embedding"].to(device)
            labels = batch["label"].to(device)
            
            outputs = model(sentence_embedding, entity_embedding)
            preds = torch.argmax(outputs, dim=1)
            
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    print(classification_report(all_labels, all_preds))

# Evaluate the model on the test set
evaluate(model, test_loader, device)

              precision    recall  f1-score   support

           0       1.00      0.06      0.11        18
           1       0.00      0.00      0.00         9
           2       0.38      0.62      0.47        24

    accuracy                           0.31        51
   macro avg       0.46      0.23      0.19        51
weighted avg       0.53      0.31      0.26        51

