### The input file is the output file of "Maritime_Annotation_method.ipynb", which is "final_severities_with_scores.csv"

In [1]:
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report

# All tensor work in this notebook is pinned to the CPU
device = torch.device("cpu")
print(f"Using device: {device}")

# Step 1: Read the CSV written by "Maritime_Annotation_method.ipynb"
file_path = "final_severities_with_scores.csv"
data = pd.read_csv(file_path)

# Text column; empty cells are replaced by the literal string "missing"
texts = data['def_text'].fillna("missing").tolist()

# Severity column, mapped to integer class ids (the encoder is kept so that
# predictions can be mapped back with inverse_transform later on)
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(data['Final Severity'].tolist())

# Sentence-BERT encoder; named sentence_model so it cannot be confused with
# the classifier, which is bound to `model`
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate text embeddings
def generate_embeddings(texts, model):
    """Encode *texts* with *model* and return the result as a NumPy array.

    Args:
        texts: The strings handed to ``model.encode``.
        model: Object exposing ``encode(texts, convert_to_tensor=...,
            show_progress_bar=...)``; this notebook passes a
            ``SentenceTransformer``.

    Returns:
        The encoded tensor after ``.cpu().numpy()``.
    """
    print("Generating text embeddings...")
    encoded = model.encode(
        texts,
        show_progress_bar=True,
        convert_to_tensor=True,
    )
    # .numpy() needs the tensor in host memory, so move it to the CPU first.
    host_tensor = encoded.cpu()
    return host_tensor.numpy()

embeddings = generate_embeddings(texts, sentence_model)

# Step 2: Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    embeddings,
    labels,
    test_size=0.2,
    random_state=42,
)

# Step 3: Define the neural network model
class SeverityClassifier(nn.Module):
    """Fully connected classifier: input_dim -> 128 -> 64 -> num_classes.

    A ReLU follows each of the first two linear layers; the last layer
    returns raw scores (no softmax is applied here).

    Args:
        input_dim: Size of each input feature vector.
        num_classes: Number of output scores per sample.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        # Attribute names and creation order are kept as fc1, fc2, fc3 so the
        # state_dict keys (and seeded initialisation) stay the same.
        self.fc1 = nn.Linear(in_features=input_dim, out_features=128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=num_classes)

    def forward(self, x):
        """Return the per-class scores for the batch *x*."""
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)

# Initialize the model, loss function, and optimizer
input_dim = embeddings.shape[1]  # The dimension of the embedding vector
num_classes = len(label_encoder.classes_)  # Number of unique severity classes
# Place the model on `device` explicitly: the training and evaluation loops
# move every batch there, so the parameters must live on the same device.
model = SeverityClassifier(input_dim, num_classes).to(device)

# Loss and optimizer (the optimizer is built after the move so it tracks the
# parameters on their final device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Data loader settings
batch_size = 32

# Custom Dataset class for PyTorch DataLoader
class SeverityDataset(Dataset):
    """Index-aligned pairing of feature vectors and labels for a DataLoader.

    Args:
        embeddings: Indexable collection of feature vectors.
        labels: Indexable collection of labels, aligned with *embeddings*.
    """

    def __init__(self, embeddings, labels):
        # Stored as given; conversion to tensors happens per item.
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        """Return the number of stored feature vectors."""
        return len(self.embeddings)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for *idx* as float32 / int64 tensors."""
        features = torch.tensor(self.embeddings[idx], dtype=torch.float32)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return features, target

# Wrap each split in a SeverityDataset and batch it.
# Only the training batches are reshuffled; the test order stays fixed.
train_dataset = SeverityDataset(X_train, y_train)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = SeverityDataset(X_test, y_test)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Step 4: Train the model
def train_model(model, train_loader, criterion, optimizer, epochs=10):
    """Train *model* in place and print the mean batch loss after each epoch.

    Args:
        model: The ``nn.Module`` to optimise; left in training mode.
        train_loader: Iterable of ``(inputs, labels)`` batches that also
            supports ``len()``.
        criterion: Callable ``criterion(outputs, labels)`` returning the loss.
        optimizer: Optimizer already bound to the parameters of *model*.
        epochs: Number of full passes over *train_loader*.

    Raises:
        ValueError: If *train_loader* contains no batches (the mean loss
            would otherwise be a division by zero).
    """
    num_batches = len(train_loader)
    if num_batches == 0:
        raise ValueError("train_loader yields no batches; nothing to train on")

    # Send each batch to wherever the model's parameters live, so inputs and
    # weights can never end up on different devices.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(target_device), labels.to(target_device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/num_batches:.4f}")

# Evaluation function
def evaluate_model(model, test_loader):
    """Print the accuracy and a per-class report for *model* on *test_loader*.

    Class names are read from the module-level ``label_encoder``.

    Args:
        model: The ``nn.Module`` to evaluate; switched to eval mode.
        test_loader: Iterable of ``(inputs, labels)`` batches.

    Raises:
        ValueError: If *test_loader* yields no samples (accuracy would
            otherwise be a division by zero).
    """
    # Send each batch to wherever the model's parameters live.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    correct = 0
    total = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(target_device), labels.to(target_device)

            predicted = torch.argmax(model(inputs), dim=1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            # Copy to host memory before leaving torch, so this also works
            # when the tensors are not on the CPU.
            all_preds.extend(predicted.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    if total == 0:
        raise ValueError("test_loader yields no samples; nothing to evaluate")

    accuracy = 100 * correct / total
    print(f"Accuracy: {accuracy:.2f}%")
    print("Classification Report:")
    # The report formats names with len(), so make sure they are strings.
    class_names = [str(name) for name in label_encoder.classes_]
    # Passing `labels` explicitly keeps the report aligned with `target_names`
    # even when some class never shows up in this split; without it
    # classification_report raises on the length mismatch.
    print(
        classification_report(
            all_labels,
            all_preds,
            labels=list(range(len(class_names))),
            target_names=class_names,
        )
    )

# Train the model
train_model(model, train_loader, criterion, optimizer, epochs=10)
evaluate_model(model, test_loader)

# Step 5: Save the trained model (weights only, via its state_dict)
model_path = "trained_model.pth"
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")

# Step 6: Load the trained model for inference
model = SeverityClassifier(input_dim, num_classes)
# map_location remaps the stored tensors onto `device`, so the checkpoint
# loads regardless of the device it was saved from.
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)
model.eval()  # Set the model to evaluation mode

# Step 7: Perform prediction on new test data
def predict(model, test_data):
    """Return the index of the highest-scoring class for each row of *test_data*.

    Args:
        model: The ``nn.Module`` used for scoring; switched to eval mode.
        test_data: Array-like (nested list, NumPy array or tensor) of feature
            rows; converted to a float32 tensor.

    Returns:
        NumPy array holding one predicted class index per input row.
    """
    # Build the input on the same device as the model's parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    # as_tensor avoids a needless copy and accepts an existing tensor without
    # the warning torch.tensor() emits for that case.
    test_tensor = torch.as_tensor(test_data, dtype=torch.float32, device=target_device)
    with torch.no_grad():
        outputs = model(test_tensor)
        # .cpu() before .numpy(): NumPy conversion requires host memory.
        predicted_labels = torch.argmax(outputs, dim=1).cpu().numpy()
    return predicted_labels

# Input and output locations for the inference run
test_file_path = "psc_severity_test.csv"  # CSV to annotate
output_file_path = "psc_severity_test_with_annotations.csv"  # Step 8 target

# Read the test CSV; empty text cells become the literal string "missing"
test_data = pd.read_csv(test_file_path)
test_texts = test_data['def_text'].fillna("missing").tolist()

# Embed the test texts with the same Sentence-BERT model used for training
test_embeddings = generate_embeddings(test_texts, sentence_model)

# Classify the embeddings, then translate the class ids back to the
# original severity labels through the fitted encoder
predicted_labels = predict(model, test_embeddings)
annotation_severity = label_encoder.inverse_transform(predicted_labels)

# Attach the predictions as a new column
test_data['annotation_severity'] = annotation_severity

# Step 8: Write the annotated table out, without the index column
test_data.to_csv(output_file_path, index=False)

print(f"Annotated test data saved to {output_file_path}")


Using device: cpu
Generating text embeddings...


Batches:   0%|          | 0/61 [00:00<?, ?it/s]

Epoch 1/10, Loss: 1.0143
Epoch 2/10, Loss: 0.9501
Epoch 3/10, Loss: 0.9148
Epoch 4/10, Loss: 0.8810
Epoch 5/10, Loss: 0.8546
Epoch 6/10, Loss: 0.8393
Epoch 7/10, Loss: 0.8183
Epoch 8/10, Loss: 0.8019
Epoch 9/10, Loss: 0.7923
Epoch 10/10, Loss: 0.7704
Accuracy: 61.38%
Classification Report:


NameError: name 'classification_report' is not defined