In [1]:
import re

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Step 1: Read the three training tables

# All three files are tab-separated, so they share the same read options.
_TSV_OPTIONS = {'sep': '\t'}
entities_df = pd.read_csv('entities_train.csv', **_TSV_OPTIONS)
abstracts_df = pd.read_csv('abstracts_train.csv', **_TSV_OPTIONS)
relations_df = pd.read_csv('relations_train.csv', **_TSV_OPTIONS)

# Cast 'abstract_id' to str in every table so the three frames agree on its dtype.
for _frame in (entities_df, abstracts_df, relations_df):
    _frame['abstract_id'] = _frame['abstract_id'].astype(str)

# Step 2: Shared preprocessing for the two tasks
# (entity detection and relationship extraction)

# One tokenizer instance is reused for every tokenization call below.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def insert_entity_tokens(row):
    """Wrap both entity mentions of *row* in marker tags inside its abstract.

    Every occurrence of ``row['entity_1_mention']`` becomes
    ``[ENTITY_1]mention[/ENTITY_1]`` and every occurrence of
    ``row['entity_2_mention']`` becomes ``[ENTITY_2]mention[/ENTITY_2]``.

    The abstract is scanned once, so text inserted for one entity is never
    re-scanned for the other. (Two chained ``str.replace`` calls would let a
    short second mention such as ``"N"`` match inside the ``[ENTITY_1]`` tag
    itself and corrupt it.)

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            ``'abstract'``, ``'entity_1_mention'`` and ``'entity_2_mention'``.

    Returns:
        The abstract with markers inserted. Mentions that are empty or not
        strings (e.g. NaN) are skipped; if both are skipped the abstract is
        returned unchanged.
    """
    abstract = row['abstract']

    # mention text -> tag name; if both mentions are identical, ENTITY_1 wins.
    tag_by_mention = {}
    for tag, column in (("ENTITY_1", 'entity_1_mention'), ("ENTITY_2", 'entity_2_mention')):
        mention = row[column]
        if isinstance(mention, str) and mention and mention not in tag_by_mention:
            tag_by_mention[mention] = tag

    if not tag_by_mention:
        return abstract

    # Longest alternative first, so a mention that is a prefix of the other
    # cannot pre-empt the longer match at the same position.
    ordered_mentions = sorted(tag_by_mention, key=len, reverse=True)
    pattern = "|".join(re.escape(mention) for mention in ordered_mentions)

    def _wrap(match):
        text = match.group(0)
        tag = tag_by_mention[text]
        return f"[{tag}]{text}[/{tag}]"

    return re.sub(pattern, _wrap, abstract)

# Add token markers
# NOTE(review): `merged_df` is never assigned anywhere in this cell, and the
# recorded run ends with "NameError: name 'merged_df' is not defined".
# It has to be built before this line - presumably by joining relations_df
# with abstracts_df and entities_df so that each row carries 'abstract',
# 'entity_1_mention' and 'entity_2_mention' (TODO: confirm join keys/columns).
merged_df['processed_abstract'] = merged_df.apply(insert_entity_tokens, axis=1)

# Step 3: Prepare datasets for both entity detection and relationship prediction
class EntityDetectionDataset(Dataset):
    """Torch dataset over tokenizer output, with optional per-example labels.

    Args:
        encodings: Mapping from field name (e.g. ``'input_ids'``) to a
            sequence holding one entry per example.
        labels: Optional sequence of labels aligned with the encodings.
            Leave as ``None`` for unlabeled (inference-only) data.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example *idx* as a dict of tensors (plus ``'labels'`` if set)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples."""
        if self.labels is not None:
            return len(self.labels)
        # len(self.encodings) would count the encoding *fields* (input_ids,
        # attention_mask, ...), not the examples, so measure one field instead.
        first_field = next(iter(self.encodings.values()), None)
        return 0 if first_field is None else len(first_field)

# Create entity detection task dataset
# Two label columns per example: the type of each of the two marked entities.
entity_labels = merged_df[['entity_1_type', 'entity_2_type']]
# 'processed_abstract' is already filled in right after insert_entity_tokens
# is defined, so the identical apply() is not repeated here.

X_entity = merged_df['processed_abstract']
y_entity = entity_labels

# 70 % train; the held-out 30 % is split 2:1 into validation (20 %) and test (10 %).
X_train, X_temp, y_train, y_temp = train_test_split(X_entity, y_entity, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=1/3, random_state=42)

# Tokenize the dataset for entity detection
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding="max_length", max_length=200)
val_encodings = tokenizer(X_val.tolist(), truncation=True, padding="max_length", max_length=200)
test_encodings = tokenizer(X_test.tolist(), truncation=True, padding="max_length", max_length=200)

# y_* are two-column DataFrames, and DataFrame has no .tolist(); go through the
# underlying array to get one [entity_1_type, entity_2_type] pair per example.
# NOTE(review): the dataset passes each label to torch.tensor(), which needs
# numeric values - confirm the type columns are already integer-encoded.
train_entity_dataset = EntityDetectionDataset(train_encodings, y_train.values.tolist())
val_entity_dataset = EntityDetectionDataset(val_encodings, y_val.values.tolist())
test_entity_dataset = EntityDetectionDataset(test_encodings, y_test.values.tolist())

# Step 4: Relationship prediction dataset
X_relation = merged_df['processed_abstract']

# `label_encoder` is read further down (num_labels of the relationship model,
# joblib.dump) but was never created, so fit it here on the relationship column.
# Encoding also guarantees contiguous integer class ids 0..k-1 for the model.
# NOTE(review): if 'relationship_label' is already integer-encoded upstream,
# this encoder only maps ints to ints - confirm which form the column has.
label_encoder = LabelEncoder()
y_relation = pd.Series(
    label_encoder.fit_transform(merged_df['relationship_label']),
    index=merged_df.index,
    name='relationship_label',
)

# Same 70 / 20 / 10 split and seed as the entity task.
X_train, X_temp, y_train, y_temp = train_test_split(X_relation, y_relation, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=1/3, random_state=42)

# Tokenize the dataset for relationship prediction
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding="max_length", max_length=200)
val_encodings = tokenizer(X_val.tolist(), truncation=True, padding="max_length", max_length=200)
test_encodings = tokenizer(X_test.tolist(), truncation=True, padding="max_length", max_length=200)

# The dataset class is task-agnostic (encodings + labels), so it is reused here.
train_relation_dataset = EntityDetectionDataset(train_encodings, y_train.tolist())
val_relation_dataset = EntityDetectionDataset(val_encodings, y_val.tolist())
test_relation_dataset = EntityDetectionDataset(test_encodings, y_test.tolist())

# Step 5: Build one sequence-classification model per task

# Entity detection: the output size is the number of label COLUMNS
# (entity_1_type, entity_2_type).
# NOTE(review): that is 2 regardless of how many distinct entity types exist -
# confirm this is the intended head size.
# NOTE(review): the [ENTITY_n] markers are not registered with the tokenizer as
# special tokens, so they are presumably split into word pieces - confirm.
n_entity_outputs = len(entity_labels.columns)
entity_model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=n_entity_outputs,
)

# Relationship prediction: one output per class known to the label encoder.
n_relation_classes = len(label_encoder.classes_)
relationship_model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=n_relation_classes,
)

# Step 6: Define compute_metrics function
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for a Trainer evaluation.

    Args:
        pred: Object exposing ``predictions`` (logits, or a tuple whose first
            element is the logits) and ``label_ids`` (true class ids).

    Returns:
        Dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``.
    """
    logits = pred.predictions
    # Some model configurations return extra outputs next to the logits.
    if isinstance(logits, tuple):
        logits = logits[0]

    labels = pred.label_ids
    preds = np.argmax(logits, axis=1)

    # zero_division=0 keeps the score at 0 for a class that is never predicted
    # (same value as the default) without emitting a warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Step 7: Training configuration for both tasks

# Everything except the output directory is identical for the two runs:
# evaluate and checkpoint once per epoch, then reload the checkpoint with the
# lowest eval_loss when training finishes.
_COMMON_TRAINING_KWARGS = dict(
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    logging_dir='./logs',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
)

training_args_entity = TrainingArguments(
    output_dir='./entity_results',
    **_COMMON_TRAINING_KWARGS,
)

training_args_relation = TrainingArguments(
    output_dir='./relation_results',
    **_COMMON_TRAINING_KWARGS,
)

# Step 8: Train entity detection model
trainer_entity = Trainer(
    model=entity_model,
    args=training_args_entity,
    train_dataset=train_entity_dataset,
    eval_dataset=val_entity_dataset,
    compute_metrics=compute_metrics
)

trainer_entity.train()

# Step 9: Train relationship prediction model
trainer_relation = Trainer(
    model=relationship_model,
    args=training_args_relation,
    train_dataset=train_relation_dataset,
    eval_dataset=val_relation_dataset,
    compute_metrics=compute_metrics
)

trainer_relation.train()

# Step 10: Evaluate models
# evaluate() without arguments scores the eval_dataset given to each Trainer,
# i.e. the validation split.
eval_entity_results = trainer_entity.evaluate()
eval_relation_results = trainer_relation.evaluate()

# Show the metrics; a bare assignment prints nothing in a notebook cell.
print("Entity detection (validation):", eval_entity_results)
print("Relationship prediction (validation):", eval_relation_results)

# Step 11: Save models
# Store the tokenizer next to BOTH models so either directory can be loaded on
# its own (previously only the relationship model directory had one).
entity_model.save_pretrained('./entity_detection_model')
tokenizer.save_pretrained('./entity_detection_model')
relationship_model.save_pretrained('./relationship_model')
tokenizer.save_pretrained('./relationship_model')

# Save the fitted LabelEncoder to a file
joblib.dump(label_encoder, 'label_encoder.pkl')

# Evaluate results and plot confusion matrices as you did before


2024-11-18 02:12:39.615709: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-18 02:12:39.635437: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-18 02:12:39.641096: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-18 02:12:39.658087: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


NameError: name 'merged_df' is not defined