In [2]:
import os
import warnings
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification, create_optimizer, logging
from sklearn.model_selection import train_test_split
from tf_keras.callbacks import EarlyStopping

# Quiet the noisy libraries before the heavy work starts.
# NOTE(review): tensorflow is already imported above this point; its native
# log level is typically read when the library loads, so this variable may
# have no effect unless it is set before that import -- confirm.
os.environ.update({'TF_CPP_MIN_LOG_LEVEL': '3'})
# Ignore Python warnings whose message matches the pattern below
# (presumably the tokenizer's truncation notice for sentence pairs).
warnings.filterwarnings("ignore", message=".*overflowing tokens are not returned.*")
# Restrict the transformers logger to error-level messages only.
logging.set_verbosity_error()
print("All warnings and unnecessary logs suppressed successfully!")

# Read the dataset CSV into a DataFrame and report how many rows it holds.
data = pd.read_csv("recommendationDataset.csv")
print(f"Loaded cleaned dataset with {len(data)} samples.")

# Tokenizer belonging to the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize every (citing sentence, cited abstract) pair in one call. Each pair
# is padded or truncated to exactly 128 tokens, so the result is rectangular.
print("Encoding texts...")
citing_sentences = data['citing_sentence'].tolist()
cited_abstracts = data['cited_paper_abstract'].tolist()
encoded_inputs = tokenizer(
    citing_sentences,
    cited_abstracts,
    add_special_tokens=True,
    truncation=True,
    padding='max_length',
    max_length=128,
    return_tensors='tf',
)

# Pull the token ids and attention masks out of the encoding as NumPy arrays
# so they can be split with scikit-learn further down.
input_ids = encoded_inputs['input_ids'].numpy()
attention_masks = encoded_inputs['attention_mask'].numpy()

print(f"Encoded input shape: {input_ids.shape}, Attention mask shape: {attention_masks.shape}")

# Integer class labels, one per dataset row, as a NumPy array.
labels = data['label'].astype(int).to_numpy()

# Split token ids, attention masks and labels in a SINGLE call so all three
# arrays are partitioned with exactly the same shuffled row indices. Splitting
# the masks in a separate call only stays aligned as long as the seed and the
# sample count happen to match, which is easy to break silently.
# 80% of the rows go to training, 20% to validation; the seed keeps the split
# reproducible.
(
    train_inputs, val_inputs,
    train_masks, val_masks,
    train_labels, val_labels,
) = train_test_split(
    input_ids, attention_masks, labels, test_size=0.2, random_state=42
)

# Number of examples per batch, shared by the training and validation pipelines.
BATCH_SIZE = 16

# Training pipeline. Keras does not shuffle a tf.data.Dataset passed to fit(),
# so shuffle here: the buffer spans the whole training set and is reshuffled
# each epoch by default, giving different batches every epoch. shuffle/batch/
# prefetch keep the dataset cardinality known, so len(train_dataset) still
# returns the number of batches per epoch.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((
        {"input_ids": train_inputs, "attention_mask": train_masks},
        train_labels,
    ))
    .shuffle(buffer_size=len(train_labels), seed=42)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Validation pipeline: order is irrelevant for evaluation, so no shuffling.
val_dataset = (
    tf.data.Dataset.from_tensor_slices((
        {"input_ids": val_inputs, "attention_mask": val_masks},
        val_labels,
    ))
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Single source of truth for the epoch count: it sizes the learning-rate
# schedule AND bounds model.fit(), so the two can never drift apart.
NUM_EPOCHS = 5
# Directory that receives the fine-tuned model and its tokenizer.
MODEL_DIR = "citationRecommendationModel"

# Load the pre-trained BERT checkpoint with a 2-class classification head.
model = TFBertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=2
)

# Optimizer and learning-rate schedule from Hugging Face, sized to the total
# number of optimizer steps (batches per epoch * epochs), with no warmup.
num_train_steps = len(train_dataset) * NUM_EPOCHS
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)

# The loss is configured with from_logits=True, i.e. it is applied to the
# model's raw outputs; labels are integer class ids (sparse), not one-hot.
# NOTE(review): the loss comes from tf.keras while EarlyStopping is imported
# from tf_keras -- confirm both resolve to compatible Keras implementations
# in this environment.
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

# Stop once validation loss has not improved for 3 consecutive epochs and
# roll the weights back to the best epoch seen.
early_stopping = EarlyStopping(
    monitor='val_loss', patience=3, restore_best_weights=True
)

# Train the model
print("Training the model...")
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=NUM_EPOCHS,
    callbacks=[early_stopping]
)

# Save the model together with its tokenizer so the output directory can be
# reloaded on its own via from_pretrained(MODEL_DIR).
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

2025-03-22 06:10:09.690469: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742638209.704511   21405 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742638209.710002   21405 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1742638209.725883   21405 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1742638209.725933   21405 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1742638209.725936   21405 computation_placer.cc:177] computation placer alr

Loaded cleaned dataset with 26000 samples.
Encoding texts...


I0000 00:00:1742638258.712748   21405 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5563 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9


Encoded input shape: (26000, 128), Attention mask shape: (26000, 128)
Training the model...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
