In [6]:
from transformers import DistilBertTokenizer, TFDistilBertModel
import numpy as np
import pandas as pd
import joblib
from tensorflow.keras.models import load_model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

<h2>Load data and DistilBERT word embeddings</h2>

In [8]:
base_directory = "./data/"
base_model_directory = "./models/"
def get_distilbert_embeddings(data, tokenizer, model, batch_size=32, max_length=110):
    # Placeholder for the embeddings
    all_embeddings = []

    for i in range(0, len(data), batch_size):
        batch = data.iloc[i:i+batch_size]
        # Combine claim and evidence into one string per pair
        texts = list(batch['Claim'] + " [SEP] " + batch['Evidence'])
        inputs = tokenizer.batch_encode_plus(texts, padding='max_length', truncation=True, return_tensors="tf", max_length=max_length)

        # Generate embeddings
        outputs = model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
        # Use the last_hidden_state so compatible with LSTM
        embeddings = outputs.last_hidden_state.numpy()
        all_embeddings.append(embeddings)


    # Concatenate all batch embeddings into a single array
    return np.vstack(all_embeddings)

def get_evaluation_metrics(y_true, y_pred):
  accuracy = accuracy_score(y_true, y_pred)
  precision = precision_score(y_true, y_pred)
  recall = recall_score(y_true, y_pred)
  f1 = f1_score(y_true, y_pred)

  print(f'Accuracy: {accuracy:.4f}')
  print(f'Precision: {precision:.4f}')
  print(f'Recall: {recall:.4f}')
  print(f'F1 Score: {f1:.4f}')

In [10]:
# Load test
test_data = pd.read_csv(base_directory + "test.csv") # Change later
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

# Remove [REF] from evidence
test_data['Evidence'] = test_data['Evidence'].str.replace('\[REF\]', '', regex=True)

# Combine claims and evidence, then tokenize
combined_texts = test_data['Claim'] + ' ' + test_data['Evidence']
tokenized_lengths = combined_texts.apply(lambda x: len(tokenizer.encode(x, add_special_tokens=True)))

# Create embeddings
test_embeddings_lr = get_distilbert_embeddings(test_data, tokenizer, model, max_length=110)
test_embeddings_lstm = get_distilbert_embeddings(test_data, tokenizer, model, max_length=70)
# Flatten embeddings
test_embeddings_lr_flat = test_embeddings_lr.reshape(test_embeddings_lr.shape[0], -1)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


<h2>Logistic Regression Model</h2>

In [11]:
# Load PKL Model
lr_model = joblib.load(base_model_directory + "lr_model.pkl")

# Predict
lr_predictions = lr_model.predict(test_embeddings_lr_flat)

# Store data into excel
predictions_df = pd.DataFrame(lr_predictions, columns=['prediction'])
predictions_df.to_csv('Group_1_A.csv', index=False)

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


Accuracy: 0.8208
Precision: 0.6808
Recall: 0.6323
F1 Score: 0.6556


<h2>LSTM Model</h2>

In [5]:
# Load model
lstm_model = load_model(base_model_directory + "lstm_ref_aug")



TypeError: weight_decay is not a valid argument, kwargs should be empty  for `optimizer_experimental.Optimizer`.

In [None]:
# Predict
lstm_predictions = lstm_model.predict(test_embeddings_lstm)
predictions_binary = np.where(lstm_predictions > 0.5, 1, 0)

# Store data into excel
predictions_df = pd.DataFrame(lstm_predictions, columns=['prediction'])
predictions_df.to_csv('Group_1_B.csv', index=False)

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.