In [1]:
#Importing relevant libraries
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from transformers import BertModel
import torch.nn as nn
from transformers import AdamW
from tqdm import tqdm


In [2]:
features_df = pd.read_csv('features_dataset.csv')
features_df

Unnamed: 0,label,TTR,text_polarity,text_subjectivity,title_polarity,title_subjectivity,doc_perplexity,1_grams,average_word_length
0,1,0.844037,0.091481,0.524184,-0.125000,0.125000,0.867938,"['donald', 'trump', 'met', 'member', 'nato', '...",6.490826
1,0,0.683438,0.003001,0.343395,0.000000,0.000000,0.882860,"['washington', 'reuters', 'rick', 'perry', 'pr...",6.385744
2,1,0.721030,0.073622,0.401345,0.000000,1.000000,0.883760,"['president', 'obama', 'blasted', 'republican'...",6.072961
3,1,0.667638,0.037264,0.462935,0.000000,1.000000,0.882882,"['male', 'idaho', 'republican', 'daughter', 'c...",6.469388
4,1,0.628032,-0.018966,0.479310,-0.800000,0.900000,0.883120,"['kellyanne', 'conway', 'tried', 'spin', 'whit...",6.296496
...,...,...,...,...,...,...,...,...,...
13829,0,0.659631,-0.030698,0.380595,0.000000,0.000000,0.882810,"['washington', 'reuters', 'member', 'congress'...",6.514512
13830,1,0.629126,-0.009035,0.312617,-0.066667,0.633333,0.895444,"['far', 'video', '530000', 'view', 'make', 'co...",6.782524
13831,0,0.595745,0.049287,0.244648,0.000000,0.000000,0.883645,"['dec', '27', 'story', 'corrects', 'say', '550...",6.117021
13832,0,0.696581,-0.026939,0.293520,0.000000,0.000000,0.895276,"['madrid', 'reuters', 'spain', 'high', 'court'...",6.478632


# BERT feature model

## Splitting the data into training and validation sets

In [3]:
# Normalize numerical features
scaler = StandardScaler()
numerical_features = ['TTR', 'text_polarity', 'text_subjectivity', 'title_polarity', 'title_subjectivity', 'doc_perplexity', 'average_word_length']
features_df[numerical_features] = scaler.fit_transform(features_df[numerical_features])

# Tokenize text
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Because '1_grams' is in a string format, we need to tokenize to enable the models to understand the feature.
features_df['input_ids'] = features_df['1_grams'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512, truncation=True))

# Split data
train_df, test_df = train_test_split(features_df, test_size=0.2, random_state=42)


In [4]:
class CustomBERTModel(nn.Module):
    def __init__(self, num_features):
        super(CustomBERTModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size + num_features, 2)

    def forward(self, input_ids, attention_mask, additional_features):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        combined_features = torch.cat((pooled_output, additional_features), 1)
        combined_features = self.dropout(combined_features)
        logits = self.classifier(combined_features)
        return logits

# Initialize model
num_additional_features = len(numerical_features)
model = CustomBERTModel(num_additional_features)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Creating a training loop to train the models, and adding "additional features" to ensure that the model also handle the numerical values:

In [5]:
#Initializing a BERT pre-trained model
BERTmodel = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BERTmodel.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [6]:
#Creating data loaders to handle batching and shuffling the data for training and validation.
train_loader = DataLoader(train_df, batch_size=16, shuffle=True)
test_loader = DataLoader(test_df, batch_size=16)

In [7]:
# Convert the features to to PyTorch tensors

input_ids = list(features_df['input_ids'])
additional_features = torch.tensor(features_df[numerical_features].values, dtype=torch.float)
labels = torch.tensor(features_df['label'].values, dtype=torch.long)

# Create Dataset
class NewsDataset(Dataset):
    def __init__(self, texts, features, labels):
        self.texts = texts  # this should be a list of lists of input_ids
        self.features = features  # this should be a tensor of additional features
        self.labels = labels  # this should be a list of labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Ensure input_ids and attention_mask are long type
        input_ids = torch.tensor(self.texts[idx], dtype=torch.long)
        attention_mask = (input_ids != 0).long()
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'additional_features': self.features[idx],
            'labels': self.labels[idx]
        }

# Initialize Dataset and DataLoader
dataset = NewsDataset(input_ids, additional_features, labels)
train_loader = DataLoader(dataset, batch_size=16, shuffle=True)

In [8]:
#Below we define the training process, including forward and backward propagation.
"""def train_epoch(model, data_loader, optimizer, device):
    model.train()  # Set the model to training mode
    total_loss = 0
    criterion = nn.CrossEntropyLoss()  # Define the loss function

    for batch in tqdm(data_loader):
        # Move batch data to the same device as the model
        input_ids = batch['input_ids'].to(device).long()
        attention_mask = batch['attention_mask'].to(device).long()
        additional_features = batch['additional_features'].to(device)
        labels = batch['labels'].to(device)

        # Reset gradients
        optimizer.zero_grad()

        # Forward pass: compute predictions and loss
        logits = model(input_ids, attention_mask, additional_features)
        loss = criterion(logits, labels)  # Compute loss

        # Backward pass
        loss.backward()  # Compute gradients
        optimizer.step()  # Update weights

        # Accumulate the loss
        total_loss += loss.item()


    # Calculate the average loss
    average_loss = total_loss / len(data_loader)
    return average_loss"""


"def train_epoch(model, data_loader, optimizer, device):\n    model.train()  # Set the model to training mode\n    total_loss = 0\n    criterion = nn.CrossEntropyLoss()  # Define the loss function\n\n    for batch in tqdm(data_loader):\n        # Move batch data to the same device as the model\n        input_ids = batch['input_ids'].to(device).long()\n        attention_mask = batch['attention_mask'].to(device).long()\n        additional_features = batch['additional_features'].to(device)\n        labels = batch['labels'].to(device)\n\n        # Reset gradients\n        optimizer.zero_grad()\n\n        # Forward pass: compute predictions and loss\n        logits = model(input_ids, attention_mask, additional_features)\n        loss = criterion(logits, labels)  # Compute loss\n\n        # Backward pass\n        loss.backward()  # Compute gradients\n        optimizer.step()  # Update weights\n\n        # Accumulate the loss\n        total_loss += loss.item()\n\n\n    # Calculate the average

In [9]:
def train_epoch(model, data_loader, optimizer, device):
    model.train()
    total_loss = 0
    criterion = nn.CrossEntropyLoss()

    for batch in tqdm(data_loader):
        input_ids = batch['input_ids'].to(device).long()  # Force long type
        attention_mask = batch['attention_mask'].to(device).long()  # Force long type
        additional_features = batch['additional_features'].to(device)
        labels = batch['labels'].to(device)

        print(f"Input IDs type: {input_ids.dtype}")  # Check data types
        print(f"Attention Mask type: {attention_mask.dtype}")
        print(f"Additional Features type: {additional_features.dtype}")
        print(f"Labels type: {labels.dtype}")

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask, additional_features)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(data_loader)
    return average_loss


In [10]:
# Train the model (example for one epoch)
loss = train_epoch(model, train_loader, optimizer, device)
print(f"Training loss: {loss}")

  0%|          | 0/865 [00:00<?, ?it/s]


Input IDs type: torch.int64
Attention Mask type: torch.int64
Additional Features type: torch.float32
Labels type: torch.int64


RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)