### Check whether a GPU is available

#### For Nvidia system

In [None]:
# Run `nvidia-smi` through the IPython shell escape and join the captured
# output lines into a single string.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)

# Output containing the word 'failed' is treated as "no GPU attached";
# otherwise the full nvidia-smi report is shown.
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
import torch

# Check whether a CUDA device is available to PyTorch; fall back to the CPU.
# NOTE(review): the Apple Silicon cell further down also assigns `device`,
# so presumably only the cell matching the hardware should be run.
if torch.cuda.is_available():
    print('GPU is available for acceleration.')
    device = torch.device("cuda")  # Use CUDA backend
else:
    print('GPU is not available. Using CPU.')
    device = torch.device("cpu")

print('Selected device:', device)


#### For Apple Silicon system

In [None]:
import torch

# Pick the Metal Performance Shaders backend when PyTorch reports it as
# available; otherwise run on the CPU.
mps_ready = torch.backends.mps.is_available()
if not mps_ready:
    print('Metal is not available. Using CPU.')
    device = torch.device("cpu")
else:
    print('Metal is available for acceleration.')
    device = torch.device("mps")

print('Selected device:', device)


### Import the required libraries

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import get_linear_schedule_with_warmup, AdamW
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
import time, datetime, random, re, string
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns
from torch.cuda.amp import autocast, GradScaler
from sklearn.model_selection import train_test_split
from collections import Counter
from transformers import BertModel, BertTokenizer
# Display names for the two classes -- set your labels
LABELS = ["Real  (1)", "Fake  (-1)"]

# Seed Python, NumPy and PyTorch with one shared value.
SEED = 15
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(SEED)

### Importing the datasets & dataset pre-processing

In [None]:
# Load the three CSV files; replace each placeholder with a real file path.
# A notebook cell only renders its last bare expression, so the first two
# previews below produce no output.
training_data = pd.read_csv('___Set file path___')
training_data

validation_data = pd.read_csv('___Set file path___')
validation_data

evaluation_data = pd.read_csv('___Set file path___')
evaluation_data

In [None]:
# Dropping unnecessary columns for the fake/real review dataset
# NOTE(review): `df_reviews_fk_rl` is not defined in any visible cell -- confirm where it is loaded.

df_fk_rl = df_reviews_fk_rl.drop(['review_headline', 'cleaned_review_body'],  axis = 1)
# keep=False removes every copy of a duplicated row, not just the extras
df_fk_rl.drop_duplicates(keep=False,inplace=True)

# Rename to the shared ('label', 'text') column names used by later cells
column_mapping = {'fake_review': 'label', 'review_body': 'text'}
df_fk_rl = df_fk_rl.rename(columns=column_mapping)

df_fk_rl

In [None]:
# Plotting the class distribution of each source dataset, one subplot each

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# One frame per subplot
data = [df_cg_or, df_fk_rl, df_yelp]

# Title and tick labels for each subplot
labels = ["Computer Generated and Human Wriiten dataset", "Fake and real Review dataset", "Yelp dataset"]
LABELS = [["Real  (OR)", "Fake  (CG)"],["Real  (0)", "Fake  (1)"],["Real  (1)", "Fake  (0)"]]

for i, ax in enumerate(axes):
    # Series.value_counts replaces the deprecated top-level pd.value_counts
    cnt_classes = data[i]['label'].value_counts(sort=True)
    cnt_classes.plot(kind='bar', rot=0, ax=ax)
    ax.set_title(labels[i])
    # NOTE(review): bars are ordered by frequency while the tick labels are
    # fixed -- confirm the two orders agree for every dataset.
    ax.set_xticks(range(2))
    ax.set_xticklabels(LABELS[i])
    ax.set_xlabel("Label")
    ax.set_ylabel("Frequency")

# Adjust the layout to prevent overlapping titles
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# Down sampling the Fake and real review dataset so both classes have the
# same number of rows (sampling_strategy=1 is a 1:1 minority/majority ratio)
X = df_fk_rl.drop(['label'], axis=1)
y = df_fk_rl['label']

from imblearn.under_sampling import RandomUnderSampler
# Seed the sampler so the retained rows are the same on every run
rus = RandomUnderSampler(sampling_strategy=1, random_state=SEED)
X_res, y_res = rus.fit_resample(X, y)

# Re-assemble labels and features into one frame
df_fk_rln = pd.concat([y_res, X_res],axis=1)
df_fk_rln

In [None]:
# Plotting the class distribution again, now with the down-sampled
# fake/real review frame

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# One frame per subplot
data = [df_cg_or, df_fk_rln, df_yelp]

# Title and tick labels for each subplot
labels = ["Computer Generated and Human Wriiten dataset", "Fake and real Review dataset", "Yelp dataset"]
LABELS = [["Real  (OR)", "Fake  (CG)"],["Real  (0)", "Fake  (1)"],["Real  (1)", "Fake  (0)"]]

for i, ax in enumerate(axes):
    # Series.value_counts replaces the deprecated top-level pd.value_counts
    cnt_classes = data[i]['label'].value_counts(sort=True)
    cnt_classes.plot(kind='bar', rot=0, ax=ax)
    ax.set_title(labels[i])
    # NOTE(review): bars are ordered by frequency while the tick labels are
    # fixed -- confirm the two orders agree for every dataset.
    ax.set_xticks(range(2))
    ax.set_xticklabels(LABELS[i])
    ax.set_xlabel("Label")
    ax.set_ylabel("Frequency")

# Adjust the layout to prevent overlapping titles
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# Combine datasets into one frame with the convention 1 = real, 0 = fake

column_order = ['text', 'label']
# .copy() so the label rewrites below act on independent frames rather than
# on slices of the originals
df_cg_or = df_cg_or[column_order].copy()
df_fk_rl = df_fk_rl[column_order].copy()
df_fk_rln = df_fk_rln[column_order].copy()
df_yelp = df_yelp[column_order]

df_cg_or.loc[df_cg_or["label"] == 'CG', "label"] = 0
df_cg_or.loc[df_cg_or["label"] == 'OR', "label"] = 1

# Swap 0 <-> 1 in the fake/real review frames. Both masks are computed before
# either assignment; assigning sequentially would first turn every 0 into 1
# and then turn all rows into 0. The down-sampled frame is remapped as well
# because it is the one that is concatenated below.
for frame in (df_fk_rl, df_fk_rln):
    was_zero = frame["label"] == 0
    was_one = frame["label"] == 1
    frame.loc[was_zero, "label"] = 1
    frame.loc[was_one, "label"] = 0

df= pd.concat([df_cg_or, df_fk_rln, df_yelp],ignore_index=True)
# Drop rows with missing values (no-op when there are none)
df.dropna(inplace=True)
df.info()

In [None]:
# keep=False removes every copy of a duplicated row
df.drop_duplicates(keep=False,inplace=True)
# `data_new` is the name the tokenization and split cells read
# (the original `data-new` is not a valid identifier)
data_new = df
data_new.describe()

### Instantiate BERT tokenizer 

In [None]:
# Load the pre-trained tokenizer for the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

### Corpus tokenization using BERT

In [None]:
def corpus_tokenize(data, tokenizer, max_len):
    """Encode every document in ``data`` to fixed-length token ids and masks.

    Args:
        data: iterable of documents, each passed to ``tokenizer.encode_plus``.
        tokenizer: object exposing ``encode_plus`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: length every sequence is truncated or padded to.

    Returns:
        Tuple ``(input_ids, attention_masks)``: the per-document tensors
        concatenated along dim 0. For empty ``data`` both are empty tensors
        of shape ``(0, max_len)``.
    """
    # For storing token ids
    input_ids = []
    # Storage for attention masks
    attention_masks = []

    # Go through every document
    for doc in data:
        encoded_dict = tokenizer.encode_plus(
            doc,  # document to encode
            add_special_tokens=True,  # add '[CLS]' and '[SEP]' tokens
            max_length=max_len,  # value for max length
            truncation=True,  # truncate longer documents
            padding='max_length',  # replaces the deprecated pad_to_max_length=True
            return_attention_mask=True,  # attention mask creation
            return_tensors='pt',  # return pytorch tensors
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    # torch.cat rejects an empty list, so handle an empty corpus explicitly
    if not input_ids:
        empty = torch.empty((0, max_len), dtype=torch.long)
        return empty, empty.clone()

    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)


### Generate Tokenized Data

In [None]:
# Tokenize the 'text' column, padding/truncating every document to 256 tokens
input_ids, attention_masks = corpus_tokenize(data_new['text'].values, tokenizer, 256)

### Label to tensor conversion

In [None]:
# Labels as a float32 tensor.
# NOTE(review): this rebinds `labels` (previously the plot titles) and no later
# visible cell reads it; prep_dataset receives data_new['label'] directly.
labels = torch.tensor(data_new['label'].values.astype(np.float32))

### Get tensor data ready

In [None]:
def prep_dataset(padded_tokens, attention_masks, target):
    """Build a TensorDataset and split it randomly into train/val/test.

    80% of the rows go to training; the remaining rows are split in half
    between validation and test, with the test split taking the extra row
    when the remainder is odd.

    Args:
        padded_tokens: tensor of token ids, one row per document.
        attention_masks: tensor of attention masks aligned with the tokens.
        target: labels (pandas Series, array or list) castable to int64.

    Returns:
        Tuple ``(train_data, val_data, test_data)`` of dataset subsets.
    """
    # Labels become an (n, 1) int64 column
    target = np.asarray(target, dtype=np.int64).reshape(-1, 1)
    # Build tensor data set
    tensor_data = TensorDataset(padded_tokens, attention_masks, torch.from_numpy(target))

    # Sizes come from the dataset itself, not from a notebook global
    n_total = len(tensor_data)
    # 80% of data for train
    train_size = int(0.8 * n_total)
    # Remaining 20% is shared by validation and test
    holdout_size = n_total - train_size
    val_size = holdout_size // 2
    # The remainder keeps the lengths summing to holdout_size for odd sizes
    test_size = holdout_size - val_size

    # By choosing samples at random, split the dataset.
    train_data, holdout_data = random_split(tensor_data, [train_size, holdout_size])
    # Samples are chosen at random to divide the hold-out part
    val_data, test_data = random_split(holdout_data, [val_size, test_size])

    return train_data, val_data, test_data


# Build the tensor dataset and split it into train / validation / test subsets
train_data, val_data, test_data = prep_dataset(input_ids,attention_masks,data_new['label'])

In [None]:
# Count how many samples of each target class a tensor dataset holds
def target_cnt(tensor_data):
    """Return a tensor ``[n_label_0, n_label_1]`` for ``tensor_data``.

    Each item is expected to carry its label as a one-element tensor at
    index 2; labels other than 0 and 1 are ignored.
    """
    tallies = [0, 0]
    for sample in tensor_data:
        label = sample[2].item()
        if label == 0:
            tallies[0] += 1
        elif label == 1:
            tallies[1] += 1
    return torch.tensor(tallies)

# Weighted sampling for imbalanced classification
def prep_sampler(target_tensor, tensor_data):
    """Build a WeightedRandomSampler that draws classes inversely to frequency.

    ``target_tensor`` is accepted but not read; the class counts are
    recomputed from ``tensor_data`` through ``target_cnt``.
    """
    # Class distribution [count_0, count_1]
    per_class_total = target_cnt(tensor_data)
    # Inverse-frequency weight per class
    inverse_freq = 1. / per_class_total.float()
    # Look up the weight of every observation by its label (item index 2)
    per_sample = [inverse_freq[row[2]] for row in tensor_data]
    samples_weight = torch.tensor(per_sample)
    # Draw as many samples as the dataset holds, with replacement
    return torch.utils.data.WeightedRandomSampler(
        weights=samples_weight,
        num_samples=len(samples_weight),
        replacement=True,
    )


# Sampler for the train set; the first argument is the class-count tensor,
# which prep_sampler accepts but does not read
train_sampler = prep_sampler(target_cnt(train_data), train_data)


# Format a duration given in seconds
def format_time(elapsed):
    '''
    Round ``elapsed`` seconds to the nearest integer and return it as h:mm:ss
    '''
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds=whole_seconds)
    return str(delta)

### Dataloaders

In [None]:
#  Target distribution inside the tensor dataset
def target_cnt(tensor_data):
    """Count the samples labelled 0 and 1 and return ``tensor([n0, n1])``.

    The label is read from index 2 of every item via ``.item()``; any other
    label value is not counted.
    """
    observed = [item[2].item() for item in tensor_data]
    zeros = sum(1 for value in observed if value == 0)
    ones = sum(1 for value in observed if value == 1)
    return torch.tensor([zeros, ones])

# Prepare weighted sampling for imbalanced classification
def prep_sampler(target_tensor, tensor_data):
    """Return a WeightedRandomSampler weighting each sample by 1 / class count.

    The ``target_tensor`` argument is unused; counts are derived from
    ``tensor_data`` with ``target_cnt``.
    """
    class_weight = 1. / target_cnt(tensor_data).float()
    # One weight per observation, selected by the label stored at index 2
    samples_weight = torch.tensor([class_weight[entry[2]] for entry in tensor_data])
    n_draws = len(samples_weight)
    sampler = torch.utils.data.WeightedRandomSampler(weights=samples_weight,
                                                     num_samples=n_draws,
                                                     replacement=True)
    return sampler


# Sampler for the train set
train_sampler = prep_sampler(target_cnt(train_data), train_data)

# Batch loaders consumed by the training, validation and evaluation cells.
# NOTE(review): no batch size is given anywhere in this notebook; 16 is a
# placeholder -- tune it to the available memory.
BATCH_SIZE = 16
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)
valid_dataloader = DataLoader(val_data, sampler=SequentialSampler(val_data), batch_size=BATCH_SIZE)
test_dataloader = DataLoader(test_data, sampler=SequentialSampler(test_data), batch_size=BATCH_SIZE)


# Turn elapsed seconds into a printable duration
def format_time(elapsed):
    '''
    hh:mm:ss string for ``elapsed`` seconds, rounded to whole seconds
    '''
    return str(datetime.timedelta(seconds=int(round(elapsed))))

### BERT-CNN Model

In [None]:
class BERTCNN(nn.Module):
    """Three-branch convolutional classifier over stacked hidden states.

    The input is a 4-channel tensor whose last dimension equals
    ``config.embed_dim``; each branch convolves it with a kernel spanning
    3, 4 or 5 rows, the branches are max-pooled over the remaining axis,
    concatenated, passed through dropout and projected to
    ``config.num_label`` logits.
    """

    def __init__(self, config):
        super().__init__()
        n_filters = config.output_channel  # kernels per branch
        n_classes = config.num_label
        width = config.embed_dim  # kernel covers the full embedding width
        in_planes = 4  # one input channel per stacked layer
        n_branches = 3

        # One grouped convolution per kernel height
        self.conv1 = nn.Conv2d(in_planes, n_filters, (3, width), padding=(2, 0), groups=4)
        self.conv2 = nn.Conv2d(in_planes, n_filters, (4, width), padding=(3, 0), groups=4)
        self.conv3 = nn.Conv2d(in_planes, n_filters, (5, width), padding=(4, 0), groups=4)

        self.dropout = nn.Dropout(config.dropout)

        # Classification layer over the concatenated branch features
        self.fc1 = nn.Linear(n_branches * n_filters, n_classes)

    def forward(self, x, **kwargs):
        pooled = []
        for conv in (self.conv1, self.conv2, self.conv3):
            # Drop the width axis, which the full-width kernel reduced to 1
            feature_map = F.relu(conv(x)).squeeze(3)
            # Max-over-time pooling across the whole remaining axis
            pooled.append(F.max_pool1d(feature_map, feature_map.size(2)).squeeze(2))
        # Concatenate branches, regularise, then produce (batch, num_label) logits
        features = self.dropout(torch.cat(pooled, 1))
        return self.fc1(features)

### Training function

In [None]:
def train(model, dataloader, optimizer):
    """Run one training epoch over ``dataloader``.

    The last four hidden states returned by ``model`` are fed to the
    notebook-level ``hyb_model`` head. Besides its parameters the function
    reads the globals ``hyb_model``, ``criterion``, ``scaler``,
    ``scheduler``, ``device``, ``epoch`` and ``epochs``, and appends the
    epoch's average loss, average F1 and wall time to the global
    ``stats_train``.

    Args:
        model: encoder called as ``model(input_ids=..., attention_mask=...)``
            whose output at index 2 is the sequence of hidden states.
        dataloader: yields ``(input_ids, attention_mask, labels)`` batches.
        optimizer: optimizer stepped through the gradient scaler.

    Returns:
        None.
    """
    # Record Time
    total_t = time.time()

    # For one full pass for training set
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
    print('..........Training..........')

    # Running totals for this epoch
    train_total_loss = 0
    train_total_f1 = 0

    # Training mode
    model.train()
    hyb_model.train()

    # Training data for each batch
    for itr, batch in enumerate(dataloader):

        # Progress report every 40 batches
        if itr % 40 == 0 and not itr == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(itr, len(dataloader)))

        # Unpack the batch and move it to the selected device
        batch_input_ids = batch[0].to(device)
        batch_input_mask = batch[1].to(device)
        batch_labels = batch[2].to(device).long()

        # To remove any previously calculated gradients.
        optimizer.zero_grad()

        # Executes an autocasting forward pass.
        with autocast():
            outputs = model(input_ids=batch_input_ids, attention_mask=batch_input_mask)
            # Hidden states of every layer, stacked along a new dim 1
            hidden_layers = torch.stack(outputs[2], dim=1)
            # Last 4 layers
            hidden_layers = hidden_layers[:, -4:]

        logits = hyb_model(hidden_layers)

        loss = criterion(logits.view(-1, 2), batch_labels.view(-1))

        # Sum the training loss over all batches to calculate average loss.
        train_total_loss += loss.item()

        # Backward pass on the scaled loss
        scaler.scale(loss).backward()

        # Optimizer step through the scaler, then update the scale factor
        scaler.step(optimizer)
        scaler.update()

        # Refresh the scheduler
        scheduler.step()

        # Determine preds
        _, pred = torch.max(logits, 1)

        # Move predictions and labels to CPU; labels are flattened from
        # (batch, 1) to (batch,) as scikit-learn expects 1-D targets
        pred = pred.detach().cpu().numpy()
        y_true = batch_labels.detach().cpu().numpy().ravel()

        # F1 -- scikit-learn's argument order is (y_true, y_pred)
        train_total_f1 += f1_score(y_true, pred,
                                   average='weighted',
                                   labels=np.unique(pred))

    # Guard against an empty dataloader when averaging
    n_batches = max(len(dataloader), 1)

    # Average loss and F1 over all batches
    train_avg_loss = train_total_loss / n_batches
    train_avg_f1 = train_total_f1 / n_batches

    # Length of training
    train_time = format_time(time.time() - total_t)

    # The necessary statistics for the current epoch
    stats_train.append(
        {
            'Train Loss': train_avg_loss,
            'Train F1': train_avg_f1,
            'Train Time': train_time
        }
    )

    # Result
    print("")
    print("..........Summary..........")
    print("epoch | trn loss | trn f1 | trn time ")
    print(f"{epoch+1:5d} | {train_avg_loss:.5f} | {train_avg_f1:.5f} | {train_time:}")

### Validation function

In [None]:
def validating(model, dataloader):
    """Evaluate the encoder plus CNN head on the validation set.

    Reads the notebook globals ``hyb_model``, ``criterion``, ``device`` and
    ``epoch``; writes the globals ``val_avg_f1`` and ``val_avg_loss``; and
    appends the per-batch averages of loss, accuracy, precision, recall and
    F1, plus the wall time, to the global ``stats_val``.

    Args:
        model: encoder called as ``model(input_ids=..., attention_mask=...)``
            whose output at index 2 is the sequence of hidden states.
        dataloader: yields ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        None.
    """
    global val_avg_f1, val_avg_loss

    # Record Time
    total_t = time.time()

    # After each training cycle is over, evaluate performance against the validation set.
    print("")
    print("..........Running Validation..........")

    # Place the evaluation mode on both models.
    model.eval()
    hyb_model.eval()

    # Monitor variables
    val_total_accuracy = 0
    val_total_loss = 0
    val_total_f1 = 0
    val_total_recall = 0
    val_total_precision = 0

    # Analyze information for one epoch.
    for batch in dataloader:

        # Unpack the batch and move it to the selected device
        batch_input_ids = batch[0].to(device)
        batch_input_mask = batch[1].to(device)
        batch_labels = batch[2].to(device).long()

        # No gradients are needed for evaluation, for either network
        with torch.no_grad():
            outputs = model(input_ids=batch_input_ids, attention_mask=batch_input_mask)
            # Stack all hidden states and keep the last 4 layers
            hidden_layers = torch.stack(outputs[2], dim=1)
            hidden_layers = hidden_layers[:, -4:]

            logits = hyb_model(hidden_layers)
            loss = criterion(logits.view(-1, 2), batch_labels.view(-1))

        # Add up validation losses
        val_total_loss += loss.item()

        # Determine preds
        _, pred = torch.max(logits, 1)

        # Labels are flattened from (batch, 1) to (batch,) for scikit-learn
        pred = pred.detach().cpu().numpy()
        y_true = batch_labels.detach().cpu().numpy().ravel()
        seen_labels = np.unique(pred)

        # scikit-learn metrics take (y_true, y_pred); the reversed order
        # exchanges precision and recall
        val_total_f1 += f1_score(y_true, pred,
                                 average='weighted',
                                 labels=seen_labels)

        # Determine accuracy
        val_total_accuracy += accuracy_score(y_true, pred)

        # Determine precision
        val_total_precision += precision_score(y_true, pred,
                                               average='weighted',
                                               labels=seen_labels)

        # Determine recall
        val_total_recall += recall_score(y_true, pred,
                                         average='weighted',
                                         labels=seen_labels)

    # Guard against an empty dataloader when averaging
    n_batches = max(len(dataloader), 1)

    # Averages over all batches of the validation run
    val_avg_accuracy = val_total_accuracy / n_batches
    val_avg_f1 = val_total_f1 / n_batches
    val_avg_precision = val_total_precision / n_batches
    val_avg_recall = val_total_recall / n_batches
    val_avg_loss = val_total_loss / n_batches

    # Record end time for validation.
    val_time = format_time(time.time() - total_t)

    # Keeps track of all data from this epoch
    stats_val.append(
        {
            'Val Loss': val_avg_loss,
            'Val Accur.': val_avg_accuracy,
            'Val precision': val_avg_precision,
            'Val recall': val_avg_recall,
            'Val F1': val_avg_f1,
            'Val Time': val_time
        }
    )

    # Result
    print("")
    print("..........Summary..........")
    print("epoch | val loss | val f1 | val time")
    print(f"{epoch+1:5d} | {val_avg_loss:.5f} | {val_avg_f1:.5f} | {val_time:}")

### Evaluation function

In [None]:
def testing(model, dataloader):
    """Evaluate the encoder plus CNN head on the test set.

    Reads the notebook globals ``hyb_model``, ``criterion``, ``device`` and
    ``epoch``, and appends the per-batch averages of loss, accuracy,
    precision, recall and F1, plus the wall time, to the global
    ``stats_test``.

    Args:
        model: encoder called as ``model(input_ids=..., attention_mask=...)``
            whose output at index 2 is the sequence of hidden states.
        dataloader: yields ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        None.
    """
    print("")
    print("..........Running Testing..........")

    # Record test time
    total_t = time.time()

    # Evaluation mode
    model.eval()
    hyb_model.eval()

    # Running totals
    test_total_accuracy = 0
    test_total_loss = 0
    test_total_f1 = 0
    test_total_recall = 0
    test_total_precision = 0

    for batch in dataloader:

        # Unpack the batch and move it to the selected device
        batch_input_ids = batch[0].to(device)
        batch_input_mask = batch[1].to(device)
        batch_labels = batch[2].to(device).long()

        # No gradients are needed for evaluation, for either network
        with torch.no_grad():
            outputs = model(input_ids=batch_input_ids, attention_mask=batch_input_mask)
            # Stack all hidden states and keep the last 4 layers
            hidden_layers = torch.stack(outputs[2], dim=1)
            hidden_layers = hidden_layers[:, -4:]

            logits = hyb_model(hidden_layers)
            loss = criterion(logits.view(-1, 2), batch_labels.view(-1))

        # Add up test losses
        test_total_loss += loss.item()

        # Determine preds
        _, pred = torch.max(logits, 1)

        # Labels are flattened from (batch, 1) to (batch,) for scikit-learn
        pred = pred.detach().cpu().numpy()
        y_true = batch_labels.detach().cpu().numpy().ravel()
        seen_labels = np.unique(pred)

        # scikit-learn metrics take (y_true, y_pred); the reversed order
        # exchanges precision and recall
        test_total_f1 += f1_score(y_true, pred,
                                  average='weighted',
                                  labels=seen_labels)

        # Determine accuracy
        test_total_accuracy += accuracy_score(y_true, pred)

        # Determine precision
        test_total_precision += precision_score(y_true, pred,
                                                average='weighted',
                                                labels=seen_labels)

        # Determine recall
        test_total_recall += recall_score(y_true, pred,
                                          average='weighted',
                                          labels=seen_labels)

    # Guard against an empty dataloader when averaging
    n_batches = max(len(dataloader), 1)

    # Averages over all batches of the test run
    test_avg_accuracy = test_total_accuracy / n_batches
    test_avg_f1 = test_total_f1 / n_batches
    test_avg_precision = test_total_precision / n_batches
    test_avg_recall = test_total_recall / n_batches
    test_avg_loss = test_total_loss / n_batches

    # Time for testing
    test_time = format_time(time.time() - total_t)

    # Keep track of all data from this run.
    stats_test.append(
        {
            'Test Loss': test_avg_loss,
            'Test Accur.': test_avg_accuracy,
            'Test precision': test_avg_precision,
            'Test recall': test_avg_recall,
            'Test F1': test_avg_f1,
            'Test Time': test_time
        }
    )
    # Result
    print("")
    print("..........Summary..........")
    print("epoch | test loss | test f1 | test time")
    print(f"{epoch+1:5d} | {test_avg_loss:.5f} | {test_avg_f1:.5f} | {test_time:}")

### Initialize BERT model and configure it

In [None]:
class config:
    """Hyper-parameters read by the CNN head.

    The values are plain class attributes, so they are available both on the
    class itself and on every instance created with ``config()``.
    """

    num_label = 2  # binary
    output_channel = 16  # number of kernels
    embed_dim = 768  # embed dimension
    dropout = 0.4  # dropout value


# Create config
config1 = config()

# BERT encoder. Hidden states must be returned because the training and
# evaluation loops read them from index 2 of the model output.
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True).to(device)

# Instantiate CNN
hyb_model = BERTCNN(config1).to(device)

# set loss
criterion = nn.CrossEntropyLoss()

# Number of epochs
epochs = 3

# Train the final four encoder layers; every other BERT parameter is frozen
# because the optimizer never updates it.
BERT_param = []
layers_used = [11, 10, 9, 8]
layer_tags = tuple(".{}.".format(layer_num) for layer_num in layers_used)

for name, param in model.named_parameters():
    if any(tag in name for tag in layer_tags):
        BERT_param.append(param)
    else:
        param.requires_grad_(False)

# Setup optimizer. The CNN head gets its own parameter group; without it the
# head would keep its random initial weights for the whole run.
# NOTE(review): the head reuses the BERT learning rate -- a larger value may
# suit a freshly initialised head; tune as needed.
optimizer = AdamW(
    [
        {'params': BERT_param, 'lr': 2e-5},
        {'params': list(hyb_model.parameters()), 'lr': 2e-5},
    ],
    weight_decay=1.0,
)


# Setup LR scheduler: linear decay over all training steps, no warm-up
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

# Developing a gradient scaler for mixed precision
scaler = GradScaler()

### Training the model

In [None]:
# Storing of training results
stats_train = []
stats_val = []
val_loss_best = float('inf')

for epoch in range(epochs):
    # Train
    train(model, train_dataloader, optimizer)
    # Validate
    validating(model, valid_dataloader)
    # Checkpoint only when the validation loss improves
    current_val_loss = stats_val[epoch]['Val Loss']
    if current_val_loss < val_loss_best:
        # Update the running best; assigning a different name here would
        # leave the threshold at infinity and overwrite the checkpoint on
        # every epoch. `best_valid_loss` is kept as an alias.
        val_loss_best = best_valid_loss = current_val_loss
        # Save the encoder and the CNN head for later usage.
        torch.save(model.state_dict(), 'bert-cnn-model1.pt')
        torch.save(hyb_model.state_dict(), 'bert-cnn-head1.pt')
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained('./model_save/bert-cnn/')
        tokenizer.save_pretrained('./model_save/bert-cnn/')

In [None]:
# Show the per-epoch training and validation statistics side by side
pd.set_option('display.precision', 3)
train_data_stats = pd.DataFrame(stats_train)
valid_data_stats = pd.DataFrame(stats_val)
stats_data = pd.concat([train_data_stats, valid_data_stats], axis=1)
# Number the rows from 1 and use that column as the index
epoch_numbers = range(1, len(stats_data) + 1)
stats_data.insert(0, 'Epoch', epoch_numbers)
stats_data = stats_data.set_index('Epoch')
stats_data

### Evaluating the model

In [None]:
import os

stats_test = []
# map_location lets a checkpoint written on another device load onto the
# device selected in this session
model.load_state_dict(torch.load('bert-cnn-model1.pt', map_location=device))
# Restore the CNN head as well when a checkpoint file for it is present;
# otherwise the in-memory head weights are used.
if os.path.exists('bert-cnn-head1.pt'):
    hyb_model.load_state_dict(torch.load('bert-cnn-head1.pt', map_location=device))
testing(model, test_dataloader)
test_data_stats = pd.DataFrame(data=stats_test)
test_data_stats