### Check whether a GPU is available

#### For NVIDIA systems

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)

if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
import torch

# Check whether PyTorch can see a CUDA-capable (NVIDIA) GPU
if torch.cuda.is_available():
    print('GPU is available for acceleration.')
    device = torch.device("cuda")  # Use the CUDA backend
else:
    print('GPU is not available. Using CPU.')
    device = torch.device("cpu")  # Fall back to the CPU

print('Selected device:', device)


#### For Apple Silicon systems

In [None]:
import torch

# Choose Apple's Metal Performance Shaders (MPS) backend when PyTorch reports it
# as available; otherwise stay on the CPU.
backend, status = (
    ("mps", 'Metal is available for acceleration.')
    if torch.backends.mps.is_available()
    else ("cpu", 'Metal is not available. Using CPU.')
)
print(status)
device = torch.device(backend)

print('Selected device:', device)


### Import necessary libraries

In [None]:
import nltk
import ssl

# WARNING: this makes every default HTTPS context created in this process skip
# certificate verification.  Presumably it is here so that the NLTK downloads
# work on machines with a broken certificate store -- confirm, and prefer fixing
# the certificates instead when possible.
# Interpreters whose ssl module lacks the private hook are left untouched.
if hasattr(ssl, '_create_unverified_context'):
    ssl._create_default_https_context = ssl._create_unverified_context


In [None]:
import pandas as pd
import evaluate
import numpy as np

from transformers import RobertaTokenizer, RobertaTokenizerFast, RobertaForSequenceClassification,Trainer, TrainingArguments

import torch
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from datasets import load_dataset
from torch.utils.data import Dataset
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

from nltk import pos_tag, word_tokenize, bigrams
# Tokenizer and POS-tagger data.  Recent NLTK releases (3.9 and later) look these
# up under new names ('punkt_tab', 'averaged_perceptron_tagger_eng'), while older
# releases use the original names, so both sets are fetched to work on either.
for resource in ('punkt', 'punkt_tab',
                 'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource)

### Initialize RoBERTa and Other Components

In [None]:
# Tokenizer and sequence-classification model, both loaded from the same
# pretrained RoBERTa checkpoint.
checkpoint = 'roberta-base'
tokenizer = RobertaTokenizerFast.from_pretrained(checkpoint)
model = RobertaForSequenceClassification.from_pretrained(checkpoint)

# Unigram + bigram count features with English stop words removed.
english_stop_words = stopwords.words('english')
vectorizer = CountVectorizer(stop_words=english_stop_words, ngram_range=(1, 2))

### Importing the dataset

In [None]:
from datasets import load_dataset

# Split name -> CSV file.  Replace each placeholder with the real path.
data_files = dict(
    train="___Set the dataset path___",
    valid="___Set the dataset path___",
    test="___Set the dataset path___",
)

# Read all three splits with the CSV loader.
data = load_dataset('csv', data_files=data_files)

# Echo the loaded object so the notebook shows it.
data

In [None]:
# Inspect the schema and one sample row.  These are printed explicitly because a
# notebook cell (by default) only echoes its final expression, so bare
# expressions in the middle of a cell show nothing.
print(data['train'].features)
print(data['test'][0])

# Fit the unigram/bigram vocabulary on the training split only.
vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words=stopwords.words('english'))
# Read the text column in one go instead of building a dict for every row.
train_texts = list(data['train']['text'])
vectorizer.fit(train_texts)

### Creating the data class

In [None]:
class FakeReviewDataset(Dataset):
    """Torch dataset pairing RoBERTa encodings with n-gram count features.

    Every item is a dict with the keys ``input_ids``, ``attention_mask``,
    ``linguistic_features`` and ``labels``.

    Args:
        dataset: Row-indexable collection whose rows expose ``'text'`` and
            ``'label'`` (in this file, one split of the loaded CSV dataset).
        vectorizer: Already-fitted vectorizer; only ``transform`` is called.

    Note:
        Tokenization uses the module-level ``tokenizer``, which must exist
        before items are requested.
    """

    def __init__(self, dataset,vectorizer):
        self.dataset = dataset
        self.vectorizer = vectorizer

    def __len__(self):
        """Return the number of rows in the wrapped dataset."""
        return len(self.dataset)

    def extract_linguistic_features(self, text):
        """Vectorize the POS-tagged tokens and token bigrams of ``text``.

        Args:
            text: Raw review text.

        Returns:
            The dense result of ``vectorizer.transform`` on a single document,
            i.e. a 2-D array with one row.
        """
        tokens = word_tokenize(text)

        # Each (token, tag) pair becomes "token_TAG"; each bigram "tok1_tok2".
        tagged_terms = ['_'.join(pair) for pair in pos_tag(tokens)]
        bigram_terms = ['_'.join(pair) for pair in bigrams(tokens)]

        # NOTE(review): in this file the vectorizer is fitted on raw review text,
        # whereas the document built here consists of underscore-joined terms.
        # Confirm that the fitted vocabulary is expected to match these terms.
        document = ' '.join(tagged_terms + bigram_terms)
        return self.vectorizer.transform([document]).toarray()

    def __getitem__(self, idx):
        """Build the model inputs, count features and label for row ``idx``."""
        # Index the row first: asking for the column first (dataset['text'][idx])
        # reads the entire column just to pick a single value.
        row = self.dataset[idx]
        text = row['text']
        label = row['label']

        # Fixed-length encoding so every item has the same shape.
        encoded_input = tokenizer(text, truncation=True, max_length=128, padding='max_length', return_tensors='pt')
        # Drop only the leading batch axis added by return_tensors='pt'.
        input_ids = encoded_input['input_ids'].squeeze(0)
        attention_mask = encoded_input['attention_mask'].squeeze(0)

        linguistic_features = self.extract_linguistic_features(text)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            # squeeze(0) removes the single-document axis without collapsing a
            # one-term vocabulary to a scalar.
            'linguistic_features': torch.tensor(linguistic_features, dtype=torch.float32).squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

### Creating Dataset Instances

In [None]:
# Wrap each split with the shared, already-fitted vectorizer.
train_dataset, valid_dataset, test_dataset = (
    FakeReviewDataset(data[split], vectorizer)
    for split in ('train', 'valid', 'test')
)

### Padding the data

In [None]:
from transformers import DataCollatorWithPadding

# Padding collator built around the RoBERTa tokenizer.
# NOTE(review): the Trainer constructed further down in this notebook is not
# given a data_collator argument, so this object appears to be unused -- confirm.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### Load the evaluation metrics

In [None]:
# Load the four classification metrics that compute_metrics reads as globals.
accuracy, precision, recall, f1 = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the class with the highest
            score along axis 1 of ``predictions`` is taken as the prediction.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        The last three are computed with ``average="binary"``.
    """
    scores, references = eval_pred
    shared = {"predictions": np.argmax(scores, axis=1), "references": references}

    # Accuracy takes no averaging mode; the remaining metrics share one.
    report = {"accuracy": accuracy.compute(**shared)["accuracy"]}
    for key, metric in (("precision", precision), ("recall", recall), ("f1", f1)):
        report[key] = metric.compute(average="binary", **shared)[key]
    return report

### Train model

In [None]:
# Training configuration: checkpoints go to ./model_output and logs to ./logs.
training_args = TrainingArguments(
    output_dir='./model_output',
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,  # gradients from 2 batches are combined per optimizer step
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="steps",  # evaluate on a step schedule (see eval_steps)
    eval_steps=500,
    fp16=False,  # Only set to True if your GPU supports FP16
    load_best_model_at_end=True  # Useful for automatically picking the best model
)

# A fresh copy of the pretrained model is loaded here, replacing the `model`
# object created in the earlier initialization cell.
model = RobertaForSequenceClassification.from_pretrained('roberta-base')

# Train on the training split and evaluate on the validation split; no
# data_collator is passed, so the Trainer falls back to its default.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
    )


In [None]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

### Evaluation

In [None]:
# Evaluate on the held-out test split and print each reported entry as "name : value".
test_results = trainer.evaluate(test_dataset)
for entry in test_results.items():
    print(*entry, sep=" : ")