In [5]:
#libraries
import pandas as pd 
import numpy as np
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import classification_report
import sklearn.metrics as metrics

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load BERT model tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Set max sequence length
MAX_SEQ_LENGTH = 128

class Model:
    def load_model(self, load_path):
        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
        checkpoint = torch.load(load_path, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        model = model.to(device)
        model.eval()
        print(f'Model loaded from <== {load_path}')
        return model

    # predict sentence label , for model 1, (0 prediction refers to bot, 1 human), 
   
  
    def predict_hate(self, model, sentence):
        tokens = tokenizer.encode_plus(
            sentence,
            max_length=MAX_SEQ_LENGTH,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt')
        tokens = tokens.to(device)
        with torch.no_grad():
            outputs = model(tokens['input_ids'], token_type_ids=None, attention_mask=tokens['attention_mask'])
        logits = outputs[0]
        _, predicted = torch.max(logits, dim=1)
        return predicted.item()

    def predict_proba(self, data):
    # Load Model and Evaluate, final out put would be (0 prediction refers to bot, 1 refers to human)
        model1 = self.load_model('model_1.pt')

        predictions=[]
        for post in data:
            result1=self.predict_hate(model1, post)
            if result1==0:
                predictions.append('bot')
            else:

                predictions.append('human')
        return np.array(predictions)

# Instantiate the model
model = Model()

In [7]:
# Read your test data (in your data you dont need label column)
test = pd.read_csv('poitifact_real_output_raw_id.csv')

## Clean the text as like ths. its important it has to be like this
test['full_text'] = test['full_text'].astype(str).str.lower()  # Convert text to lowercase
test['full_text'] = test['full_text'].str.replace(r'http\S+', 'http')  # Remove URLs while preserving "http"
test['full_text'] = test['full_text'].str.replace(r'[^\w\s#@]', '')  # Remove punctuation except hashtags and mention
test['full_text'] = test['full_text'].str.replace(r'\n', '')  # Remove newline characters
test['full_text'] = test['full_text'].str.replace(r'\r', '')  # Remove line breaks
test['full_text'] = test['full_text'].astype(str)

model1 = model.load_model('model_1.pt')

predictions=[]
count = 0
for post in test['full_text'][:3000]:
    result1=model.predict_hate(model1, post)
    if result1==0:
        predictions.append('bot')
    else:
        predictions.append('human')
    count = count + 1
    if count%100 == 0:
        print(str(count)+' done!!')
if count%100 == 0:
    predictions = np.array(predictions)

#predictions = model.predict_proba(test['full_text'][:100]) # sent your test data for prediction

# # you dont need this part since you dont have any label
# accuracy = metrics.classification_report(test['label'][:100], predictions, digits=3)
# print('Accuracy of model cascade: \n')
# print(accuracy)

  test['full_text'] = test['full_text'].str.replace(r'http\S+', 'http')  # Remove URLs while preserving "http"
  test['full_text'] = test['full_text'].str.replace(r'[^\w\s#@]', '')  # Remove punctuation except hashtags and mention
  test['full_text'] = test['full_text'].str.replace(r'\n', '')  # Remove newline characters
  test['full_text'] = test['full_text'].str.replace(r'\r', '')  # Remove line breaks
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a Be

Model loaded from <== model_1.pt
100 done!!
200 done!!
300 done!!
400 done!!
500 done!!
600 done!!
700 done!!
800 done!!
900 done!!
1000 done!!
1100 done!!
1200 done!!
1300 done!!
1400 done!!
1500 done!!
1600 done!!
1700 done!!
1800 done!!
1900 done!!
2000 done!!
2100 done!!
2200 done!!
2300 done!!
2400 done!!
2500 done!!
2600 done!!
2700 done!!
2800 done!!
2900 done!!
3000 done!!


In [8]:
result_df = pd.DataFrame()
result_df['Text'] = test['full_text'][:3000]
result_df['label'] = predictions

result_df

Unnamed: 0,Text,label
0,small business owners join the national federa...,bot
1,nearly a third of main street businesses say i...,human
2,hr704 new commemorating the 75th anniversary o...,bot
3,a record number of small business owners are s...,human
4,the state director of the national federation ...,bot
...,...,...
2995,national federation of independent business re...,human
2996,do you know about section 179 heres an infogra...,human
2997,in 2012 saturday shoppers spent an estimated 5...,human
2998,just in chief justice john roberts issues rare...,human


In [9]:
result_df.to_csv('poitifact_real_botometer.csv',index=False)

In [10]:
result_df['label'].value_counts()

human    1947
bot      1053
Name: label, dtype: int64