### This notebook shows the classification of fake papers based on full text features 

Each cell has a description above it. If there are doubts in any parts of the code please contact at (ahmar.hussain@ovgu.de)

Classification with full text features i.e. readability scores, percentage of active voice, clause density, Type-Token Ratio etc.

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier


df_combined = pd.read_csv('D:\\new_dataset\\full text just grammar features.csv')

X = df_combined.drop(columns = ['PMID', 'DOI', 'target_variable'])
y = df_combined.target_variable




X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 2022,
    stratify = y
)



rf_mixed = GradientBoostingClassifier(min_samples_split = 10, n_estimators = 300, learning_rate= 0.2)

rf_mixed.fit(X_train, y_train)

y_pred = rf_mixed.predict(X_test)

print(classification_report(y_test, y_pred))

summarizing full text and classifying based on BERT embeddings of full text

In [4]:
embeddings_dataframe

In [None]:
import os
import lxml.etree as ET
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel, BartTokenizer, BartForConditionalGeneration
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder


BERT_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
BERT_model = BertModel.from_pretrained('bert-base-uncased')

summarize_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
summarize_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')


#summarizing function
def summarizing_text(text, max = 150, min = 50):
    inputs = summarize_tokenizer(text, return_tensors = 'pt', max_length = 1024, truncation = True)
    summary_ids = summarize_model.generate(inputs['input_ids'], max_length = max, min_length = min, length_penalty=2.0, num_beams=4, early_stopping=True)
    summarized_text = summarize_tokenizer.decode(summary_ids[0], skip_special_tokens = True)
    return summarized_text

#produce embeddings
def embedding_text(text):
    inputs = BERT_tokenizer(text, return_tensors = 'pt', max_length = 512, truncation = True, padding = 'max_length')
    outputs = BERT_model(**inputs)
    embeddings = outputs.last_hidden_state[:, 0, :].detach.numpy()
    return embeddings.flatten()


def text_extraction(xml_path):

    try:
        tree = ET.parse(path)
        root = tree.getroot()
        ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

        #extracting text after abstract uptill references

        abstract = root.find('.//tei:abstract', namespaces=ns)
        references = root.find('.//tei:div[@type="references"]', namespaces=ns)

        if abstract is not None and references is not None:
            abstract_index = list(root.iter()).index(abstract)
            reference_index = list(root.iter()).index(references)
            text = []
            for elem in list(root.iter())[abstract_index:reference_index]:
                text.append(elem.text or "")
            return " ".join(text)
        else:
            return None
    except Exception as e:
        return None
    
# Directories
nonfakes_directory = "D:\\new_dataset\\full text xmls\\nonfakes"    #These files can be found in the folder 'full text xmls' in the github repository
fakes_directory = "D:\\new_dataset\\full text xmls\\fakes"


data = {
    'Text': [],
    'Label': []
}


#going through non fakes
for xml in os.listdir(nonfakes_directory):
    if xml.endswith('.xml'):
        xml_path = os.path.join(nonfakes_directory, xml)
        xml_text = text_extraction(xml)
        if xml_text is not None:
            #summarize the extracted text
            summarized_text = summarizing_text(xml_text)
            data['Text'].append(summarized_text)
            data['Label'].append(0)


#going through fakes
for xml in os.listdir(fakes_directory):
    if xml.endswith('.xml'):
        xml_path = os.path.join(fakes_directory, xml)
        xml_text = text_extraction(xml)
        if xml_text is not None:
            #summarize the extracted text
            summarized_text = summarizing_text(xml_text)
            data['Text'].append(summarized_text)
            data['Label'].append(1)


#converting to dataframe
dataframe = pd.DataFrame(data)

encoder = LabelEncoder()
dataframe['Label'] = encoder.fit_transform(dataframe['Label'])

embeddings = []
for text in dataframe['Text']:
    embeddings.append(embedding_text(text))


embeddings_dataframe = pd.DataFrame(embeddings)


X_train, X_test, y_train, y_test = train_test_split(embeddings_dataframe, dataframe['Label'], test_size=0.2, random_state=42)


clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)


y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_report(y_test, y_pred))


classifying chunks and aggregating through majority vote

In [5]:
import os
import lxml.etree as ET
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import numpy as np
from collections import Counter

BERT_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
BERT_model = BertModel.from_pretrained('bert-base-uncased')

def chunking_text(text, max = 512):
    inputs = BERT_tokenizer(text, return_tensors = 'pt', max_length = max, truncation = False, padding=False, add_special_tokens=False)
    input_ids = inputs['input_ids'][0]
    chunks = []

    for i in range(0, len(input_ids), max):
        chunk = input_ids[i:i+max]
        chunks.append(chunk)

    return chunks



def embedding_chunks(chunk):
    inputs = {'input_ids': chunk.unsqueeze(0)}
    outputs = summarize_model(**inputs)
    embeddings = outputs.last_hidden_state[:, 0, :].detach.numpy()
    return embeddings.flatten()


def text_extraction(xml_path):

    try:
        tree = ET.parse(path)
        root = tree.getroot()
        ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

        #extracting text after abstract uptill references

        abstract = root.find(('.//tei:abstract', namespaces=ns))
        references = root.find('.//tei:div[@type="references"]', namespaces=ns)

        if abstract is not None and references is not None:
            abstract_index = list(root.iter()).index(abstract)
            reference_index = list(root.iter()).index(references)
            text = []
            for elem in list(root.iter())[abstract_index:reference_index]:
                text.append(elem.text or "")
            return " ".join(text)
        else:
            return None
    except Exception as e:
        return None


nonfakes_directory = "D:\\new_dataset\\full text xmls\\nonfakes"        #These files can be found in the folder 'full text xmls' in the github repository
fakes_directory = "D:\\new_dataset\\full text xmls\\fakes"


data = {
    'Text': [],
    'Label': [],  # 0 for non-fakes, 1 for fakes
    'Document_ID': []  
}

doc_id = 0


#going through non fakes
for xml in os.listdir(nonfakes_directory):
    if xml.endswith('.xml'):
        xml_path = os.path.join(nonfakes_directory, xml)
        xml_text = text_extraction(xml)
        if xml_text is not None:
            #summarize the extracted text
            summarized_text = summarizing_text(xml_text)
            data['Text'].append(summarized_text)
            data['Label'].append(0)
            data['Document_ID'].append(doc_id)
            doc_id += 1


#going through fakes
for xml in os.listdir(fakes_directory):
    if xml.endswith('.xml'):
        xml_path = os.path.join(fakes_directory, xml)
        xml_text = text_extraction(xml)
        if xml_text is not None:
            #summarize the extracted text
            summarized_text = summarizing_text(xml_text)
            data['Text'].append(summarized_text)
            data['Label'].append(1)
            data['Document_ID'].append(doc_id)
            doc_id += 1



encoder = LabelEncoder()
dataframe['Label'] = encoder.fit_transform(df['Label'])


chunk_embeddings = []
chunk_labels = []
chunk_document_ids = []



for idx, row in dataframe.iterrows():
    text = row['Text']
    label = row['Label']
    document_id = row['Document_ID']
    
 
    chunks = chunking_text(text)
    

    for chunk in chunks:
        embedding = embedding_chunks(chunk)
        chunk_embeddings.append(embedding)
        chunk_labels.append(label)
        chunk_document_ids.append(document_id)


chunk_embeddings_dataframe = pd.DataFrame(chunk_embeddings)


X_train, X_test, y_train, y_test, doc_train, doc_test = train_test_split(
    chunk_embeddings_dataframe, chunk_labels, chunk_document_ids, test_size=0.2, random_state=42
)


clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)


y_pred = clf.predict(X_test)


def aggregation(chunk_predictions, ids):
    predictions = {}

    for doc_id in np.unique(ids):
        # Get the predictions for all chunks of this document
        doc_chunk_preds = [chunk_predictions[i] for i, d in enumerate(ids) if d == doc_id]
        
        # Aggregate via majority voting
        most_common_pred = Counter(doc_chunk_preds).most_common(1)[0][0]
        predictions[doc_id] = most_common_pred
    
    return predictions

doc_level_predictions = aggregation(y_pred, doc_test)


doc_true_labels = {doc_id: y_test[i] for i, doc_id in enumerate(doc_test)}


doc_ids_test = list(doc_true_labels.keys())
y_true_doc = [doc_true_labels[doc_id] for doc_id in doc_ids_test]
y_pred_doc = [doc_level_predictions[doc_id] for doc_id in doc_ids_test]

doc_accuracy = accuracy_score(y_true_doc, y_pred_doc)

print(f"Document-level Accuracy: {doc_accuracy}")
print("Document-level Classification Report:")
print(classification_report(y_true_doc, y_pred_doc))



Generating BERT embeddings for each chunk...
Document-level Accuracy: 0.7671084663240465
Document-level Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.93      0.85      2702
           1       0.63      0.33      0.44       995

    accuracy                           0.77      3697
   macro avg       0.71      0.63      0.64      3697
weighted avg       0.75      0.77      0.74      3697

