### This notebook shows the classification of fake papers based on metadata and journal features as well as TF-IDF, word2vec and BERT embeddings

Each cell has a description above it. If any part of the code is unclear, please contact ahmar.hussain@ovgu.de.

First we import the dataset. (It is available in the GitHub repository as 'Classification-of-fake-papers-in-biomedicine-with-machine-learning/dataset without full text with masked features.csv'.)

In [1]:
import pandas as pd

# Location of the dataset CSV on the author's machine; adjust it to wherever your copy is stored.
DATASET_PATH = 'D:\\new_dataset\\final_dataset_bigger.csv'

# Load the full dataset into a DataFrame.
data = pd.read_csv(DATASET_PATH)

checking column names

In [3]:
# Display the column index to check which columns were loaded.
data.columns

Index(['PMID', 'DOI', 'citation_count', 'affiliation_country',
       'numer_of_authors', 'open_access', 'orcid_availability',
       'hospital_affiliation', 'title_count', 'abstract_count',
       'journal_paper', 'title_paper', 'abstract_paper', 'target_variable',
       'journal_year', 'Total Citations', 'Journal impact factor',
       'JIF without self cites', 'Immediacy Index', 'Citable items',
       '% of articles in Citable items', 'Average JIF Percentile',
       'Total Articles', 'Total Reviews', 'Cited Half-Life',
       'Citing Half-Life', 'Eigenfactor score', 'Normalized Eigenfactor'],
      dtype='object')

visual inspection

In [6]:
# Preview the first ten rows as a visual sanity check of the loaded values.
data.head(10)

Unnamed: 0,PMID,DOI,citation_count,affiliation_country,numer_of_authors,open_access,orcid_availability,hospital_affiliation,title_count,abstract_count,...,Immediacy Index,Citable items,% of articles in Citable items,Average JIF Percentile,Total Articles,Total Reviews,Cited Half-Life,Citing Half-Life,Eigenfactor score,Normalized Eigenfactor
0,30867252,10.1042/BSR20190043,45,China,4,True,False,False,12,246,...,0.8,122,67.21,58.9,82,40.0,4.5,8.4,0.0145,3.16874
1,31002124,10.26355/eurrev_201904_17547,7,China,3,False,False,True,12,168,...,0.4,1118,86.94,50.5,972,146.0,3.8,6.6,0.02441,5.31418
2,31858537,10.26355/eurrev_201912_19768,6,China,9,False,False,True,15,215,...,0.4,1118,86.94,50.5,972,146.0,3.8,6.6,0.02441,5.31418
3,31298331,10.26355/eurrev_201907_18318,18,China,6,False,False,True,13,237,...,0.4,1118,86.94,50.5,972,146.0,3.8,6.6,0.02441,5.31418
4,31114982,10.26355/eurrev_201905_17780,4,China,6,False,False,True,15,203,...,0.4,1118,86.94,50.5,972,146.0,3.8,6.6,0.02441,5.31418
5,31858552,10.26355/eurrev_201912_19787,10,China,6,False,False,True,14,216,...,0.4,1118,86.94,50.5,972,146.0,3.8,6.6,0.02441,5.31418
6,31210306,10.26355/eurrev_201906_18059,10,China,8,False,False,False,15,182,...,0.4,1118,86.94,50.5,972,146.0,3.8,6.6,0.02441,5.31418
7,31378884,10.26355/eurrev_201908_18528,3,China,2,False,False,True,15,211,...,0.4,1118,86.94,50.5,972,146.0,3.8,6.6,0.02441,5.31418
8,31298312,10.26355/eurrev_201907_18294,23,China,5,False,False,True,13,238,...,0.4,1118,86.94,50.5,972,146.0,3.8,6.6,0.02441,5.31418
9,31298333,10.26355/eurrev_201907_18320,2,China,6,False,False,True,15,205,...,0.4,1118,86.94,50.5,972,146.0,3.8,6.6,0.02441,5.31418


converting boolean entries (True and False) to integers (1 and 0)

In [7]:
# Cast each boolean flag column (True/False) to an integer column (1/0).
for flag_column in ('orcid_availability', 'open_access', 'hospital_affiliation'):
    data[flag_column] = data[flag_column].astype(int)

one-hot encoding the affiliation_country and journal_paper columns

In [8]:
# One-hot encode the two categorical columns. pd.get_dummies names the new
# columns '<original column>_<category>', so the dummy columns are selected by
# that exact prefix rather than by a loose substring match.

# affiliation_country
data = pd.get_dummies(data, columns=['affiliation_country'])
one_hot_encoded_country_columns = [col for col in data.columns if col.startswith('affiliation_country_')]
# converting boolean values to integer
data[one_hot_encoded_country_columns] = data[one_hot_encoded_country_columns].astype(int)

# journal_paper
data = pd.get_dummies(data, columns=['journal_paper'])
one_hot_encoded_journal_columns = [col for col in data.columns if col.startswith('journal_paper_')]
# converting boolean values to integer
data[one_hot_encoded_journal_columns] = data[one_hot_encoded_journal_columns].astype(int)

Now we preprocess the abstract + title of the papers by lemmatizing them

In [10]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

def lemmatize_text(abstract_and_title):
    """Lower-case a text, tokenize it and return its lemmatized tokens joined by spaces.

    Parameters
    ----------
    abstract_and_title : str
        Text to normalize.

    Returns
    -------
    str
        The lemmatized tokens separated by single spaces.
    """
    wordnet = WordNetLemmatizer()
    words = word_tokenize(abstract_and_title.lower())
    return ' '.join(map(wordnet.lemmatize, words))


# Concatenate abstract and title (separated by one space), then lemmatize the combined text.
abstract_and_title_text = data['abstract_paper'] + ' ' + data['title_paper']
data['abstract+title'] = abstract_and_title_text.apply(lemmatize_text)

### Now we apply different methods to handle the abstract+title columns

first we use TF-IDF (only run this cell if you want to use TF-IDF)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

# English stop words plus section headings of structured abstracts.
# A separate name is used so the imported `stopwords` corpus module is not shadowed.
base_stop_words = stopwords.words('english')
base_stop_words.extend(['background', 'methods', 'results', 'conclusions', 'conclusion', 'method'])  #we remove these words as well to avoid bias of any abstract structure for specific journals

# The text in 'abstract+title' has been passed through lemmatize_text, so the
# stop words must receive the same preprocessing; otherwise lemmatized forms
# (e.g. 'results' -> 'result') would not be filtered out.
lemmatized_stop_words = {token for word in base_stop_words for token in lemmatize_text(word).split()}
stop_words = sorted(set(base_stop_words) | lemmatized_stop_words)

# NOTE(review): the vectorizer is fitted on all rows before the train/test split,
# so document frequencies of the test rows influence the features - confirm this is intended.
vectorizer = TfidfVectorizer(stop_words=stop_words)
tfidf_vectors = vectorizer.fit_transform(data['abstract+title'])

# Reuse the index of `data` so the later column-wise concat aligns row by row.
tfidf_df = pd.DataFrame(tfidf_vectors.toarray(), index=data.index, columns=vectorizer.get_feature_names_out())

next we use word2vec (only run this cell if you want to use word2vec word embeddings)

In [None]:
import pandas as pd
import numpy as np
import gensim
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Initialize stopwords and add custom stopwords
# English stop words plus section headings of structured abstracts.
# A separate name is used so the imported `stopwords` corpus module is not shadowed.
base_stop_words = stopwords.words('english')
base_stop_words.extend(['background', 'methods', 'results', 'conclusions', 'conclusion', 'method'])

# The text was lemmatized by lemmatize_text, so the lemmatized forms of the stop
# words are added as well (e.g. 'results' -> 'result'); a set gives O(1) lookups.
stop_word_set = set(base_stop_words) | {token for word in base_stop_words for token in lemmatize_text(word).split()}

# The lemmatized text lives in the 'abstract+title' column created earlier in this notebook.
data['abstract_tokenized'] = data['abstract+title'].apply(lambda x: [word for word in word_tokenize(x.lower()) if word not in stop_word_set])

# Train word2vec on the tokenized documents of this dataset.
word2vec_model = Word2Vec(sentences=data['abstract_tokenized'], vector_size=100, window=5, min_count=1, workers=4)

def create_document_vector(document):
    """Return the mean word2vec vector of a tokenized document.

    Parameters
    ----------
    document : list of str
        Tokens of one document.

    Returns
    -------
    numpy.ndarray
        Mean of the vectors of all tokens known to ``word2vec_model``; a zero
        vector of the model's dimensionality when no token is known.
    """
    known_tokens = [word for word in document if word in word2vec_model.wv.key_to_index]

    if not known_tokens:
        # Size taken from the model so it always matches the trained vector_size.
        return np.zeros(word2vec_model.wv.vector_size)

    return np.mean(word2vec_model.wv[known_tokens], axis=0)

# One averaged vector per document.
data['word2vec_vector'] = data['abstract_tokenized'].apply(create_document_vector)

# Expand the vectors into one column per dimension. String column names
# ('word2vec_0', 'word2vec_1', ...) are used because scikit-learn rejects frames
# whose column names mix str and int once these columns are concatenated with
# the metadata; the index of `data` is reused so that concat aligns row by row.
word2vec_df = pd.DataFrame(data['word2vec_vector'].tolist(), index=data.index).add_prefix('word2vec_')

next we use BERT embeddings (only run this cell if you want to use BERT embeddings)

the models used are the following, please change the LLM_model_name variable for different models

dmis-lab/biobert-v1.1  
emilyalsentzer/Bio_ClinicalBERT  
microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext  
allenai/scibert_scivocab_uncased  
bionlp/bluebert_pubmed_uncased_L-24_H-1024_A-16  
emilyalsentzer/Bio_ClinicalBERT  
microsoft/biogpt

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np


# Hugging Face model identifier; the tokenizer and the model below are both loaded from it.
LLM_model_name = 'dmis-lab/biobert-v1.1'   # this can be changed to any model name used in the paper
tokenizer = AutoTokenizer.from_pretrained(LLM_model_name)   # tokenizer belonging to the chosen checkpoint
LLM_model = AutoModel.from_pretrained(LLM_model_name)   # model whose last hidden state is pooled in encoding_sentences

def encoding_sentences(sentences):
    """Encode each text into one embedding by mean pooling the model's last hidden state.

    Parameters
    ----------
    sentences : iterable of str
        Texts to encode; each one is tokenized (truncated to 512 tokens) and
        passed through ``LLM_model`` separately.

    Returns
    -------
    numpy.ndarray
        Array holding one embedding per input text.
    """
    def _embed(sentence):
        # Tokenize a single text and run it through the model without tracking gradients.
        encoded = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            model_output = LLM_model(**encoded)
        # Mean over the token dimension gives one vector for the whole text.
        return model_output.last_hidden_state.mean(dim=1).squeeze().numpy()

    return np.array([_embed(sentence) for sentence in sentences])


# The lemmatized text lives in the 'abstract+title' column created earlier in this notebook.
sentence_embeddings = encoding_sentences(data['abstract+title'])

# Embeddings converted to a dataframe. String column names ('bert_0', 'bert_1', ...)
# are used because scikit-learn rejects frames whose column names mix str and int
# once these columns are concatenated with the metadata; the index of `data` is
# reused so that concat aligns row by row.
embedding_df = pd.DataFrame(sentence_embeddings, index=data.index).add_prefix('bert_')


now we combine the metadata and journal features with the textual features

In [None]:
# Append the TF-IDF columns to the metadata/journal columns (column-wise concat).
combined_data = pd.concat([data, tfidf_df], axis=  1)   # run this for TF-IDF

In [None]:
# Append the word2vec columns to the metadata/journal columns (column-wise concat).
combined_data = pd.concat([data, word2vec_df], axis=  1)           # run this for word2vec

In [None]:
# Append the BERT embedding columns to the metadata/journal columns (column-wise concat).
combined_data = pd.concat([data, embedding_df], axis=  1)     # run this for BERT

now we split the data for training

In [None]:
# For using just metadata and journal features:
# identifiers, raw/lemmatized text and the label are removed from the feature matrix.
identifier_text_and_label_columns = ['DOI', 'PMID', 'abstract_paper', 'target_variable', 'abstract+title', 'title_paper']

X = data.drop(columns=identifier_text_and_label_columns)
y = data['target_variable']

In [None]:
# For using just TF-IDF: only the TF-IDF columns form the feature matrix.
# (Dropping columns from combined_data would keep the metadata and journal
# features, which is the "TF-IDF with metadata" setting.)
X = tfidf_df
y = data['target_variable']

In [None]:
# For using just word2vec: only the word2vec columns form the feature matrix.
# (Dropping columns from combined_data would keep the metadata and the
# list-valued 'abstract_tokenized' column.)
# rename(columns=str) returns a copy whose column names are all strings.
X = word2vec_df.rename(columns=str)
y = data['target_variable']

In [None]:
# For using just BERT features: only the embedding columns form the feature matrix.
# (Dropping columns from combined_data would keep the metadata and journal features.)
# rename(columns=str) returns a copy whose column names are all strings.
X = embedding_df.rename(columns=str)
y = data['target_variable']

In [None]:
# For TF-IDF with metadata and journal features:
# identifiers, raw/lemmatized text and the label are removed from the feature matrix.
identifier_text_and_label_columns = ['DOI', 'PMID', 'abstract_paper', 'target_variable', 'abstract+title', 'title_paper']

X = combined_data.drop(columns=identifier_text_and_label_columns)
y = combined_data['target_variable']

In [None]:
# For word2vec with metadata and journal features.
# Identifiers, raw/lemmatized text, the label and the list-valued helper columns
# created by the word2vec cell are not numeric features and must be removed.
# The lemmatized text column of this notebook is 'abstract+title'.
non_feature_columns = ['PMID', 'DOI', 'abstract_paper', 'target_variable', 'abstract+title', 'title_paper',
                       'abstract_tokenized', 'word2vec_vector']

# errors='ignore' skips helper columns that are absent; rename(columns=str) makes
# every column name a string, which scikit-learn requires for DataFrame input
# when the names would otherwise mix str and int.
X = combined_data.drop(columns=non_feature_columns, errors='ignore').rename(columns=str)
y = combined_data['target_variable']

In [None]:
# For BERT based models with metadata and journal features.
# Identifiers, raw/lemmatized text and the label are not numeric features and
# must be removed. The lemmatized text column of this notebook is 'abstract+title';
# 'abstract_tokenized' and 'word2vec_vector' exist only if the word2vec cell was run.
non_feature_columns = ['PMID', 'DOI', 'abstract_paper', 'target_variable', 'abstract+title', 'title_paper',
                       'abstract_tokenized', 'word2vec_vector']

# errors='ignore' skips helper columns that are absent; rename(columns=str) makes
# every column name a string, which scikit-learn requires for DataFrame input
# when the names would otherwise mix str and int.
X = combined_data.drop(columns=non_feature_columns, errors='ignore').rename(columns=str)
y = combined_data['target_variable']

classification and evaluation

first, for the gradient boosting classifier, we find the best hyperparameters

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# Hyperparameter search for the gradient boosting classifier.
# Stratified 70/30 split into train-full/test, then a stratified 80/20 split of
# the train-full part into train/validation.
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42, stratify=y_train_full)

parameter_grid = {
    'n_estimators': [100, 200, 300],         # Number of boosting stages
    'learning_rate': [0.01, 0.1, 0.2],       # Learning rate
    'min_samples_split': [2, 5, 10]          # Minimum number of samples required to split an internal node
}

# Fixed seed so that repeated runs of the search give the same models.
model_gb = GradientBoostingClassifier(random_state=42)

# 5-fold cross-validated grid search on the training part, selecting by recall.
grid_search_gb = GridSearchCV(estimator=model_gb, param_grid=parameter_grid,
                              cv=5, n_jobs=-1, verbose=2, scoring='recall')

grid_search_gb.fit(X_train, y_train)

# Estimator refitted on the whole training part with the best parameter combination.
best_gb = grid_search_gb.best_estimator_

y_val_pred = best_gb.predict(X_val)

print("Validation Classification Report:")
print(classification_report(y_val, y_val_pred))

y_test_pred = best_gb.predict(X_test)

print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

print("Best Parameters: ", grid_search_gb.best_params_)

next we train and test on the best hyperparameters

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import train_test_split

# Stratified 80/20 train/test split for the final gradient boosting model.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 2022,
    stratify = y
)

# Hyperparameters fixed to the values chosen for the final model; the seed makes
# the fitted model reproducible across runs.
model_gb_best = GradientBoostingClassifier(min_samples_split = 10, n_estimators = 300, learning_rate= 0.2,
                                           random_state = 2022)

model_gb_best.fit(X_train, y_train)

y_pred = model_gb_best.predict(X_test)

print(classification_report(y_test, y_pred))

now for logistic regression

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Hyperparameter search for logistic regression.
# Stratified 70/30 split into train-full/test, then a stratified 80/20 split of
# the train-full part into train/validation.
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42, stratify=y_train_full)

parameter_grid = {
    'C': [0.01, 0.1, 1, 10, 100],           # Inverse of regularization strength
    'penalty': ['l2'],                      # Regularization type (L2 regularization)
    'solver': ['liblinear', 'saga']         # Algorithms to use in the optimization problem
}

# max_iter matches the final logistic regression model of this notebook so the
# candidates are compared under the same iteration budget; the seed makes the
# solvers' data shuffling reproducible.
model_lr = LogisticRegression(max_iter=1000, random_state=42)

# 5-fold cross-validated grid search on the training part, selecting by recall.
grid_search_lr = GridSearchCV(estimator=model_lr, param_grid=parameter_grid,
                              cv=5, n_jobs=-1, verbose=2, scoring='recall')

grid_search_lr.fit(X_train, y_train)

# Estimator refitted on the whole training part with the best parameter combination.
best_lr = grid_search_lr.best_estimator_

y_val_pred = best_lr.predict(X_val)

print("Validation Classification Report:")
print(classification_report(y_val, y_val_pred))

y_test_pred = best_lr.predict(X_test)

print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

print("Best Parameters: ", grid_search_lr.best_params_)

next we train and test on the best hyperparameters

In [None]:
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Stratified 80/20 train/test split for the final logistic regression model.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 2022,
    stratify = y
)

# Hyperparameters fixed to the values chosen for the final model.
model_lr_best = LogisticRegression(max_iter=1000, C = 0.1, penalty='l2', solver='liblinear')

model_lr_best.fit(X_train, y_train)

y_pred = model_lr_best.predict(X_test)

print(classification_report(y_test, y_pred))

now for random forest classifier

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Hyperparameter search for the random forest classifier.
# Stratified 70/30 split into train-full/test, then a stratified 80/20 split of
# the train-full part into train/validation.
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42, stratify=y_train_full)

parameter_grid = {
    'n_estimators': [100, 200, 300],         # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],         # Maximum depth of the tree
    'min_samples_split': [2, 5, 10]          # Minimum number of samples required to split an internal node
}

# Random forests are stochastic (bootstrap samples, feature subsets); a fixed
# seed makes the search reproducible.
model_rf = RandomForestClassifier(random_state=42)

# 5-fold cross-validated grid search on the training part, selecting by recall.
grid_search_rf = GridSearchCV(estimator=model_rf, param_grid=parameter_grid,
                              cv=5, n_jobs=-1, verbose=2, scoring='recall')

grid_search_rf.fit(X_train, y_train)

# Estimator refitted on the whole training part with the best parameter combination.
best_rf = grid_search_rf.best_estimator_

y_val_pred = best_rf.predict(X_val)

print("Validation Classification Report:")
print(classification_report(y_val, y_val_pred))

y_test_pred = best_rf.predict(X_test)

print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

print("Best Parameters: ", grid_search_rf.best_params_)

next we train and test on the best hyperparameters

In [None]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Stratified 80/20 train/test split for the final random forest model.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 2022,
    stratify = y
)

# Hyperparameters fixed to the values chosen for the final model; the seed makes
# the (otherwise random) forest reproducible across runs.
model_rf_best = RandomForestClassifier(n_estimators=200, min_samples_split=2, max_depth=30, random_state=2022)

model_rf_best.fit(X_train, y_train)

y_pred = model_rf_best.predict(X_test)

print(classification_report(y_test, y_pred))

now for Naive Bayes classifier

In [None]:
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

# Stratified 80/20 train/test split for the Gaussian Naive Bayes model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2022, stratify=y)

# Gaussian Naive Bayes is used with its default settings.
model_nb_best = GaussianNB()
model_nb_best.fit(X_train, y_train)

# Evaluate on the held-out test part.
y_pred = model_nb_best.predict(X_test)
print(classification_report(y_test, y_pred))

now for decision trees

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# Hyperparameter search for the decision tree classifier.
# Stratified 70/30 split into train-full/test, then a stratified 80/20 split of
# the train-full part into train/validation.
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42, stratify=y_train_full)

parameter_grid = {
    'max_depth': [None, 10, 20, 30],         # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],         # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]            # Minimum number of samples required to be at a leaf node
}

# Fixed seed so that repeated runs of the search give the same trees.
model_dt = DecisionTreeClassifier(random_state=42)

# 5-fold cross-validated grid search on the training part, selecting by recall.
grid_search_dt = GridSearchCV(estimator=model_dt, param_grid=parameter_grid,
                              cv=5, n_jobs=-1, verbose=2, scoring='recall')

grid_search_dt.fit(X_train, y_train)

# Estimator refitted on the whole training part with the best parameter combination.
best_dt = grid_search_dt.best_estimator_

y_val_pred = best_dt.predict(X_val)

print("Validation Classification Report:")
print(classification_report(y_val, y_val_pred))

y_test_pred = best_dt.predict(X_test)

print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

print("Best Parameters: ", grid_search_dt.best_params_)

next we train and test on the best hyperparameters

In [None]:
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Stratified 80/20 train/test split for the final decision tree model.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 2022,
    stratify = y
)

# Hyperparameters fixed to the values chosen for the final model; the seed makes
# the fitted tree reproducible across runs.
model_dt_best = DecisionTreeClassifier(min_samples_split=2, max_depth=None, min_samples_leaf=1, random_state=2022)

model_dt_best.fit(X_train, y_train)

y_pred = model_dt_best.predict(X_test)

print(classification_report(y_test, y_pred))