## Pre-processing, cleaning dataset

Connect to drive, set directory

In [None]:
# Mount Google Drive into the Colab runtime so the project files are reachable.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
cd '/content/drive/MyDrive/Thesis_UU/push/3group'

Import Libraries

In [None]:
import pandas as pd
import numpy as np
import re

Load csv files of retracted (both due to error and misconduct) and non-retracted articles

In [None]:
# Non-retracted articles. The utf-8-sig codec skips a leading byte-order mark.
non_retracted = pd.read_csv(
    'Data (CSV)/non_retracted.csv',
    encoding="utf-8-sig",
)

In [None]:
# Retracted articles (error and misconduct). The utf-8-sig codec skips a leading byte-order mark.
retracted = pd.read_csv(
    'Data (CSV)/sub_retracted.csv',
    encoding="utf-8-sig",
)

Some words were split across two lines with a hyphen followed by a line break (`-\n`). To restore the original words, every `-\n` is removed

In [None]:
# Re-join words hyphenated across a line break: delete every "-<newline>"
# pair, in all columns of the frame.
non_retracted = non_retracted.replace(to_replace='-\n', value='', regex=True)

In [None]:
# Re-join words hyphenated across a line break: delete every "-<newline>"
# pair, in all columns of the frame.
retracted = retracted.replace(to_replace='-\n', value='', regex=True)

Replace the remaining line breaks (`\n`) with a space for both retracted and non-retracted papers

In [None]:
# Turn every remaining line break into a single space (all columns).
non_retracted = non_retracted.replace(to_replace='\n', value=' ', regex=True)

In [None]:
# Turn every remaining line break into a single space (all columns).
retracted = retracted.replace(to_replace='\n', value=' ', regex=True)

Concat to one dataframe

In [None]:
# Stack both groups into one frame; ignore_index=True renumbers the rows 0..n-1.
frames_to_stack = [non_retracted, retracted]
papers = pd.concat(frames_to_stack, ignore_index=True)

In [None]:
# Peek at the first two rows of the combined frame.
papers.iloc[:2]

In [None]:
# List the columns of the combined frame (union of both source files' columns).
papers.columns

Remove rows with missing values

In [None]:
# Drop every paper whose scraped 'Content' is missing.
# dropna() has to run on the DataFrame with subset=[...]: calling it in place
# on the papers['Content'] Series only changes that Series and leaves the rows
# of `papers` untouched.
papers = papers.dropna(subset=['Content'])

Check for duplicates (retracted)

In [None]:
retracted.columns  # inspect the available columns before de-duplicating on RetractionDOI

In [None]:
# Rows whose RetractionDOI already occurred in an earlier row.
dup = papers[papers.duplicated(subset=['RetractionDOI'])]
dup_rdois = list(dup.RetractionDOI.unique())  # unique repeated DOIs

# Values that repeat without identifying one specific paper: the literal
# placeholder 'unavailable' and missing values (presumably every non-retracted
# row has no retraction DOI — verify).
notremove = ['unavailable', np.nan]
# Missing values are excluded with pd.notna() rather than `not in notremove`:
# list membership on NaN only works when it is the very same np.nan object, and
# a NaN left in `toremove` would make isin() below drop every row without a
# retraction DOI.
toremove = [
    duprdoi
    for duprdoi in dup_rdois
    if pd.notna(duprdoi) and duprdoi not in notremove
]

# Remove ALL rows carrying a repeated retraction DOI (first occurrence included).
papers = papers[~papers.RetractionDOI.isin(toremove)]

In [None]:
# Number of papers left after removing duplicated retraction DOIs.
papers.shape[0]

Check for duplicates (non-retracted)

In [None]:
non_retracted.columns  # inspect the available columns before checking DOI_y for duplicates

In [None]:
# Show non-retracted rows whose DOI_y repeats an earlier row (display only;
# nothing is removed here).
repeats_doi = non_retracted.duplicated(subset=['DOI_y'])
non_retracted.loc[repeats_doi]

Original scraped content is kept as 'All content'

In [None]:
# Keep an untouched copy of the scraped text; 'Content' is modified below.
papers = papers.assign(**{'All content': papers['Content']})

Check / remove retraction notice

In [None]:
# Number of papers whose text contains a retraction notice. Comparing with
# True turns missing results (non-text content) into False.
has_notice = papers['All content'].str.contains('Notice of Retraction') == True
len(papers[has_notice])

In [None]:
# Strip a leading retraction notice: everything from "Notice of Retraction" at
# the start of the text up to the FIRST following "ieee.org.".
# The quantifier is non-greedy (.+?) on purpose: line breaks were already
# replaced by spaces, so a greedy .+ would run to the LAST "ieee.org." in the
# paper and delete article text along with the notice.
papers['Content'] = papers['Content'].str.replace(
    r'^Notice of Retraction.+?ieee\.org\.', '', regex=True
)

Remove papers from non-retracted that are actually retracted

In [None]:
# Number of papers before removing suspicious non-retracted ones.
papers.shape[0]

In [None]:
# Non-retracted papers whose text mentions retraction/withdrawal are treated as
# actually retracted and removed from the dataset.
retraction_words = 'Retraction|retraction|Retracted|retracted|retract|Retract|withdrawn|Withdraw'

non_retracted = papers[papers['class'] == 'nr']
mentions_retraction = non_retracted["Content"].str.contains(retraction_words, na=False)
doi_toremove = non_retracted.loc[mentions_retraction, 'DOI_x']

# NOTE(review): removal is keyed on DOI_x, so every row sharing a flagged DOI_x
# goes too; if a flagged row has no DOI_x, isin() presumably also matches all
# other rows with a missing DOI_x — verify against the data.
papers = papers[~papers.DOI_x.isin(doi_toremove)]

In [None]:
# Number of papers after removing suspicious non-retracted ones.
papers.shape[0]

Splitting the content into 4 parts: 1) Abstract, 2) main content (Introduction-Results), 3) Discussion/Conclusion and 4) References. 

In [None]:
# Cut each paper at the reference-list heading.
# - `n` is passed by keyword: pandas >= 2.0 accepts only `pat` positionally.
# - n=2 allows up to two cuts, so part 1 is the text between the first and the
#   second heading match (the whole remainder when the heading occurs once).
# - The split is computed once and padded to three columns so that part 1
#   exists (as missing values) even if no paper contains a heading.
references_heading = r'References|R E F E R E N C E S|REFERENCES|r e f e r e n c e s|Reference List'
content_parts = (
    papers['Content']
    .str.split(references_heading, n=2, expand=True)
    .reindex(columns=range(3))
)
papers['References'] = content_parts[1]
papers['Content'] = content_parts[0]  # text before the first heading

In [None]:
# Spot-check the remaining content (reference list cut off) of the first two papers.
papers['Content'].head(2)

In [None]:
# 'Text' keeps the article body without references; 'Content' is split further below.
papers = papers.assign(Text=papers['Content'])

In [None]:
# Cut each paper at the first Discussion/Conclusion heading.
# - `n` is passed by keyword: pandas >= 2.0 accepts only `pat` positionally.
# - n=2 allows up to two cuts, so part 1 is the text between the first and the
#   second heading match (the whole remainder when there is a single match).
# - The split is computed once and padded to three columns so that part 1
#   exists (as missing values) even if no paper contains a heading.
discussion_heading = r'Discussion|D I S C U S S I O N|DISCUSSION|d i s c u s s i o n|Conclusion|C O N C L U S I O N|CONCLUSION|c o n c l u s i o n'
content_parts = (
    papers['Content']
    .str.split(discussion_heading, n=2, expand=True)
    .reindex(columns=range(3))
)
papers['Discussion / Conclusion'] = content_parts[1]
papers['Content'] = content_parts[0]  # text before the first heading

In [None]:
# Cut what is left at the Introduction heading: part 0 is title + abstract,
# part 1 the main content.
# - `n` is passed by keyword: pandas >= 2.0 accepts only `pat` positionally.
# - n=2 allows up to two cuts, so part 1 ends at a second heading match if
#   there is one.
# - The split is computed once and padded to three columns so that part 1
#   exists (as missing values) even if no paper contains a heading.
introduction_heading = r'Introduction|I N T R O D U C T I O N|INTRODUCTION|i n t r o d u c t i o n'
content_parts = (
    papers['Content']
    .str.split(introduction_heading, n=2, expand=True)
    .reindex(columns=range(3))
)
papers['Main content'] = content_parts[1]
papers['Title + Abstract'] = content_parts[0]

# 'Content' is fully split up now; drop it by reassignment instead of mutating
# a filtered frame in place.
papers = papers.drop(columns='Content')

Check for null values (NaN/None) in References: missing references indicate that the scraper did not capture the original article but some other text relating to it. Thus, we exclude the rows with missing values for references.

In [None]:
# Dataset size and number of papers lacking each extracted section.
missing_counts = papers[['Main content', 'Discussion / Conclusion', 'References']].isnull().sum()
print("Total papers:", len(papers))
print("Missing main content:", missing_counts['Main content'])
print("Missing discussion/conclusion:", missing_counts['Discussion / Conclusion'])
print("Missing references:", missing_counts['References'])

In [None]:
# Drop papers for which no reference list was found.
# Rows are filtered on the null mask itself instead of going through their ID:
# matching on ID would also remove other rows that share an ID (or have a
# missing ID) with an affected paper.
lacks_references = papers['References'].isnull()
no_references = papers[lacks_references]  # removed rows, kept for inspection
papers = papers[~lacks_references]

In [None]:
# Dataset size and number of papers lacking each extracted section.
missing_counts = papers[['Main content', 'Discussion / Conclusion', 'References']].isnull().sum()
print("Total papers:", len(papers))
print("Missing main content:", missing_counts['Main content'])
print("Missing discussion/conclusion:", missing_counts['Discussion / Conclusion'])
print("Missing references:", missing_counts['References'])

Removal of papers which do not follow the typical research paper structure, i.e. do not contain the words Introduction and Discussion or Conclusion

In [None]:
# Drop papers without an Introduction heading (no 'Main content' extracted).
# Rows are filtered on the null mask itself instead of going through their ID:
# matching on ID would also remove other rows that share an ID (or have a
# missing ID) with an affected paper.
lacks_introduction = papers['Main content'].isnull()
no_introduction = papers[lacks_introduction]  # removed rows, kept for inspection
papers = papers[~lacks_introduction]

In [None]:
# Dataset size and number of papers lacking each extracted section.
missing_counts = papers[['Main content', 'Discussion / Conclusion', 'References']].isnull().sum()
print("Total papers:", len(papers))
print("Missing main content:", missing_counts['Main content'])
print("Missing discussion/conclusion:", missing_counts['Discussion / Conclusion'])
print("Missing references:", missing_counts['References'])

In [None]:
# Drop papers without a Discussion/Conclusion heading.
# Rows are filtered on the null mask itself instead of going through their ID:
# matching on ID would also remove other rows that share an ID (or have a
# missing ID) with an affected paper.
lacks_discussion = papers['Discussion / Conclusion'].isnull()
no_discussion = papers[lacks_discussion]  # removed rows, kept for inspection
papers = papers[~lacks_discussion]

In [None]:
# Dataset size and number of papers lacking each extracted section.
missing_counts = papers[['Main content', 'Discussion / Conclusion', 'References']].isnull().sum()
print("Total papers:", len(papers))
print("Missing main content:", missing_counts['Main content'])
print("Missing discussion/conclusion:", missing_counts['Discussion / Conclusion'])
print("Missing references:", missing_counts['References'])

Check journals

In [None]:
# Journal names live in 'Journal' for retracted papers and in 'Source_Title'
# for non-retracted ones. Collect them in a single column 'Journal_Name':
# take 'Journal' where present, otherwise fall back to 'Source_Title'.
journal = papers['Journal']
papers['Journal_Name'] = journal.where(journal.notna(), papers['Source_Title'])

In [None]:
# Lowercase the journal names so that spellings differing only in case match.
papers = papers.assign(Journal_Name=papers['Journal_Name'].str.lower())

In [None]:
# Show every distinct journal name to spot spelling variants.
set(papers.Journal_Name)

Some journal names are written differently in retracted vs. non-retracted papers, so the names are unified to be the same for all groups.
Further, non-alphanumeric characters (such as commas and apostrophes) that are included in some journal names are removed

In [None]:
# (spelling found in the data, unified spelling). The pairs are applied in this
# order as plain substring replacements (regex=False).
journal_renames = [
    ('artificial cells, nanomedicine, and biotechnology', 'artificial cells nanomedicine and biotechnology'),
    ('clinical cancer research: an official journal of the american association for cancer research', 'clinical cancer research'),
    ('evidence-based complementary and alternative medicine (ecam)', 'evidence-based complementary and alternative medicine'),
    ('journal of bone and mineral research: the official journal of the american society for bone and mineral research', 'journal of bone and mineral research'),
    ('journal of controlled release: official journal of the controlled release society', 'journal of controlled release'),
    ('journal of neurochemistry (jnc)', 'journal of neurochemistry'),
    ('journal of the american college of cardiology (jacc)', 'journal of the american college of cardiology'),
    ('materials science and engineering a-structural materials properties microstructure and processing', 'materials science and engineering'),
    ('materials science and engineering: a', 'materials science and engineering'),
    ('molecular cancer research (mcr)', 'molecular cancer research'),
    ("naunyn-schmiedeberg's archives of pharmacology", 'naunyn-schmiedebergs archives of pharmacology'),
    ('spectrochimica acta part a, molecular and biomolecular spectroscopy', 'spectrochimica acta part a'),
    ('the embo journal', 'embo journal'),
    ('the journal of biological chemistry', 'journal of biological chemistry'),
    ('the journal of neuroscience : the official journal of the society for neuroscience', 'journal of neuroscience'),
    ('the scientific world journal', 'scientific world journal'),
    ('tumor biology (tumour biology) - official journal of the international society of oncology and biomarkers (isobm)', 'tumor biology'),
]

for variant, unified in journal_renames:
    papers['Journal_Name'] = papers['Journal_Name'].str.replace(variant, unified, regex=False)

In [None]:
# Number of distinct journal names after unification.
distinct_journals = set(papers['Journal_Name'])
len(distinct_journals)

In [None]:
# Number of papers per journal, most frequent first.
papers.Journal_Name.value_counts()

Remove journals that are only present in one or two classes


In [None]:
# Count papers for every (journal, class) combination that occurs in the data.
journal_class_groups = papers.groupby(['Journal_Name', 'class'])
papers_per_journal_class = journal_class_groups.size()

In [None]:
# Turn the Series of counts into a one-column DataFrame (needed for the
# unstack/any(axis=1) steps that follow).
papers_per_journal_class = papers_per_journal_class.to_frame(
)

In [None]:
# Pivot the 'class' index level into columns: one row per journal, one column
# per class; combinations that never occur become NaN.
papers_per_journal_class = papers_per_journal_class.unstack('class')

In [None]:
papers_per_journal_class  # display: number of papers per journal (rows) and class (columns)

In [None]:
# Journals with a NaN count in at least one class column, i.e. journals that
# contain no papers for one of the classes.
lacks_a_class = papers_per_journal_class.isna().any(axis=1)
na_journals_df = papers_per_journal_class.loc[lacks_a_class]
na_journals_df

In [None]:
# Journals which contain no papers for one of the classes.
# NOTE(review): this is a hard-coded list, presumably copied from the index of
# na_journals_df in an earlier run — confirm it still matches when the input
# data changes.
na_journals = [
    'anaesthesia',
    'biochemistry',
    'cancer biomarkers',
    'cancer management and research',
    'cancer science',
    'carcinogenesis',
    'cell death & disease',
    'cellular physiology and biochemistry',
    'chemical communications',
    'clinical cancer research',
    'diabetes',
    'diagnostic pathology',
    'european journal of anaesthesiology',
    'international journal of cancer',
    'international journal of cardiology',
    'international journal of hydrogen energy',
    'journal of materials chemistry a',
    'journal of neurochemistry',
    'journal of physics: conference series',
    'journal of the american college of cardiology',
    'journal of thoracic and cardiovascular surgery',
    'journal of virology',
    'microbiology',
    'molecular and cellular biology',
    'molecular cancer research',
    'molecular cell',
    'neuroscience letters',
    'nucleic acids research',
    'oncology reports',
    'procedia - social and behavioral sciences',
    'science',
    'scientific reports',
    'spectrochimica acta part a',
]

In [None]:
# Keep only papers from journals that are represented in every class.
in_incomplete_journal = papers.Journal_Name.isin(na_journals)
papers = papers[~in_incomplete_journal]

Check size of classes

In [None]:
# Final size of the dataset.
papers.shape[0]

In [None]:
# Size of the non-retracted class.
is_non_retracted = papers['class'] == 'nr'
len(papers[is_non_retracted])

In [None]:
# Size of the error class.
is_error = papers['class'] == 'e'
len(papers[is_error])

In [None]:
# Size of the misconduct class.
is_misconduct = papers['class'] == 'm'
len(papers[is_misconduct])

In [None]:
# Number of journals left in the final dataset.
remaining_journals = set(papers.Journal_Name)
len(remaining_journals)

Define the important columns

In [None]:
# List all columns currently in the frame before selecting the ones to keep.
papers.columns

In [None]:
# Reduce the frame to the columns that are written to the output file.
important_columns = ['ID', 'All content', 'Text', 'Journal_Name', 'class', 'Retracted']
papers = papers.loc[:, important_columns]

Investigate the final dataframe

In [None]:
# IDs of the first ten papers.
papers['ID'].head(10)

In [None]:
# Text (article body without references) of the first seven papers.
papers['Text'].head(7)

Save the output to CSV

In [None]:
# Write the cleaned dataset to disk. The row index is written too, as the
# first, unnamed column.
output_csv = 'Data (CSV)/preprocessing_part_1.csv'
papers.to_csv(output_csv)