## Pre-processing, cleaning dataset

Connect to drive, set directory

In [None]:
# Attach Google Drive to this Colab runtime so the project files are reachable
# under /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
cd '/content/drive/MyDrive/Thesis_UU/push/3group'

/content/drive/MyDrive/Thesis_UU/Common_20_05_2022/Code


Import Libraries

In [None]:
import pandas as pd
import numpy as np
import re

Load csv files of retracted (both due to error and misconduct) and non-retracted articles

In [None]:
# Non-retracted articles, read with the 'utf-8-sig' codec.
non_retracted = pd.read_csv(
    filepath_or_buffer='Data (CSV)/non_retracted.csv',
    encoding='utf-8-sig',
)

In [None]:
# Retracted articles (error and misconduct), read with the 'utf-8-sig' codec.
retracted = pd.read_csv(
    filepath_or_buffer='Data (CSV)/sub_retracted.csv',
    encoding='utf-8-sig',
)

Some words were split across two lines with a hyphen followed by a line break. To restore the original words, every `-\n` sequence is removed.

In [None]:
# Delete every hyphen + line-break sequence in the string cells of the frame,
# which rejoins words that were split across two lines.
non_retracted = non_retracted.replace(to_replace='-\n', value='', regex=True)

In [None]:
# Same hyphen + line-break clean-up as for the non-retracted frame.
retracted = retracted.replace(to_replace='-\n', value='', regex=True)

Replace the remaining line breaks (\n) with a space in both retracted and non-retracted papers

In [None]:
# Turn each remaining line break into a single space (regex replace applied
# to the string cells of the whole frame).
non_retracted = non_retracted.replace(to_replace='\n', value=' ', regex=True)

In [None]:
# Same line-break-to-space replacement as for the non-retracted frame.
retracted = retracted.replace(to_replace='\n', value=' ', regex=True)

Concat to one dataframe

In [None]:
# Stack both groups into one frame (non-retracted rows first) with a fresh
# 0..n-1 index.
papers = pd.concat(objs=[non_retracted, retracted], axis=0, ignore_index=True)

In [None]:
# Show the first two rows of the combined frame.
papers.iloc[:2]

Unnamed: 0.2,Unnamed: 0,DOI_x,Content,Retracted,ID,Unnamed: 0.1,Publication Type,Authors,Book Authors,Book Editors,...,RetractionDOI,RetractionPubMedID,OriginalPaperDate,OriginalPaperDOI,OriginalPaperPubMedID,RetractionNature,Reason,Paywalled,Notes,misconduct
0,0,nr4461.pdf,CORRECTION Correction: Spinal Cord Transection...,0,nr4461,4461,J,,,,...,,,,,,,,,,
1,1,nr4462.pdf,Correction Correction: Genome-Wide Comparative...,0,nr4462,4462,J,,,,...,,,,,,,,,,


In [None]:
# Column labels of the combined frame (DataFrame.keys() returns the columns index).
papers.keys()

Index(['Unnamed: 0', 'DOI_x', 'Content', 'Retracted', 'ID', 'Unnamed: 0.1',
       'Publication Type', 'Authors', 'Book Authors', 'Book Editors',
       ...
       'RetractionDOI', 'RetractionPubMedID', 'OriginalPaperDate',
       'OriginalPaperDOI', 'OriginalPaperPubMedID', 'RetractionNature',
       'Reason', 'Paywalled', 'Notes', 'misconduct'],
      dtype='object', length=101)

Remove rows with missing values

In [None]:
# Drop rows whose scraped text is missing.
# Calling dropna(inplace=True) on the selected 'Content' column only alters that
# Series object and never removes rows from `papers`, so the frame itself is
# filtered and reassigned instead.
papers = papers.dropna(subset=['Content'])

Check for duplicates (retracted)

In [None]:
# Inspect the column labels available for retracted papers.
retracted.keys()

Index(['Unnamed: 0', 'DOI', 'Content', 'Retracted', 'ID', 'Unnamed: 0.1',
       'Unnamed: 0.1.1', 'Record_ID', 'Title', 'Subject', 'Institution',
       'Journal', 'Publisher', 'Country', 'Author', 'URLS', 'ArticleType',
       'RetractionDate', 'RetractionDOI', 'RetractionPubMedID',
       'OriginalPaperDate', 'OriginalPaperDOI', 'OriginalPaperPubMedID',
       'RetractionNature', 'Reason', 'Paywalled', 'Notes', 'misconduct',
       'index', 'class', 'year'],
      dtype='object')

In [None]:
# Rows whose RetractionDOI already occurred earlier in the frame, and the
# distinct DOI values found on those rows.
dup = papers[papers.duplicated(subset=['RetractionDOI'])]
dup_rdois = list(dup.RetractionDOI.unique()) #unique dois

# Placeholder values that must never be treated as a duplicated DOI.
notremove = ['unavailable', np.nan]
# Missing values are excluded with pd.notna() rather than list membership:
# NaN != NaN, so `nan not in notremove` only works when the value is the very
# same object as np.nan. A NaN that slipped through would make isin() below
# match (and drop) every row without a RetractionDOI.
toremove = [duprdoi for duprdoi in dup_rdois if pd.notna(duprdoi) and duprdoi not in notremove]

# Drop every row (all occurrences) that carries one of the duplicated DOIs.
papers = papers[~papers.RetractionDOI.isin(toremove)]

In [None]:
# Number of rows left after removing duplicated retraction DOIs.
papers.shape[0]

4247

Check for duplicates (non-retracted)

In [None]:
# Inspect the column labels available for non-retracted papers.
non_retracted.keys()

Index(['Unnamed: 0', 'DOI_x', 'Content', 'Retracted', 'ID', 'Unnamed: 0.1',
       'Publication Type', 'Authors', 'Book Authors', 'Book Editors',
       'Book Group Authors', 'Author Full Names', 'Book Author Full Names',
       'Group Authors', 'Article Title', 'Source_Title', 'Book Series Title',
       'Book Series Subtitle', 'Language', 'Document Type', 'Conference Title',
       'Conference Date', 'Conference Location', 'Conference Sponsor',
       'Conference Host', 'Author Keywords', 'Keywords Plus', 'Abstract',
       'Addresses', 'Affiliations', 'Reprint Addresses', 'Email Addresses',
       'Researcher Ids', 'ORCIDs', 'Funding Orgs', 'Funding Name Preferred',
       'Funding Text', 'Cited References', 'Cited Reference Count',
       'Times Cited, WoS Core', 'Times Cited, All Databases',
       '180 Day Usage Count', 'Since 2013 Usage Count', 'Publisher',
       'Publisher City', 'Publisher Address', 'ISSN', 'eISSN', 'ISBN',
       'Journal Abbreviation', 'Journal ISO Abbrevia

In [None]:
# Rows whose 'DOI_y' value repeats an earlier row; an empty result means there
# are no duplicates.
repeated_doi = non_retracted.duplicated(subset='DOI_y')
non_retracted.loc[repeated_doi]

Unnamed: 0.2,Unnamed: 0,DOI_x,Content,Retracted,ID,Unnamed: 0.1,Publication Type,Authors,Book Authors,Book Editors,...,IDS Number,Pubmed Id,Open Access Designations,Highly Cited Status,Hot Paper Status,Date of Export,UT (Unique WOS ID),index,class,year


Original scraped content is kept as 'All content'

In [None]:
# Keep the current 'Content' text in a separate 'All content' column before
# 'Content' is modified further.
papers = papers.assign(**{'All content': papers['Content']})

Check / remove retraction notice

In [None]:
# Number of papers whose text contains a retraction notice (missing text counts as no match).
has_notice = papers['All content'].str.contains('Notice of Retraction', na=False)
int(has_notice.sum())

0

In [None]:
# Remove a retraction notice at the very start of the text, i.e. everything from
# "Notice of Retraction" up to and including the first following "ieee.org.".
# The quantifier is lazy (`.+?`): line breaks were replaced by spaces earlier,
# so a greedy `.+` would run to the LAST "ieee.org." in the document and delete
# part of the paper itself whenever that string occurs again later.
papers['Content'] = papers['Content'].str.replace(r'^Notice of Retraction.+?ieee\.org\.', '', regex=True)

Remove papers from non-retracted that are actually retracted

In [None]:
# Row count before filtering out non-retracted papers that mention a retraction.
len(papers.index)

4247

In [None]:
# Papers labelled as non-retracted ('nr').
is_non_retracted = papers['class'] == 'nr'
non_retracted = papers.loc[is_non_retracted]

# Non-retracted papers whose text mentions a retraction or withdrawal keyword;
# rows without text never match.
mentions_retraction = non_retracted['Content'].str.contains(
    'Retraction|retraction|Retracted|retracted|retract|Retract|withdrawn|Withdraw',
    na=False,
)
doi_toremove = non_retracted.loc[mentions_retraction, 'DOI_x']

# Drop every paper whose DOI_x is among the flagged ones.
papers = papers.loc[~papers['DOI_x'].isin(doi_toremove)]

In [None]:
# Row count after the retraction-keyword filter.
papers.shape[0]

4114

Splitting the content into 4 parts: 1) Abstract, 2) main content (Introduction-Results), 3) Discussion/Conclusion and 4) References. 

In [None]:
# Split each text on a references heading, once, and reuse the result for both
# columns. `n` and `expand` are passed by keyword: every argument of
# Series.str.split except `pat` is keyword-only in pandas >= 2.0, so the
# positional form raises TypeError there.
# With n=2 at most two splits are made: column 0 is the text before the first
# heading match, column 1 the text between the first and second match (or
# everything after the first match if there is only one). Column 1 is missing
# (NaN/None) when the text contains no heading at all.
reference_split = papers['Content'].str.split(
    r'References|R E F E R E N C E S|REFERENCES|r e f e r e n c e s|Reference List',
    n=2,
    expand=True,
)
papers['References'] = reference_split[1]
papers['Content'] = reference_split[0]

In [None]:
# First two texts after the references part has been split off.
papers['Content'].iloc[:2]

0    CORRECTION Correction: Spinal Cord Transection...
1    Correction Correction: Genome-Wide Comparative...
Name: Content, dtype: object

In [None]:
# Keep the text without references as 'Text' before 'Content' is split further.
papers = papers.assign(Text=papers['Content'])

In [None]:
# Split each text on a discussion/conclusion heading, once, and reuse the result
# for both columns. `n` and `expand` are passed by keyword: every argument of
# Series.str.split except `pat` is keyword-only in pandas >= 2.0, so the
# positional form raises TypeError there.
# Column 0 is the text before the first heading match; column 1 is the text
# between the first and second match (n=2 allows at most two splits) and is
# missing when no heading occurs.
discussion_split = papers['Content'].str.split(
    r'Discussion|D I S C U S S I O N|DISCUSSION|d i s c u s s i o n|Conclusion|C O N C L U S I O N|CONCLUSION|c o n c l u s i o n',
    n=2,
    expand=True,
)
papers['Discussion / Conclusion'] = discussion_split[1]
papers['Content'] = discussion_split[0]

In [None]:
# Split each remaining text on an introduction heading, once, and reuse the
# result for both columns. `n` and `expand` are passed by keyword: every
# argument of Series.str.split except `pat` is keyword-only in pandas >= 2.0,
# so the positional form raises TypeError there.
# Column 0 (before the first heading match) becomes 'Title + Abstract';
# column 1 (between the first and second match, since n=2) becomes
# 'Main content' and is missing when no heading occurs.
introduction_split = papers['Content'].str.split(
    r'Introduction|I N T R O D U C T I O N|INTRODUCTION|i n t r o d u c t i o n',
    n=2,
    expand=True,
)
papers['Main content'] = introduction_split[1]
papers['Title + Abstract'] = introduction_split[0]

# 'Content' is fully split up now. Reassign instead of dropping in place,
# because `papers` is itself the result of earlier row filtering.
papers = papers.drop(columns='Content')

Check for null values (NaN/None) in References: missing references indicate that the scraped text is not the original article but some other text relating to it. We therefore exclude rows with missing references.

In [None]:
# Report the dataset size and the number of missing values per extracted section.
print("Total papers:", len(papers))
for label, column in (
    ("Missing main content:", 'Main content'),
    ("Missing discussion/conclusion:", 'Discussion / Conclusion'),
    ("Missing references:", 'References'),
):
    print(label, papers[column].isnull().sum())

Total papers: 4114
Missing main content: 2111
Missing discussion/conclusion: 1143
Missing references: 855


In [None]:
# Papers for which no references part was extracted ...
missing_references = papers['References'].isna()
no_references = papers.loc[missing_references]
# ... are removed from the dataset, matched by their ID.
papers = papers.loc[~papers['ID'].isin(no_references['ID'])]

In [None]:
# Re-check size and missing sections after removing papers without references.
print("Total papers:", len(papers))
for label, column in (
    ("Missing main content:", 'Main content'),
    ("Missing discussion/conclusion:", 'Discussion / Conclusion'),
    ("Missing references:", 'References'),
):
    print(label, papers[column].isnull().sum())

Total papers: 3259
Missing main content: 1345
Missing discussion/conclusion: 553
Missing references: 0


Removal of papers which do not follow the typical research paper structure, i.e. papers that do not contain the word Introduction, or that contain neither Discussion nor Conclusion

In [None]:
# Papers for which no main content was extracted (no introduction heading found) ...
missing_main_content = papers['Main content'].isna()
no_introduction = papers.loc[missing_main_content]
# ... are removed from the dataset, matched by their ID.
papers = papers.loc[~papers['ID'].isin(no_introduction['ID'])]

In [None]:
# Re-check size and missing sections after removing papers without an introduction.
print("Total papers:", len(papers))
for label, column in (
    ("Missing main content:", 'Main content'),
    ("Missing discussion/conclusion:", 'Discussion / Conclusion'),
    ("Missing references:", 'References'),
):
    print(label, papers[column].isnull().sum())

Total papers: 1914
Missing main content: 0
Missing discussion/conclusion: 180
Missing references: 0


In [None]:
# Papers for which no discussion/conclusion part was extracted ...
missing_discussion = papers['Discussion / Conclusion'].isna()
no_discussion = papers.loc[missing_discussion]
# ... are removed from the dataset, matched by their ID.
papers = papers.loc[~papers['ID'].isin(no_discussion['ID'])]

In [None]:
# Final check: size and missing sections after all structure-based removals.
print("Total papers:", len(papers))
for label, column in (
    ("Missing main content:", 'Main content'),
    ("Missing discussion/conclusion:", 'Discussion / Conclusion'),
    ("Missing references:", 'References'),
):
    print(label, papers[column].isnull().sum())

Total papers: 1734
Missing main content: 0
Missing discussion/conclusion: 0
Missing references: 0


Check journals

In [None]:
# Journal names of retracted papers are in 'Journal', those of non-retracted
# papers in 'Source_Title'. Use 'Journal' where it is present and fall back to
# 'Source_Title' otherwise, so all groups share the single column 'Journal_Name'.
journal = papers['Journal']
papers['Journal_Name'] = journal.where(journal.notna(), papers['Source_Title'])

In [None]:
# Lower-case all journal names so they can be compared across groups.
papers = papers.assign(Journal_Name=papers['Journal_Name'].str.lower())

In [None]:
# Show every distinct journal name.
{journal_name for journal_name in papers['Journal_Name']}

{'acs applied materials & interfaces',
 'anaesthesia',
 'artificial cells nanomedicine and biotechnology',
 'artificial cells, nanomedicine, and biotechnology',
 'biochemical pharmacology',
 'biochemistry',
 'biomed research international',
 'blood',
 'brain research',
 'canadian journal of physics',
 'cancer biomarkers',
 'cancer gene therapy',
 'cancer letters',
 'cancer management and research',
 'cancer research',
 'cancer science',
 'carcinogenesis',
 'cell',
 'cell cycle',
 'cell death & disease',
 'cell metabolism',
 'cellular physiology and biochemistry',
 'chemical communications',
 'clinical cancer research',
 'clinical cancer research: an official journal of the american association for cancer research',
 'construction and building materials',
 'diabetes',
 'diagnostic pathology',
 'embo journal',
 'european journal of anaesthesiology',
 'evidence-based complementary and alternative medicine',
 'evidence-based complementary and alternative medicine (ecam)',
 'experimental an

Some journal names are written differently in retracted vs. non-retracted papers, so they are altered to be the same for all groups.
Further, extra non-alphanumeric characters (e.g. commas, apostrophes) and parenthesised abbreviations that are included in some journal names are removed.

In [None]:
# (variant spelling, canonical spelling) pairs. They are applied in this order
# as literal (non-regex) substring replacements on the journal names.
journal_name_fixes = [
    ('artificial cells, nanomedicine, and biotechnology', 'artificial cells nanomedicine and biotechnology'),
    ('clinical cancer research: an official journal of the american association for cancer research', 'clinical cancer research'),
    ('evidence-based complementary and alternative medicine (ecam)', 'evidence-based complementary and alternative medicine'),
    ('journal of bone and mineral research: the official journal of the american society for bone and mineral research', 'journal of bone and mineral research'),
    ('journal of controlled release: official journal of the controlled release society', 'journal of controlled release'),
    ('journal of neurochemistry (jnc)', 'journal of neurochemistry'),
    ('journal of the american college of cardiology (jacc)', 'journal of the american college of cardiology'),
    ('materials science and engineering a-structural materials properties microstructure and processing', 'materials science and engineering'),
    ('materials science and engineering: a', 'materials science and engineering'),
    ('molecular cancer research (mcr)', 'molecular cancer research'),
    ("naunyn-schmiedeberg's archives of pharmacology", 'naunyn-schmiedebergs archives of pharmacology'),
    ('spectrochimica acta part a, molecular and biomolecular spectroscopy', 'spectrochimica acta part a'),
    ('the embo journal', 'embo journal'),
    ('the journal of biological chemistry', 'journal of biological chemistry'),
    ('the journal of neuroscience : the official journal of the society for neuroscience', 'journal of neuroscience'),
    ('the scientific world journal', 'scientific world journal'),
    ('tumor biology (tumour biology) - official journal of the international society of oncology and biomarkers (isobm)', 'tumor biology'),
]

normalised_names = papers['Journal_Name']
for variant, canonical in journal_name_fixes:
    normalised_names = normalised_names.str.replace(variant, canonical, regex=False)
papers['Journal_Name'] = normalised_names

In [None]:
# Number of distinct journal names after normalisation.
distinct_journals = set(papers['Journal_Name'])
len(distinct_journals)

75

In [None]:
# Number of papers per journal, most frequent journal first.
papers['Journal_Name'].value_counts(sort=True, ascending=False)

plos one                                          252
molecular medicine reports                        115
journal of cellular biochemistry                  110
tumor biology                                      95
experimental and therapeutic medicine              74
                                                 ... 
journal of thoracic and cardiovascular surgery      1
science                                             1
diabetes                                            1
chemical communications                             1
journal of neurochemistry                           1
Name: Journal_Name, Length: 75, dtype: int64

Remove journals that are only present in one or two classes


In [None]:
# Count the papers for every (journal, class) combination that occurs.
papers_per_journal_class = papers.groupby(by=['Journal_Name', 'class']).size()

In [None]:
# Wrap the (unnamed) count Series in a one-column DataFrame.
papers_per_journal_class = pd.DataFrame(papers_per_journal_class)

In [None]:
# Pivot the 'class' index level into columns: one row per journal, one column
# per class; combinations without papers become NaN.
papers_per_journal_class = papers_per_journal_class.unstack('class')

In [None]:
# Display the journal x class table of paper counts; NaN marks a class without
# any paper in that journal.
papers_per_journal_class #check how many papers a journal contains from each class

Unnamed: 0_level_0,0,0,0
class,e,m,nr
Journal_Name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
acs applied materials & interfaces,2.0,7.0,31.0
anaesthesia,,,10.0
artificial cells nanomedicine and biotechnology,2.0,12.0,42.0
biochemical pharmacology,2.0,2.0,1.0
biochemistry,,1.0,3.0
...,...,...,...
scientific reports,,,2.0
scientific world journal,1.0,3.0,3.0
spectrochimica acta part a,1.0,7.0,
thin solid films,2.0,2.0,9.0


In [None]:
# Journals that have no papers in at least one class (a NaN anywhere in the row).
has_empty_class = papers_per_journal_class.isnull().any(axis='columns')
na_journals_df = papers_per_journal_class.loc[has_empty_class]
na_journals_df

Unnamed: 0_level_0,0,0,0
class,e,m,nr
Journal_Name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
anaesthesia,,,10.0
biochemistry,,1.0,3.0
cancer biomarkers,3.0,,10.0
cancer management and research,1.0,,15.0
cancer science,1.0,1.0,
carcinogenesis,3.0,5.0,
cell death & disease,,1.0,3.0
cellular physiology and biochemistry,,,12.0
chemical communications,,,1.0
clinical cancer research,1.0,,3.0


In [None]:
# Journals which contain no papers for one of the classes.
# NOTE(review): presumably transcribed by hand from na_journals_df.index;
# confirm the two still agree whenever the input data changes.
na_journals = [
    'anaesthesia',
    'biochemistry',
    'cancer biomarkers',
    'cancer management and research',
    'cancer science',
    'carcinogenesis',
    'cell death & disease',
    'cellular physiology and biochemistry',
    'chemical communications',
    'clinical cancer research',
    'diabetes',
    'diagnostic pathology',
    'european journal of anaesthesiology',
    'international journal of cancer',
    'international journal of cardiology',
    'international journal of hydrogen energy',
    'journal of materials chemistry a',
    'journal of neurochemistry',
    'journal of physics: conference series',
    'journal of the american college of cardiology',
    'journal of thoracic and cardiovascular surgery',
    'journal of virology',
    'microbiology',
    'molecular and cellular biology',
    'molecular cancer research',
    'molecular cell',
    'neuroscience letters',
    'nucleic acids research',
    'oncology reports',
    'procedia - social and behavioral sciences',
    'science',
    'scientific reports',
    'spectrochimica acta part a',
]

In [None]:
# Keep only papers from journals that are not listed in na_journals.
in_na_journal = papers['Journal_Name'].isin(na_journals)
papers = papers.loc[~in_na_journal]

Check size of classes

In [None]:
# Final number of papers in the dataset.
papers.shape[0]

1484

In [None]:
# Number of papers in the non-retracted class ('nr').
int((papers['class'] == 'nr').sum())

941

In [None]:
# Number of papers in the error class ('e').
int((papers['class'] == 'e').sum())

158

In [None]:
# Number of papers in the misconduct class ('m').
int((papers['class'] == 'm').sum())

385

In [None]:
# Number of distinct journals left in the final dataset.
remaining_journals = set(papers['Journal_Name'])
len(remaining_journals)

42

Define the important columns

In [None]:
# Column labels currently in the frame, shown before selecting the ones to keep.
papers.keys()

Index(['Unnamed: 0', 'DOI_x', 'Retracted', 'ID', 'Unnamed: 0.1',
       'Publication Type', 'Authors', 'Book Authors', 'Book Editors',
       'Book Group Authors',
       ...
       'Paywalled', 'Notes', 'misconduct', 'All content', 'References', 'Text',
       'Discussion / Conclusion', 'Main content', 'Title + Abstract',
       'Journal_Name'],
      dtype='object', length=107)

In [None]:
# Reduce the frame to the columns that are kept in the saved output.
important_columns = ['ID', 'All content', 'Text', 'Journal_Name', 'class', 'Retracted']
papers = papers.loc[:, important_columns]

Investigate the final dataframe

In [None]:
# IDs of the first ten remaining papers.
papers['ID'].head(10)

10    nr4472
11    nr4473
12    nr4474
16    nr4479
17    nr4480
18    nr4481
21    nr4486
22    nr4487
24    nr4613
26    nr4615
Name: ID, dtype: object

In [None]:
# Text of the first seven remaining papers.
papers['Text'].head(7)

10    RESEARCH ARTICLE Twins! Microsatellite analysi...
11    Is There ‘Anther-Anther Interference’ within a...
12    FORMAL COMMENT Areas of Agreement and Disagree...
16    RESEARCH ARTICLE Multi-start heuristic approac...
17    The Effect of Plant Tissue and Vaccine Formula...
18    RESEARCH ARTICLE One Giant Leap for Categorize...
21    RESEARCH ARTICLE Fluoride export (FEX) protein...
Name: Text, dtype: object

Save the output to CSV

In [None]:
# Write the pre-processed dataset to disk; the row index is written as the
# first, unnamed column.
papers.to_csv(path_or_buf='Data (CSV)/preprocessing_part_1.csv')