## Pre-processing, cleaning dataset

Connect to drive, set directory

In [None]:
# Mount Google Drive inside the Colab runtime; the working directory used by the
# following cell lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Switch to the project folder so the relative 'Data (CSV)/...' paths below resolve.
# Written as an explicit line magic: a bare `cd` only works while IPython's
# automagic setting is enabled.
%cd '/content/drive/MyDrive/Thesis_UU/push/2group/'

Import Libraries

In [None]:
import pandas as pd
import numpy as np
import re
import sys

Load csv files of retracted and non-retracted articles

In [None]:
# Load the non-retracted articles; "utf-8-sig" strips a leading byte-order mark if present.
non_retracted = pd.read_csv('Data (CSV)/non_retracted_11_journals_w_meta.csv', encoding="utf-8-sig")

In [None]:
# Load the retracted articles; "utf-8-sig" strips a leading byte-order mark if present.
retracted = pd.read_csv('Data (CSV)/retracted_11_journals_w_meta.csv', encoding="utf-8-sig")

Some words were split across two lines with a hyphen followed by a line break. To restore the original words, every `-\n` is removed.

In [None]:
# Rejoin words broken over two lines: delete each hyphen that is immediately
# followed by a line break, in every column of the frame.
non_retracted = non_retracted.replace(to_replace='-\n', value='', regex=True)

In [None]:
# Rejoin words broken over two lines: delete each hyphen that is immediately
# followed by a line break, in every column of the frame.
retracted = retracted.replace(to_replace='-\n', value='', regex=True)

Replace the remaining `\n` with a space for both retracted and non-retracted papers

In [None]:
# Turn every remaining line break into a single space, in every column.
non_retracted = non_retracted.replace(to_replace='\n', value=' ', regex=True)

In [None]:
# Display the column labels of the non-retracted frame.
non_retracted.columns

In [None]:
# Give the journal column an identifier-safe name so it can be accessed as an attribute.
non_retracted = non_retracted.rename(columns={'Source Title': 'Source_Title'})

In [None]:
# Peek at the first two journal titles to confirm the rename worked.
non_retracted['Source_Title'].head(2)

In [None]:
# Turn every remaining line break into a single space, in every column.
retracted = retracted.replace(to_replace='\n', value=' ', regex=True)

In [None]:
# Display the column labels of the retracted frame.
retracted.columns

Concat to one dataframe

In [None]:
# Stack both groups row-wise into one frame; ignore_index=True renumbers the rows
# 0..n-1 so index labels from the two source frames cannot collide.
papers = pd.concat([non_retracted, retracted], ignore_index=True)

In [None]:
# Number of rows in the combined frame.
len(papers)

In [None]:
# Preview the first two rows of the combined frame.
papers.head(2)

In [None]:
# Column labels of the combined frame (union of the columns of both source frames).
papers.columns

Remove rows with missing content

In [None]:
# Drop every paper whose scraped 'Content' is missing.
# The row filter has to be applied to the DataFrame itself: calling
# dropna(inplace=True) on papers['Content'] only shortens a temporary Series
# and leaves all rows of `papers` in place.
papers = papers.dropna(subset=['Content'])

Check for duplicates (retracted)

In [None]:
# Column labels of the retracted frame, for reference before the duplicate check
# on 'RetractionDOI' in the next cell.
retracted.columns

In [None]:
# Rows whose RetractionDOI repeats an earlier row, and the distinct DOIs involved.
dup = papers[papers.duplicated(subset=['RetractionDOI'])]
dup_rdois = list(dup.RetractionDOI.unique()) #unique dois

# 'unavailable' is a placeholder shared by unrelated papers, so it is not a real duplicate.
notremove = ['unavailable']
# Missing DOIs are excluded with pd.isna rather than `not in [..., np.nan]`:
# NaN never compares equal to itself, so the list-membership test only works when
# the value is the very same object as np.nan. If a NaN slipped into `toremove`,
# isin() below would drop every paper with a missing RetractionDOI.
toremove = [
    duprdoi
    for duprdoi in dup_rdois
    if not pd.isna(duprdoi) and duprdoi not in notremove
]

# Removes every paper (all copies) whose RetractionDOI occurs more than once.
papers = papers[~papers.RetractionDOI.isin(toremove)]

In [None]:
# Row count after removing papers with duplicated retraction DOIs.
len(papers)

Check for duplicates (non-retracted)

In [None]:
# Column labels of the non-retracted frame, for reference before the duplicate
# check on 'DOI_y' in the next cell.
non_retracted.columns

In [None]:
# Show rows of the non-retracted frame whose 'DOI_y' repeats an earlier row.
# Author's note from the original run: no duplicate DOIs were found.
non_retracted[non_retracted.duplicated(subset=['DOI_y'])] #no duplicate dois

Original scraped content is kept as 'All content'

In [None]:
# Keep an untouched copy of the scraped text: 'Content' is modified and split in
# the following cells, while 'All content' stays as scraped.
papers['All content'] = papers['Content']

Check / remove retraction notice

In [None]:
# amount of papers containing a retraction notice
# (rows with missing text compare unequal to True and are therefore not counted)
has_notice = papers['All content'].str.contains('Notice of Retraction') == True
int(has_notice.sum())

In [None]:
# Strip a leading retraction notice: everything from "Notice of Retraction" at the
# very start of the text up to the first following "ieee.org.".
# The quantifier is non-greedy (.+?) on purpose: line breaks were replaced by spaces
# earlier, so a greedy .+ would run to the LAST "ieee.org." in the paper and could
# delete most of the article body along with the notice.
papers['Content'] = papers['Content'].str.replace(r'^Notice of Retraction.+?ieee\.org\.', '', regex=True)

Remove papers from non-retracted that are actually retracted

In [None]:
# Row count before removing non-retracted papers that mention a retraction.
len(papers)

In [None]:
# Words whose presence suggests that a paper labelled non-retracted was in fact
# retracted or withdrawn (matched as a case-sensitive regular expression).
retraction_terms = 'Retraction|retraction|Retracted|retracted|retract|Retract|withdrawn|Withdraw'

non_retracted = papers[papers['Retracted'] == 0]

# Flag non-retracted rows whose content contains any of these words.
# na=False: a row without content cannot mention a retraction.
is_mislabelled = (papers['Retracted'] == 0) & papers['Content'].str.contains(retraction_terms, na=False)
doi_toremove = papers.loc[is_mislabelled, 'DOI_x']

# Filter on the row mask itself instead of looking the rows up again by DOI_x:
# a DOI lookup with isin() would also remove any other row that shares the DOI,
# and a flagged row with a missing DOI would take every row with a missing DOI_x with it.
papers = papers[~is_mislabelled]

In [None]:
# Row count after removing non-retracted papers that mention a retraction.
len(papers)

Splitting the content into 4 parts: 1) Abstract, 2) main content (Introduction-Results), 3) Discussion/Conclusion and 4) References. 

In [None]:
# Cut each paper at its reference heading.
# With n=2 the text is split at most twice: piece 0 is everything before the first
# heading, piece 1 the text between the first and the second heading (or up to the
# end if there is only one). Papers without any heading get a missing value in piece 1.
# `n` is passed by keyword because pandas 2.x rejects it as a positional argument;
# the split is computed once and reused for both columns.
references_heading = r'References|R E F E R E N C E S|REFERENCES|r e f e r e n c e s|Reference List'
reference_split = papers['Content'].str.split(references_heading, n=2, expand=True)

papers['References'] = reference_split[1]
papers['Content'] = reference_split[0]

In [None]:
# Preview the remaining content of the first two papers after the reference split.
papers['Content'].head(2)

In [None]:
# 'Text' keeps the content as it is at this point: references already split off,
# but not yet divided into abstract / main content / discussion by the next cells.
papers['Text'] = papers['Content']

In [None]:
# Cut each paper at its first Discussion/Conclusion heading.
# With n=2 the text is split at most twice: piece 0 is everything before the first
# heading, piece 1 the text between the first and the second heading (or up to the
# end if there is only one). Papers without such a heading get a missing value in piece 1.
# `n` is passed by keyword because pandas 2.x rejects it as a positional argument;
# the split is computed once and reused for both columns.
discussion_heading = r'Discussion|D I S C U S S I O N|DISCUSSION|d i s c u s s i o n|Conclusion|C O N C L U S I O N|CONCLUSION|c o n c l u s i o n'
discussion_split = papers['Content'].str.split(discussion_heading, n=2, expand=True)

papers['Discussion / Conclusion'] = discussion_split[1]
papers['Content'] = discussion_split[0]

In [None]:
# Cut what is left of 'Content' at the Introduction heading.
# With n=2 the text is split at most twice: piece 0 (title + abstract) is everything
# before the first heading, piece 1 (main content) the text between the first and the
# second heading (or up to the end if there is only one). Papers without an
# Introduction heading get a missing value for 'Main content'.
# `n` is passed by keyword because pandas 2.x rejects it as a positional argument;
# the split is computed once and reused for both columns.
introduction_heading = r'Introduction|I N T R O D U C T I O N|INTRODUCTION|i n t r o d u c t i o n'
introduction_split = papers['Content'].str.split(introduction_heading, n=2, expand=True)

papers['Main content'] = introduction_split[1]
papers['Title + Abstract'] = introduction_split[0]

# 'Content' has been fully distributed over the section columns and is no longer needed.
papers = papers.drop(columns='Content')

Check for null values (NaN/None) in References: missing references indicate that the scraped text is not the original article but some other text relating to it. We therefore exclude the rows with missing references.

In [None]:
# Report the dataset size and, per extracted section, how many papers lack it.
print("Total papers:", len(papers))
for label, column in (
    ("Missing main content:", 'Main content'),
    ("Missing discussion/conclusion:", 'Discussion / Conclusion'),
    ("Missing references:", 'References'),
):
    print(label, papers[column].isnull().sum())

In [None]:
# Papers for which no reference section was found.
no_references = papers.loc[papers['References'].isna()]
# remove papers without reference: drop every row whose ID belongs to such a paper
lacks_references = papers['ID'].isin(no_references['ID'])
papers = papers.loc[~lacks_references]

In [None]:
# Same summary again, now after papers without references were removed.
print("Total papers:", len(papers))
for label, column in (
    ("Missing main content:", 'Main content'),
    ("Missing discussion/conclusion:", 'Discussion / Conclusion'),
    ("Missing references:", 'References'),
):
    print(label, papers[column].isnull().sum())

Removal of papers which do not follow the typical research paper structure, i.e. do not contain the words Introduction and Discussion or Conclusion

In [None]:
# Papers in which no Introduction heading was found ('Main content' is missing).
no_introduction = papers.loc[papers['Main content'].isna()]
# remove papers without introduction: drop every row whose ID belongs to such a paper
lacks_introduction = papers['ID'].isin(no_introduction['ID'])
papers = papers.loc[~lacks_introduction]

In [None]:
# Same summary again, now after papers without an introduction were removed.
print("Total papers:", len(papers))
for label, column in (
    ("Missing main content:", 'Main content'),
    ("Missing discussion/conclusion:", 'Discussion / Conclusion'),
    ("Missing references:", 'References'),
):
    print(label, papers[column].isnull().sum())

In [None]:
# Papers in which no Discussion/Conclusion heading was found.
no_discussion = papers.loc[papers['Discussion / Conclusion'].isna()]
# remove papers without discussion/conclusion: drop every row whose ID belongs to such a paper
lacks_discussion = papers['ID'].isin(no_discussion['ID'])
papers = papers.loc[~lacks_discussion]

In [None]:
# Same summary again, now after papers without a discussion/conclusion were removed.
print("Total papers:", len(papers))
for label, column in (
    ("Missing main content:", 'Main content'),
    ("Missing discussion/conclusion:", 'Discussion / Conclusion'),
    ("Missing references:", 'References'),
):
    print(label, papers[column].isnull().sum())

Check journals

In [None]:
# Build one journal column for both groups: take 'Journal' and, where it is missing,
# fall back to the value of 'Source_Title' in the same row.
papers['Journal_Name'] = papers['Journal'].fillna(papers['Source_Title'])
# Before this step, journal names of retracted papers are in the column Journal and
# those of non-retracted papers in Source_Title;
# this code puts the journal names of all groups in the column Journal_Name.

In [None]:
# Lowercase the journal names so the same journal written with different
# capitalisation is treated as one value.
papers['Journal_Name']  = papers['Journal_Name'].str.lower() #lowercase journal names

In [None]:
set(papers['Journal_Name']) # display the distinct journal names

In [None]:
# Number of distinct journal names.
len(set(papers['Journal_Name']))

In [None]:
papers['Journal_Name'].value_counts() # number of papers per journal name, most frequent first

Removal of journals that do not contain both groups


In [None]:
# Count the papers for every (journal, Retracted flag) combination that occurs;
# the result is a Series indexed by both keys.
papers_per_journal_class = papers.groupby(['Journal_Name', 'Retracted']).size()

In [None]:
# Turn the Series of counts into a one-column DataFrame.
papers_per_journal_class = papers_per_journal_class.to_frame()

In [None]:
# Move the 'Retracted' index level into the columns: one row per journal, one
# column per class. A journal without papers in a class gets NaN in that column.
papers_per_journal_class = papers_per_journal_class.unstack(level='Retracted')

In [None]:
papers_per_journal_class # display how many papers each journal has in each class

In [None]:
# A NaN count in any class column means the journal has no papers of that class.
lacks_a_class = papers_per_journal_class.isna().any(axis=1)
na_journals_df = papers_per_journal_class.loc[lacks_a_class]
na_journals_df #journals which contain no papers for one of the classes

In [None]:
# Journals that lack papers in one of the two classes, taken directly from the
# table computed in the previous cell so the list cannot go stale when the data
# changes. For the data this notebook was written against, these were
# 'international journal of electrical engineering education' and
# 'journal of clinical anesthesia'.
na_journals = na_journals_df.index.tolist()

In [None]:
# remove journals which contain no papers for one of the classes
in_incomplete_journal = papers['Journal_Name'].isin(na_journals)
papers = papers.loc[~in_incomplete_journal]

Check size of groups

In [None]:
len(papers) # size of the dataset after all filtering steps above

In [None]:
# size of non-retracted class (number of rows with Retracted == 0)
int((papers['Retracted'] == 0).sum())

In [None]:
# size of retracted class (number of rows with Retracted == 1)
int((papers['Retracted'] == 1).sum())

In [None]:
len(set(papers.Journal_Name)) # number of distinct journals left in the dataset

Define the important columns

In [None]:
# List all columns before narrowing the frame down to the ones that are kept.
papers.columns

In [None]:
# Columns carried over to the output file; everything else is dropped here.
important_columns = ['ID','All content', 'Text', 'Journal_Name','Retracted']
papers = papers[important_columns]

Investigate the final dataframe

In [None]:
# First ten IDs of the final frame.
papers['ID'].head(10)

In [None]:
# First seven texts of the final frame.
papers['Text'].head(7)

Save the output to CSV

In [None]:
# Write the cleaned frame to disk. No `index` argument is given, so pandas also
# writes the row index as an unnamed first column.
# NOTE(review): whichever code reads this file presumably has to account for that
# extra column — confirm against the consumer.
papers.to_csv('Data (CSV)/preprocessing_part_1_common.csv')