Connect to drive, set directory

In [None]:
# Mount Google Drive at /content/drive so the project files can be read and written.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd '/content/drive/MyDrive/Thesis_UU/3group'

/content/drive/MyDrive/Thesis_UU


Import libraries

In [None]:
import pandas as pd
from collections import Counter

Load Retraction Watch database

In [None]:
# Read the Retraction Watch export (path is relative to the working directory).
database_path = 'Data (Excel)/Retracted_Database.xlsx'
retracted_db = pd.read_excel(database_path)

In [None]:
# Give the ID column a name without a space so it can be used with attribute access.
retracted_db = retracted_db.rename(columns={"Record ID": "Record_ID"})

In [None]:
# Peek at the first five raw reason strings.
retracted_db['Reason'].head()

0         +Author Unresponsive;+Plagiarism of Article;
1    +Concerns/Issues About Data;+Error in Image;+I...
2                          +Objections by Third Party;
3    +Concerns/Issues About Data;+Concerns/Issues A...
4    +Concerns/Issues About Data;+Concerns/Issues A...
Name: Reason, dtype: object

In [None]:
# Number of records in the database.
retracted_db.shape[0]

33210

In [None]:
# Lowercase the retraction reasons so later substring matching is case-insensitive.
retracted_db['Reason'] = retracted_db['Reason'].str.lower()

In [None]:
# Spot check: full reason string of the record with index label 1.
retracted_db.loc[1, 'Reason']

'+concerns/issues about data;+error in image;+investigation by journal/publisher;+objections by third party;+unreliable results;'

In [None]:
# Spot check: full reason string of the record with index label 12.
retracted_db.loc[12, 'Reason']

'+falsification/fabrication of data;+investigation by company/institution;+misconduct - official investigation/finding;+upgrade/update of prior notice;'

In [None]:
# Spot check: first two records whose reasons mention an error or a duplication.
mentions_error_or_duplication = retracted_db['Reason'].str.contains('error|duplication of', na=False)
retracted_db.loc[mentions_error_or_duplication].head(2)

Unnamed: 0,Record_ID,Title,Subject,Institution,Journal,Publisher,Country,Author,URLS,ArticleType,RetractionDate,RetractionDOI,RetractionPubMedID,OriginalPaperDate,OriginalPaperDOI,OriginalPaperPubMedID,RetractionNature,Reason,Paywalled,Notes
1,36511,HO-1 overexpression alleviates senescence by i...,(BLS) Biology - Cancer;(BLS) Biology - Cellula...,"Department of Orthopaedic Surgery, The First A...",Journal of Cellular Physiology,Wiley,China,Weiwei Yi;Haiyang Lan;Yafeng Wen;Yiyang Wang;D...,,Research Article;,2022-04-14 00:00:00,10.1002/jcp.30749,35419813.0,2020-04-02 00:00:00,10.1002/jcp.29684,32239675.0,Retraction,+concerns/issues about data;+error in image;+i...,No,
9,36489,TNF-Î±â€“TNFR signal pathway inhibits autophag...,(BLS) Biology - Cellular;(BLS) Toxicology;(HSC...,"College of Public Health, North China Universi...",Journal of Cellular Physiology,Wiley,China,Qingzeng Qian;Xiangke Cao;Bin Wang;Yi Qu;Qingq...,,Research Article;,2022-04-13 00:00:00,10.1002/jcp.30737,35416280.0,2018-11-23 00:00:00,10.1002/jcp.27061,30467847.0,Retraction,+concerns/issues about image;+duplication of i...,No,see also: https://pubpeer.com/publications/CFD...


In [None]:
# Clean up column 'Reason': use ',' instead of ';' as the separator between reasons.
retracted_db['Reason'] = retracted_db['Reason'].str.replace(';', ',', regex=True)

In [None]:
# Clean up column 'Reason': drop the '+' marker that prefixes every reason.
# '+' is a regex quantifier, so it has to be replaced as a literal string
# (regex=False); as a regex it is invalid ("nothing to repeat") on pandas
# versions that no longer treat single-character patterns as literals.
retracted_db.Reason = retracted_db.Reason.str.replace('+', '', regex=False)

Splitting up reasons for retraction, as most papers have more than one reason for retraction listed

In [None]:
# Remove the trailing ',' separator at the end of each reason list.
# rstrip(',') removes only commas, so re-running this cell cannot cut off the
# last character of a real reason the way a blind [:-1] slice would.
# Values are still passed through str(), so the column contains only strings.
retracted_db.Reason = retracted_db.Reason.map(lambda reason: str(reason).rstrip(','))

In [None]:
# Collect every distinct reason that occurs anywhere in the 'Reason' column.
results = set()
for reasons_of_paper in retracted_db['Reason'].str.split(','):
    results.update(reasons_of_paper)
print(results)

{'complaints about third party', 'duplication of image', 'not presented at conference', 'rogue editor', 'notice - limited or no information', 'withdrawn (out of date)', 'concerns/issues about image', 'error in methods', 'miscommunication by third party', 'error in data', 'investigation by third party', 'ethical violations by third party', 'error in text', 'informed/patient consent - none/withdrawn', 'complaints about author', 'falsification/fabrication of data', 'objections by third party', 'plagiarism of data', 'concerns/issues about authorship', 'doing the right thing', 'error in cell lines/tissues', 'misconduct by author', 'error by journal/publisher', 'civil proceedings', 'taken from dissertation/thesis', 'notice - no/limited information', 'concerns/issues about results', 'contamination of materials (general)', 'notice - unable to access via current resources', 'retract and replace', 'results not reproducible', 'falsification/fabrication of results', 'contamination of reagents', 'd

In [None]:
# Display the set of all distinct reasons for retraction collected so far.
results #all reasons for retraction

{'author unresponsive',
 'bias issues or lack of balance',
 'breach of policy by author',
 'breach of policy by third party',
 'cites retracted work',
 'civil proceedings',
 'complaints about author',
 'complaints about company/institution',
 'complaints about third party',
 'concerns/issues about authorship',
 'concerns/issues about data',
 'concerns/issues about image',
 'concerns/issues about referencing/attributions',
 'concerns/issues about results',
 'concerns/issues about third party involvement',
 'conflict of interest',
 'contamination of cell lines/tissues',
 'contamination of materials (general)',
 'contamination of reagents',
 'copyright claims',
 'criminal proceedings',
 'date of retraction/other unknown',
 'doing the right thing',
 'duplicate publication through error by journal/publisher',
 'duplication of article',
 'duplication of data',
 'duplication of image',
 'duplication of text',
 'error by journal/publisher',
 'error by third party',
 'error in analyses',
 'erro

Check how often reasons appear in the database

In [None]:
# Count the number of appearances of every reason across the whole database.
results = Counter()
for reasons_of_paper in retracted_db['Reason'].str.lower().str.split(','):
    results.update(reasons_of_paper)
print(results)

Counter({'notice - limited or no information': 9644, 'investigation by journal/publisher': 6046, 'withdrawal': 3165, 'breach of policy by author': 2791, 'duplication of article': 2782, 'investigation by company/institution': 2316, 'concerns/issues about data': 2315, 'euphemisms for plagiarism': 2071, 'duplication of image': 2060, 'investigation by third party': 1980, 'date of retraction/other unknown': 1973, 'unreliable results': 1931, 'plagiarism of article': 1901, 'plagiarism of text': 1883, 'fake peer review': 1839, 'misconduct by author': 1609, 'notice - no/limited information': 1551, 'error in data': 1439, 'falsification/fabrication of data': 1433, 'misconduct - official investigation/finding': 1295, 'error in results and/or conclusions': 1136, 'paper mill': 1069, 'error in analyses': 976, 'concerns/issues about referencing/attributions': 940, 'upgrade/update of prior notice': 907, 'randomly generated content': 903, 'error in methods': 868, 'manipulation of images': 845, 'retract 

In [None]:
# List the 50 most frequent reasons and how often each one appears.
# Each reason list is split and exploded into one row per reason, then counted
# once; this yields integer counts and avoids building a value_counts Series
# per paper with the deprecated top-level pd.value_counts.
retracted_db['Reason'].str.split(',').explode().value_counts().head(50)

notice - limited or no information                          9644.0
investigation by journal/publisher                          6046.0
withdrawal                                                  3165.0
breach of policy by author                                  2791.0
duplication of article                                      2782.0
investigation by company/institution                        2316.0
concerns/issues about data                                  2315.0
euphemisms for plagiarism                                   2071.0
duplication of image                                        2060.0
investigation by third party                                1980.0
date of retraction/other unknown                            1973.0
unreliable results                                          1931.0
plagiarism of article                                       1901.0
plagiarism of text                                          1883.0
fake peer review                                            18

In [None]:
# Look at the cleaned reason strings of the last ten records.
retracted_db['Reason'].tail(10)

33200      notice - unable to access via current resources
33201                             results not reproducible
33202    error in data,error in results and/or conclusions
33203    falsification/fabrication of data,plagiarism o...
33204    error in results and/or conclusions,results no...
33205    error in results and/or conclusions,unreliable...
33206                                     copyright claims
33207                                     copyright claims
33208                                    error in analyses
33209                                        error in text
Name: Reason, dtype: object

## Create the 2 groups: error and misconduct

In [None]:
# Error subset: every paper with at least one reason that includes the term 'error'.
has_error_reason = retracted_db['Reason'].str.contains('error', na=False)
error = retracted_db.loc[has_error_reason]

In [None]:
# Size of the error subset (before overlap removal).
error.shape[0]

4400

In [None]:
# First five reason strings in the error subset.
error['Reason'].head()

1     concerns/issues about data,error in image,inve...
24    duplication of image,error in results and/or c...
40    error in data,error in methods,unreliable results
42    error in analyses,error in methods,retract and...
44    error in analyses,error in methods,error in re...
Name: Reason, dtype: object

In [None]:
# Top 30 most frequent reasons for retraction within the error group.
# Split/explode/value_counts counts every reason once and returns integer
# counts, without creating one value_counts Series per paper through the
# deprecated top-level pd.value_counts.
error['Reason'].str.split(',').explode().value_counts().head(30)

error in data                                               1439.0
error in results and/or conclusions                         1136.0
error in analyses                                            976.0
error in methods                                             868.0
unreliable results                                           706.0
duplicate publication through error by journal/publisher     531.0
error in image                                               522.0
error in text                                                515.0
retract and replace                                          427.0
error by journal/publisher                                   382.0
concerns/issues about data                                   267.0
investigation by journal/publisher                           245.0
unreliable data                                              240.0
results not reproducible                                     222.0
withdrawal                                                   1

In [None]:
# Misconduct subset: every paper with at least one reason matching the terms selected for the misconduct group.
has_misconduct_reason = retracted_db['Reason'].str.contains(
    'fake|false|falsification|hoax|manipulation|misconduct by|paper mill|plagiarism of|randomly generated content|sabotage|salami slicing',
    na=False,
)
misconduct = retracted_db.loc[has_misconduct_reason]

In [None]:
# Size of the misconduct subset (before overlap removal).
misconduct.shape[0]

10000

In [None]:
# First five reason strings in the misconduct subset.
misconduct['Reason'].head(5)

0             author unresponsive,plagiarism of article
12    falsification/fabrication of data,investigatio...
15    ethical violations by author,false affiliation...
21                 plagiarism of image,unreliable image
27    concerns/issues about data,duplication of imag...
Name: Reason, dtype: object

In [None]:
# Top 30 most frequent reasons for retraction within the misconduct group.
# Split/explode/value_counts counts every reason once and returns integer
# counts, without creating one value_counts Series per paper through the
# deprecated top-level pd.value_counts.
misconduct['Reason'].str.split(',').explode().value_counts().head(30)

investigation by journal/publisher                2541.0
euphemisms for plagiarism                         1979.0
plagiarism of article                             1901.0
plagiarism of text                                1883.0
fake peer review                                  1839.0
investigation by company/institution              1783.0
investigation by third party                      1723.0
misconduct by author                              1609.0
falsification/fabrication of data                 1433.0
misconduct - official investigation/finding       1268.0
paper mill                                        1069.0
duplication of image                              1029.0
randomly generated content                         903.0
manipulation of images                             845.0
concerns/issues about data                         831.0
concerns/issues about referencing/attributions     648.0
unreliable results                                 641.0
false/forged authorship        

Remove overlapping papers, i.e. papers whose reasons for retraction match both the error and the misconduct criteria

In [None]:
# Papers present in both subsets: inner join on Record_ID
# (columns of misconduct get the suffix _x, columns of error the suffix _y).
overlap = misconduct.merge(error, how='inner', on='Record_ID')

In [None]:
# Number of rows in the overlap between the two subsets.
overlap.shape[0]

384

In [None]:
# Inspect the first two overlapping papers.
overlap.iloc[:2]

Unnamed: 0,Record_ID,Title_x,Subject_x,Institution_x,Journal_x,Publisher_x,Country_x,Author_x,URLS_x,ArticleType_x,...,RetractionDate_y,RetractionDOI_y,RetractionPubMedID_y,OriginalPaperDate_y,OriginalPaperDOI_y,OriginalPaperPubMedID_y,RetractionNature_y,Reason_y,Paywalled_y,Notes_y
0,35598,Hypoxia-induced apoptosis of cardiomyocytes is...,(BLS) Biochemistry;(BLS) Biology - Cellular;(B...,"Department of Cardiology, Shaanxi Traditional ...",Cell Cycle,Taylor and Francis,China,Dezhi Ren;Fang Li;An Gao;Qingwen Cao;Yarong Li...,,Research Article;,...,2022-02-22 00:00:00,10.1080/15384101.2021.2014708,35191821.0,2020-04-16 00:00:00,10.1080/15384101.2020.1731651,32295500.0,Retraction,"error in data,investigation by third party,pap...",No,see also: https://pubpeer.com/publications/61F...
1,34872,Oxymatrine synergistically enhances the inhibi...,(BLS) Biology - Cancer;(BLS) Biology - Cellula...,"Department of General Surgery, Wujiang No. 1 P...",Tumor Biology (Tumour Biology) - Official Jour...,IOS Press,China,Yan Liu;Tingting Bi;Wei Dai;Gang Wang;Liqiang ...,,Research Article;,...,2021-12-21 00:00:00,10.3233/TUB-219010,34957978.0,2015-12-18 00:00:00,10.1007/s13277-015-4642-1,26687645.0,Retraction,"concerns/issues about data,duplication of imag...",No,see also: https://pubpeer.com/publications/0D9...


In [None]:
# Reason strings (taken from the misconduct side of the merge) of the first three overlapping papers.
overlap['Reason_x'].head(3)

0    error in data,investigation by third party,pap...
1    concerns/issues about data,duplication of imag...
2    error in data,falsification/fabrication of dat...
Name: Reason_x, dtype: object

In [None]:
# Remove the overlap from the error subset: keep only papers whose Record_ID is not in overlap.
error_in_overlap = error['Record_ID'].isin(overlap['Record_ID'])
error = error.loc[~error_in_overlap]

In [None]:
# Size of the error group.
error.shape[0]

4016

In [None]:
# Remove the overlap from the misconduct subset: keep only papers whose Record_ID is not in overlap.
misconduct_in_overlap = misconduct['Record_ID'].isin(overlap['Record_ID'])
misconduct = misconduct.loc[~misconduct_in_overlap]

In [None]:
# Size of the misconduct group.
misconduct.shape[0]

9616

In [None]:
# Create the column indicating the group: 0 = error.
# assign() returns a new DataFrame instead of writing a column into a frame
# obtained by boolean filtering, which avoids pandas' SettingWithCopyWarning.
error = error.assign(misconduct=0)

In [None]:
# Inspect the first two rows of the error group.
error.iloc[:2]

Unnamed: 0,Record_ID,Title,Subject,Institution,Journal,Publisher,Country,Author,URLS,ArticleType,...,RetractionDOI,RetractionPubMedID,OriginalPaperDate,OriginalPaperDOI,OriginalPaperPubMedID,RetractionNature,Reason,Paywalled,Notes,misconduct
1,36511,HO-1 overexpression alleviates senescence by i...,(BLS) Biology - Cancer;(BLS) Biology - Cellula...,"Department of Orthopaedic Surgery, The First A...",Journal of Cellular Physiology,Wiley,China,Weiwei Yi;Haiyang Lan;Yafeng Wen;Yiyang Wang;D...,,Research Article;,...,10.1002/jcp.30749,35419813.0,2020-04-02 00:00:00,10.1002/jcp.29684,32239675.0,Retraction,"concerns/issues about data,error in image,inve...",No,,0
24,36457,HMGB1 Facilitated Macrophage Reprogramming tow...,(BLS) Biochemistry;(BLS) Biology - Cellular;,"Department of Immunology, Jiangsu University, ...",Scientific Reports,Springer - Nature Publishing Group,China,Zhaoliang Su;Pan Zhang;Ying Yu;Hongxiang Lu;Ya...,,Research Article;,...,10.1038/s41598-022-10210-2,35396497.0,2016-02-22 00:00:00,10.1038/srep21884,26899795.0,Retraction,"duplication of image,error in results and/or c...",No,see also: https://pubpeer.com/publications/7E1...,0


In [None]:
# Create the column indicating the group: 1 = misconduct.
# assign() returns a new DataFrame instead of writing a column into a frame
# obtained by boolean filtering, which avoids pandas' SettingWithCopyWarning.
misconduct = misconduct.assign(misconduct=1)

In [None]:
# Inspect the first two rows of the misconduct group.
misconduct.iloc[:2]

Unnamed: 0,Record_ID,Title,Subject,Institution,Journal,Publisher,Country,Author,URLS,ArticleType,...,RetractionDOI,RetractionPubMedID,OriginalPaperDate,OriginalPaperDOI,OriginalPaperPubMedID,RetractionNature,Reason,Paywalled,Notes,misconduct
0,36468,CSTM: Cluster-based Security Trust Mechanism f...,(B/T) Computer Science;(B/T) Transportation;(B...,"Computer and Science College, National Univers...",IOP Conference Series: Materials Science and E...,IOP Publishing,China,Jijin Wang;Xiaoqiang Xiao;Lu Peng,,Conference Abstract/Paper;,...,10.1088/1757-899X/688/4/044077,0.0,2019-12-06 00:00:00,10.1088/1757-899X/688/4/044051,0.0,Retraction,"author unresponsive,plagiarism of article",No,,1
12,36485,5-Aza-Deoxycytidine Induces Selective Degradat...,(BLS) Anatomy/Physiology;(BLS) Biology - Cellu...,Department of Molecular and Cellular Biochemis...,Molecular and Cellular Biology,American Society for Microbiology,United States,Kalpana Ghoshal;Jharna Datta;Sarmila Majumder;...,http://retractionwatch.com/2018/08/17/cancer-r...,Research Article;,...,10.1128/mcb.00546-21,0.0,2005-06-01 00:00:00,10.1128/MCB.25.11.4727-4741.2005,15899874.0,Retraction,"falsification/fabrication of data,investigatio...",No,,1


Save files

In [None]:
# Write the error group to CSV (the DataFrame index is written as the first column).
error.to_csv(path_or_buf='Data (CSV)/error_db.csv')

In [None]:
# Write the misconduct group to CSV (the DataFrame index is written as the first column).
misconduct.to_csv(path_or_buf='Data (CSV)/misconduct_db.csv')