## Pre-processing using spaCy and regex, splitting data into train/test data and external validation data

Connect to drive, set directory

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime so the project files are reachable.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
cd '/content/drive/MyDrive/Thesis_UU/push/3group'

/content/drive/MyDrive/Thesis_UU/Common_20_05_2022/Code


Import Libraries

In [None]:
import pandas as pd
import re

Load csv file

In [None]:
# Read the article table; the "utf-8-sig" codec skips a leading byte-order mark if the file has one.
papers = pd.read_csv(
    'Data (CSV)/preprocessing_part_1.csv',
    encoding="utf-8-sig",
)

In [None]:
# Peek at the last five journal names to confirm the file loaded.
papers['Journal_Name'].tail(5)

1479                                 plos one
1480      construction and building materials
1481      construction and building materials
1482    experimental and therapeutic medicine
1483                                 plos one
Name: Journal_Name, dtype: object

#### Preprocessing

Retracted word removal

In [None]:
# Delete every listed spelling of "retract*"/"withdraw*" from the article text.
# Longer variants come before their prefixes (e.g. 'Retraction' before 'Retract')
# so that the regex alternation removes the whole word rather than just its stem.
# NOTE: matching is substring-based and case-sensitive to exactly these variants.
retraction_pattern = (
    'RETRACTION|R E T R A C T I O N|Retraction|retraction|'
    'Retracted|retracted|RETRACTED|R E T R A C T E D|Retract|retract|'
    'WITHDRAWN|W I T H D R A W N|Withdrawn|withdrawn|'
    'WITHDRAW|W I T H D R A W|Withdraw|withdraw'
)
papers['Text'] = papers['Text'].str.replace(retraction_pattern, '', regex=True)

Number removal

In [None]:
# Drop every digit character from the article text.
digit_pattern = r'\d'
papers['Text'] = papers['Text'].str.replace(digit_pattern, '', regex=True)

Proper noun, Space, Stop word, Punctuation removal + lowercasing & lemmatization

In [None]:
import spacy

# Small English pipeline; NER and the dependency parser are switched off in the
# pipe call because only token-level attributes are read afterwards.
nlp = spacy.load("en_core_web_sm")
texts = papers['Text'].tolist()
processed_texts = list(nlp.pipe(texts, disable=["ner", "parser"]))

In [None]:
# One list of lowercased lemmas per document, skipping stop words, punctuation,
# proper nouns and whitespace tokens.
preprocessed_texts = [
    [
        token.lemma_.lower()
        for token in doc
        if not (token.is_stop or token.is_punct or token.pos_ == "PROPN" or token.is_space)
    ]
    for doc in processed_texts
]

In [None]:
# Attach the per-document token lists as a new 'preprocessed' column.
papers = papers.assign(preprocessed=preprocessed_texts)

In [None]:
# Inspect the token lists of the first two documents.
papers.loc[:, 'preprocessed'].head(2)

0    [microsatellite, analysis, embryo, egg, case, ...
1    [evidence, pollinate, selective, pressure, imp...
Name: preprocessed, dtype: object

In [None]:
# Characters removed from every token: comma, apostrophe and square brackets.
_LIST_ARTIFACTS = str.maketrans('', '', ",'[]")


def _tokens_to_text(tokens):
    """Turn one document's token list into a single space-separated string.

    Tokens are joined directly instead of going through the list's repr, so no
    repr artefacts (double quotes around tokens that contain an apostrophe,
    doubled backslashes) can leak into the text. Commas, apostrophes and square
    brackets are stripped from each token.

    A value that is already a string is returned unchanged, which makes the
    cell safe to re-run.
    """
    if isinstance(tokens, str):
        return tokens
    return ' '.join(token.translate(_LIST_ARTIFACTS) for token in tokens)


papers['preprocessed'] = papers['preprocessed'].map(_tokens_to_text)

In [None]:
# Confirm the column now holds plain space-separated strings.
papers.loc[:, 'preprocessed'].head(2)

0    microsatellite analysis embryo egg case ovipar...
1    evidence pollinate selective pressure impose m...
Name: preprocessed, dtype: object

In [None]:
# Show the first two rows with all columns, including the new 'preprocessed' column.
papers.head(2)

Unnamed: 0.1,Unnamed: 0,ID,All content,Text,Journal_Name,class,Retracted,preprocessed
0,10,nr4472,RESEARCH ARTICLE Twins! Microsatellite analysi...,RESEARCH ARTICLE Twins! Microsatellite analysi...,plos one,nr,0,microsatellite analysis embryo egg case ovipar...
1,11,nr4473,Is There ‘Anther-Anther Interference’ within a...,Is There ‘Anther-Anther Interference’ within a...,plos one,nr,0,evidence pollinate selective pressure impose m...


In [None]:
# List the column labels of the working frame.
papers.columns

Index(['Unnamed: 0', 'ID', 'All content', 'Text', 'Journal_Name', 'class',
       'Retracted', 'preprocessed'],
      dtype='object')

#### Check group sizes

In [None]:
# Total number of articles (rows).
papers.shape[0]

1484

In [None]:
# Number of non-retracted articles (class 'nr').
int((papers['class'] == 'nr').sum())

941

In [None]:
# Number of articles retracted due to error (class 'e').
int((papers['class'] == 'e').sum())

158

In [None]:
# Number of articles retracted due to misconduct (class 'm').
int((papers['class'] == 'm').sum())

385

In [None]:
# How many articles belong to each journal, most frequent first
# (descending order is the value_counts default).
papers.Journal_Name.value_counts()

plos one                                                 252
molecular medicine reports                               115
journal of cellular biochemistry                         110
tumor biology                                             95
experimental and therapeutic medicine                     74
cancer research                                           74
artificial cells nanomedicine and biotechnology           56
cell                                                      51
construction and building materials                       41
acs applied materials & interfaces                        40
journal of cellular physiology                            40
cell metabolism                                           32
life sciences                                             32
mathematical problems in engineering                      28
cancer letters                                            28
canadian journal of physics                               26
rsc advances            

#### Creating datasets for training/testing and external validation

Note: whole journals are held out as an external validation set so that we can also check whether the classifiers remain accurate at detecting retraction in journals they were not trained on.

In [None]:
# Journals held out entirely for external validation.
test_journals = [
    'plos one',
    'molecular medicine reports',
]

In [None]:
# Split by journal: rows from the held-out journals form the external
# validation set, all remaining rows form the train/test set.
in_test_journal = papers['Journal_Name'].isin(test_journals)
journal_test_data_set = papers[in_test_journal]
journal_train_data_set = papers[~in_test_journal]

In [None]:
# Number of articles in the external validation dataset.
journal_test_data_set.shape[0]

367

In [None]:
# Number of articles in the train/test dataset.
journal_train_data_set.shape[0]

1117

In [None]:
# Class sizes per journal in the external validation dataset.
test_group_keys = ['Journal_Name', 'class']
journal_test_data_set.groupby(test_group_keys).size()

Journal_Name                class
molecular medicine reports  e          5
                            m         18
                            nr        92
plos one                    e         34
                            m         61
                            nr       157
dtype: int64

In [None]:
# Non-null counts of every column per class in the external validation dataset.
classes = (
    journal_test_data_set
    .groupby(['class'])
    .count()
    .reset_index()
)
classes

Unnamed: 0.2,class,Unnamed: 0,Unnamed: 0.1,ID,All content,Text,Journal_Name,Retracted,preprocessed
0,e,39,39,39,39,39,39,39,39
1,m,79,79,79,79,79,79,79,79
2,nr,249,249,249,249,249,249,249,249


In [None]:
# Class sizes per journal in the train/test dataset.
train_group_keys = ['Journal_Name', 'class']
journal_train_data_set.groupby(train_group_keys).size()

Journal_Name                                     class
acs applied materials & interfaces               e         2
                                                 m         7
                                                 nr       31
artificial cells nanomedicine and biotechnology  e         2
                                                 m        12
                                                          ..
thin solid films                                 m         2
                                                 nr        9
tumor biology                                    e         2
                                                 m        19
                                                 nr       74
Length: 120, dtype: int64

In [None]:
# Non-null counts of every column per class in the train/test dataset.
classes = (
    journal_train_data_set
    .groupby(['class'])
    .count()
    .reset_index()
)
classes

Unnamed: 0.2,class,Unnamed: 0,Unnamed: 0.1,ID,All content,Text,Journal_Name,Retracted,preprocessed
0,e,119,119,119,119,119,119,119,119
1,m,306,306,306,306,306,306,306,306
2,nr,692,692,692,692,692,692,692,692


In [None]:
# Number of distinct journals in the train/test dataset.
train_journal_names = set(journal_train_data_set['Journal_Name'])
len(train_journal_names)

40

In [None]:
# Distinct journal names in the train/test dataset.
set(journal_train_data_set['Journal_Name'])

{'acs applied materials & interfaces',
 'artificial cells nanomedicine and biotechnology',
 'biochemical pharmacology',
 'biomed research international',
 'blood',
 'brain research',
 'canadian journal of physics',
 'cancer gene therapy',
 'cancer letters',
 'cancer research',
 'cell',
 'cell cycle',
 'cell metabolism',
 'construction and building materials',
 'embo journal',
 'evidence-based complementary and alternative medicine',
 'experimental and therapeutic medicine',
 'experimental cell research',
 'industrial & engineering chemistry research',
 'international immunopharmacology',
 'journal of biological chemistry',
 'journal of bone and mineral research',
 'journal of cell science',
 'journal of cellular biochemistry',
 'journal of cellular physiology',
 'journal of controlled release',
 'journal of neuroscience',
 'journal of the american chemical society',
 'lancet',
 'life sciences',
 'materials science and engineering',
 'mathematical problems in engineering',
 'medicine',


Changing journal names, removing punctuation

In [None]:
# Journal names containing punctuation ('&', '-') and their punctuation-free form.
journal_renames = {
    'acs applied materials & interfaces': 'acs applied materials interfaces',
    'evidence-based complementary and alternative medicine': 'evidencebased complementary and alternative medicine',
    'naunyn-schmiedebergs archives of pharmacology': 'naunynschmiedebergs archives of pharmacology',
    'industrial & engineering chemistry research': 'industrial engineering chemistry research',
}

# Apply the renames to journal_train_data_set, the frame that is written to disk
# in the save step, using literal (non-regex) replacement.
renamed_journals = journal_train_data_set['Journal_Name']
for old_name, new_name in journal_renames.items():
    renamed_journals = renamed_journals.str.replace(old_name, new_name, regex=False)

# assign() returns a new frame, so the slice taken from `papers` is never
# modified in place (no SettingWithCopyWarning).
journal_train_data_set = journal_train_data_set.assign(Journal_Name=renamed_journals)

Save datasets

In [None]:
# Write both splits to CSV. The row index is written too (to_csv default),
# so it appears as an unnamed first column when the files are read back.
for frame, csv_path in (
    (journal_test_data_set, 'Data (CSV)/journal_test_data_set.csv'),
    (journal_train_data_set, 'Data (CSV)/journal_train_data_set.csv'),
):
    frame.to_csv(csv_path)