## Pre-processing using spaCy and regex, splitting data into train/test data and external validation data

Connect to drive, set directory

In [None]:
# Mount Google Drive at /content/drive; this cell needs the Colab runtime (google.colab).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
cd '/content/drive/MyDrive/Thesis_UU/push/3group'

Import Libraries

In [None]:
import pandas as pd
import re

Load csv file

In [None]:
# Read the output of the first preprocessing step; utf-8-sig drops a leading BOM if present.
part_1_csv = 'Data (CSV)/preprocessing_part_1.csv'
papers = pd.read_csv(part_1_csv, encoding="utf-8-sig")

In [None]:
# Quick look at the last five journal names to confirm the file loaded as expected
papers['Journal_Name'].tail(5)

#### Preprocessing

Retracted word removal

In [None]:
# Spelling variants of "retract(ion/ed)" and "withdraw(n)" to delete from the article text.
# Longer alternatives come before their prefixes (e.g. 'Retraction' before 'Retract') so the
# whole word is removed rather than only its stem.
# NOTE(review): presumably removed so the class label cannot leak through the text — confirm.
retraction_pattern = 'RETRACTION|R E T R A C T I O N|Retraction|retraction|Retracted|retracted|RETRACTED|R E T R A C T E D|Retract|retract|WITHDRAWN|W I T H D R A W N|Withdrawn|withdrawn|WITHDRAW|W I T H D R A W|Withdraw|withdraw'
papers['Text'] = papers['Text'].str.replace(retraction_pattern, '', regex=True)

Number removal

In [None]:
# Delete every digit character from the article text
digit_pattern = r'\d'
papers['Text'] = papers['Text'].str.replace(digit_pattern, '', regex=True)

Proper noun, Space, Stop word, Punctuation removal + lowercasing & lemmatization

In [None]:
import spacy

# English pipeline; NER and the dependency parser are switched off in the pipe call below.
nlp = spacy.load("en_core_web_sm")
texts = papers['Text'].tolist()
# Materialise the processed documents so they can be iterated again in the next cell
processed_texts = list(nlp.pipe(texts, disable=["ner", "parser"]))

In [None]:
def _keep_token(token):
    """Return True for tokens that are not stop words, punctuation, proper nouns or whitespace."""
    return not (token.is_stop or token.is_punct or token.pos_ == "PROPN" or token.is_space)


# One list of lower-cased lemmas per document, keeping only the tokens that pass the filter
preprocessed_texts = [
    [token.lemma_.lower() for token in doc if _keep_token(token)]
    for doc in processed_texts
]

In [None]:
# Attach the token lists to the frame as a new 'preprocessed' column
papers = papers.assign(preprocessed=preprocessed_texts)

In [None]:
# First two rows of the new column (still lists of tokens at this point)
papers['preprocessed'].iloc[:2]

In [None]:
# Turn each token list into a single space-separated string so only the preprocessed words remain.
#
# The strings are built directly from `preprocessed_texts` instead of casting the lists through
# str() and deleting the list syntax with regexes: the str() round-trip left stray double quotes
# or backslashes behind for tokens that contain an apostrophe, and the old '\[' / '\]' patterns
# were invalid escape sequences in non-raw string literals. Building from the token lists also
# keeps this cell safe to re-run.
#
# Commas, apostrophes and square brackets inside tokens are still dropped, as before, so the
# resulting text matches what the previous cleanup produced for ordinary tokens.
_LIST_SYNTAX_CHARS = str.maketrans('', '', ",'[]")
papers['preprocessed'] = [
    ' '.join(token.translate(_LIST_SYNTAX_CHARS) for token in tokens)
    for tokens in preprocessed_texts
]

In [None]:
# Same two rows again, now as plain strings
papers['preprocessed'].iloc[:2]

In [None]:
# First two rows of the full frame, all columns
papers.iloc[:2]

In [None]:
# Column labels of the frame, including the 'preprocessed' column added above
papers.columns

#### Check group sizes

In [None]:
# Total number of articles (rows)
papers.shape[0]

In [None]:
# Number of non-retracted articles (class 'nr')
int(papers['class'].eq('nr').sum())

In [None]:
# Number of articles retracted due to error (class 'e')
int(papers['class'].eq('e').sum())

In [None]:
# Number of articles retracted due to misconduct (class 'm')
int(papers['class'].eq('m').sum())

In [None]:
# How many articles belong to each journal, largest first
journal_counts = papers['Journal_Name'].value_counts(ascending=False)
journal_counts

#### Creating datasets for training/testing and external validation

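Note: the external validation set consists of journals that are held out entirely, so we can also check whether the classifiers remain accurate at distinguishing the retraction classes for journals they were not trained on.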

In [None]:
# Journals held out completely for external validation
test_journals = [
    'plos one',
    'molecular medicine reports',
]

In [None]:
# Rows whose journal is one of the held-out journals
in_external_journals = papers['Journal_Name'].isin(test_journals)

journal_test_data_set = papers[in_external_journals]    # external validation dataset
journal_train_data_set = papers[~in_external_journals]  # train/test dataset

In [None]:
# Number of articles in the external validation dataset
journal_test_data_set.shape[0]

In [None]:
# Number of articles in the train/test dataset
journal_train_data_set.shape[0]

In [None]:
# Class sizes per journal in the external validation dataset
external_group_keys = ['Journal_Name', 'class']
journal_test_data_set.groupby(external_group_keys).size()

In [None]:
# Per-class non-null counts of every column in the external validation dataset
classes = (
    journal_test_data_set
    .groupby(['class'])
    .count()
    .reset_index()
)
classes

In [None]:
# Class sizes per journal in the train/test dataset
train_group_keys = ['Journal_Name', 'class']
journal_train_data_set.groupby(train_group_keys).size()

In [None]:
# Per-class non-null counts of every column in the train/test dataset
# (reuses the name `classes`, replacing the external-validation table from above)
classes = (
    journal_train_data_set
    .groupby(['class'])
    .count()
    .reset_index()
)
classes

In [None]:
# Number of distinct journals in the train/test dataset
train_journal_names = set(journal_train_data_set['Journal_Name'])
len(train_journal_names)

In [None]:
# Distinct journal names in the train/test dataset
set(journal_train_data_set['Journal_Name'])

Changing journal names, removing punctuation

In [None]:
# Strip punctuation ('&', '-') from the journal names that contain it.
#
# The renames are applied to `journal_train_data_set`, the frame that is actually saved below.
# The previous code referred to `train_dataset`, which is never defined in this notebook, so
# the cell raised a NameError and the saved file kept the old names.
journal_renames = {
    'acs applied materials & interfaces': 'acs applied materials interfaces',
    'evidence-based complementary and alternative medicine': 'evidencebased complementary and alternative medicine',
    'naunyn-schmiedebergs archives of pharmacology': 'naunynschmiedebergs archives of pharmacology',
    'industrial & engineering chemistry research': 'industrial engineering chemistry research',
}

cleaned_journal_names = journal_train_data_set['Journal_Name']
for old_name, new_name in journal_renames.items():
    # Literal (non-regex) substring replacement, one journal name at a time
    cleaned_journal_names = cleaned_journal_names.str.replace(old_name, new_name, regex=False)

# assign() returns a new frame, so we never write into a slice of `papers`
journal_train_data_set = journal_train_data_set.assign(Journal_Name=cleaned_journal_names)

Save datasets

In [None]:
# Write both splits to CSV with the default to_csv settings
for csv_path, split_frame in (
    ('Data (CSV)/journal_test_data_set.csv', journal_test_data_set),
    ('Data (CSV)/journal_train_data_set.csv', journal_train_data_set),
):
    split_frame.to_csv(csv_path)