## Import

In [1]:
!pip3 install nltk
!pip3 install scipy

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [3]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

import random
random.seed(101)


The numbers presented below are the percentage of included paper in each dataset.

In [35]:
n_papers = [3453, 1993, 5746, 2544, 851, 310, 1120, 2072, 1218, 368, 393, 1915, 503, 1333, 1643, 3465, 671, 327, 8911, 1704, 2481, 2019, 6000, 7002, 5019, 10953, 6189]
n_included = [29, 280, 11, 41, 20, 16, 146, 42, 100, 80, 41, 15, 136, 51, 9, 85, 24, 40, 104, 45, 120, 101, 48, 62, 19, 73, 43]
class_ratio = np.round([inc/papers*100 for inc, papers in zip(n_included, n_papers)], decimals=2)
print(class_ratio)

[ 0.84 14.05  0.19  1.61  2.35  5.16 13.04  2.03  8.21 21.74 10.43  0.78
 27.04  3.83  0.55  2.45  3.58 12.23  1.17  2.64  4.84  5.    0.8   0.89
  0.38  0.67  0.69]


We will test out the different ranges of ratios with the different classifiers. We will select 27, 21, 14, 12, 10, 8, 5, 2 and 1. 

In [None]:
datasets = {'14': pd.read_csv('../../dataset/Bannach-Brown_2019.csv'),
            '5': pd.read_csv('../../dataset/Antihistamines.csv'),
            '2': pd.read_csv('../../dataset/BetaBlockers.csv'),
            '8': pd.read_csv('../../dataset/CalciumChannelBlockers.csv'),
            '21': pd.read_csv('../../dataset/Estrogens.csv'),
            '10': pd.read_csv('../../dataset/NSAIDS.csv'),
            '27': pd.read_csv('../../dataset/OralHypoglycemics.csv'),
            '12': pd.read_csv('../../dataset/UrinaryIncontinence.csv'),
            '1': pd.read_csv('../../dataset/Hall_2012.csv')
            }

## Code

### Prepare the dataset

In [4]:
df = pd.read_csv('../../dataset/Appenzeller-Herzog_2020.csv')
print(df.iloc[0])

record_id                                                                   1
title                       Binding Selectivity of Methanobactin from Meth...
abstract                    Methanobactin (Mb) from Methylosinus trichospo...
keywords                                                                  NaN
authors                                McCabe, J. W.;Vangala, R.;Angel, L. A.
year                                                                     2017
date                                                                   Aug 30
doi                              https://dx.doi.org/10.1007/s13361-017-1778-9
label_included                                                              0
label_abstract_screening                                                    0
duplicate_record_id                                                       NaN
Name: 0, dtype: object


Fill the missing abstracts with just an empty string.

In [5]:
df['abstract'] = df['abstract'].fillna('')
df['title'] = df['title'].fillna('')

### Preprocess text

In [6]:
nltk.download('all')
nltk.download('stopwords')
STOP_WORDS = stopwords.words('english')
stemmer = PorterStemmer()

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /home/simonl/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /home/simonl/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /home/simonl/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /home/simonl/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /home/simonl/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package bcp47 to
[nltk_data]    |

In [7]:
def preprocess_text(title, abstract):
    # Combine title and abstract
    text = title + ' ' + abstract
    # Tokenize text
    tokens = nltk.word_tokenize(text)
    filtered_words = [word for word in tokens if word.lower() not in STOP_WORDS]
    stemmed_words = [stemmer.stem(word) for word in filtered_words]

    processed_text = ' '.join(stemmed_words)
    return processed_text
    

In [8]:
df['processed_text'] = df.apply(lambda x: preprocess_text(x['title'], x['abstract']), axis=1)

Convert text data into numerical features. TF-IDF takes into account both the frequency of words in each document and the frequency of words across the entire corpus. It assigns a higher weight to words that are more important for a given document, based on how frequently they appear in that document and how rare they are across the entire corpus.

In [9]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['processed_text'])

In [10]:
X.shape

(3453, 13637)

### Train Classifier

Split the dataset into training and testing.

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, df['label_included'], test_size=0.2, random_state=101)

**Train a Naive Bayes Classifier**

In [14]:
NB_clf = MultinomialNB(class_prior=None, alpha=1.0, fit_prior=True)
NB_clf.fit(X_train, y_train)

**Logistic Regression**

In [15]:
Log_clf = LogisticRegression(random_state=101, class_weight='balanced').fit(X_train, y_train)

**SVM**

In [16]:
SVM_clf = SVC(kernel='linear', C=1.0)
SVM_clf.fit(X_train, y_train)

### Evaluate the classifier

In [18]:
NB_accuracy = NB_clf.score(X_test, y_test)
Log_accuracy = Log_clf.score(X_test, y_test)
SVM_accuracy = SVM_clf.score(X_test, y_test)
print("Test set accuracy for NB: {:.2f}%".format(NB_accuracy * 100))
print("Test set accuracy for Logistic Regression: {:.2f}%".format(Log_accuracy * 100))
print("Test set accuracy for SVM: {:.2f}%".format(SVM_accuracy * 100))

Test set accuracy for NB: 98.84%
Test set accuracy for Logistic Regression: 97.97%
Test set accuracy for SVM: 98.70%


-----------------------------------------------------------------------------------------------------

## Iterative testing

In [19]:
perc_screen = np.round(np.arange(0.1, 1.1, 0.1) * X_train.shape[0]).astype(int)
accuracy = []

In [20]:
X_train.shape

(2762, 13637)

In [21]:
for amount in perc_screen:
    SVM_clf = SVC(kernel='linear', C=1.0)
    SVM_clf.fit(X_train[:amount], y_train[:amount])
    accuracy.append(Log_clf.score(X_test, y_test) * 100)

In [22]:
accuracy

[97.9739507959479,
 97.9739507959479,
 97.9739507959479,
 97.9739507959479,
 97.9739507959479,
 97.9739507959479,
 97.9739507959479,
 97.9739507959479,
 97.9739507959479,
 97.9739507959479]